#! /usr/bin/env perl
# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
#       53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# November 2015
#
# Add aesni_ocb_[en|de]crypt.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.37       1.37    1.52    1.27
# * Bridge      5.07/0.98       0.99    1.09    0.91    1.10
# Haswell       4.44/0.80       0.97    1.03    0.72    0.76
# Skylake       2.68/0.65       0.65    0.66    0.64    0.66
# Silvermont    5.77/3.56       3.67    4.03    3.46    4.03
# Goldmont      3.84/1.39       1.39    1.63    1.31    1.70
# Bulldozer     5.80/0.98       1.05    1.24    0.93    1.23

$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

# Locate the perlasm support code relative to this script.  $dir is the
# directory part of $0 (with its trailing separator), or empty when the
# script was invoked without any directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Use 3-arg open
# so the name is never interpreted as a mode, and fail loudly instead of
# silently generating into a dead handle.
$output = pop;
if ($output) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys.  Both flavours currently select
# movups; the branch is kept so the two can diverge again if needed.
if ($PREFIX eq "aesni") { $movekey=\&movups; }
else                    { $movekey=\&movups; }

# General-purpose register assignment shared by the generators in this file.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp";    # backup copy for $key

# XMM register assignment: two round-key registers, then data blocks.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";

# AESNI extension
#
# The AES-NI instructions are emitted as raw bytes through data_byte().
# Only register-to-register forms on xmm0..xmm7 are encodable here; the
# ModR/M byte is 0xc0|(dst<<3)|src.  Any other operand pair is a
# generator bug, so die instead of silently dropping the instruction.
# (Call sites that go through string eval discard the exception.)
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }
    else
    {   die "aeskeygenassist: unsupported operands '$dst','$src'";   }
}
# Common encoder for the 66 0F 38 xx family; $opcodelet is the last
# opcode byte selecting the instruction.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);   }
    else
    {   die "aescommon: unsupported operands '$dst','$src'";   }
}
sub aesimc      { aescommon(0xdb,@_); }
sub aesenc      { aescommon(0xdc,@_); }
sub aesenclast  { aescommon(0xdd,@_); }
sub aesdec      { aescommon(0xde,@_); }
sub aesdeclast  { aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# Emits a single-block rounds loop in place.  $p is "enc" or "dec",
# $inout is the data register (defaults to $inout0).  When $ivec is
# given, round key 0 is folded into $ivec first and $inout is xor-ed
# with the result; otherwise $inout is xor-ed with round key 0 directly.
# The emitted code advances $key and counts $rounds down to zero, so
# callers that need them afterwards must keep their own copies.
# $sn is bumped on every expansion to keep the loop label unique.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey   ($rndkey0,&QWP(0,$key));
    &$movekey   ($rndkey1,&QWP(16,$key));
    &xorps      ($ivec,$rndkey0)        if (defined($ivec));
    &lea        ($key,&DWP(32,$key));
    &xorps      ($inout,$ivec)          if (defined($ivec));
    &xorps      ($inout,$rndkey0)       if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
        eval"&aes${p} ($inout,$rndkey1)";
        &dec        ($rounds);
        &$movekey   ($rndkey1,&QWP(0,$key));
        &lea        ($key,&DWP(16,$key));
    &jnz        (&label("${p}1_loop_$sn"));
    eval"&aes${p}last ($inout,$rndkey1)";
}}

# Out-of-line single-block routine "_aesni_${p}rypt1".  It compares
# $rounds with 11 and jumps into the unrolled round sequence at the
# "${p}128" or "${p}192" label, or falls through for the longest case.
# Only generated when $inline is false.
sub aesni_generate1     # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
        &movups     ($rndkey0,&QWP(0,$key));
        &$movekey   ($rndkey1,&QWP(0x10,$key));
        &xorps      ($inout,$rndkey0);
        &$movekey   ($rndkey0,&QWP(0x20,$key));
        &lea        ($key,&DWP(0x30,$key));
        &cmp        ($rounds,11);
        &jb         (&label("${p}128"));
        &lea        ($key,&DWP(0x20,$key));
        &je         (&label("${p}192"));
        &lea        ($key,&DWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(-0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(-0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x10,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x20,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x30,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x40,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x50,$key));
        eval"&aes${p} ($inout,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x60,$key));
        eval"&aes${p} ($inout,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x70,$key));
        eval"&aes${p} ($inout,$rndkey1)";
    eval"&aes${p}last ($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Loads one 16-byte block from inp, takes the round count from offset
# 240 of the key structure, processes the block and stores it to out.
# Round-key and data registers are zeroed before returning.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
        &mov        ("eax",&wparam(0));
        &mov        ($key,&wparam(2));
        &movups     ($inout0,&QWP(0,"eax"));
        &mov        ($rounds,&DWP(240,$key));
        &mov        ("eax",&wparam(1));
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call   ("_aesni_encrypt1");        }
        &pxor       ($rndkey0,$rndkey0);        # clear register bank
        &pxor       ($rndkey1,$rndkey1);
        &movups     (&QWP(0,"eax"),$inout0);
        &pxor       ($inout0,$inout0);
        &ret        ();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Mirror image of $PREFIX_encrypt using the "dec" instruction flavour.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
        &mov        ("eax",&wparam(0));
        &mov        ($key,&wparam(2));
        &movups     ($inout0,&QWP(0,"eax"));
        &mov        ($rounds,&DWP(240,$key));
        &mov        ("eax",&wparam(1));
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call   ("_aesni_decrypt1");        }
        &pxor       ($rndkey0,$rndkey0);        # clear register bank
        &pxor       ($rndkey1,$rndkey1);
        &movups     (&QWP(0,"eax"),$inout0);
        &pxor       ($inout0,$inout0);
        &ret        ();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...

# The 2x/3x/4x generators share one loop scheme.  $rounds is scaled by
# 16 (shl 4), $key is moved to 32 bytes past key+scaled rounds, and the
# negated, biased $rounds then serves as a negative index that climbs
# towards zero by 32 per iteration, i.e. two round keys per trip
# alternating between $rndkey1 and $rndkey0.  The emitted code therefore
# modifies both $key and $rounds.
sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &shl        ($rounds,4);
        &$movekey   ($rndkey1,&QWP(16,$key));
        &xorps      ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);
        &$movekey   ($rndkey0,&QWP(32,$key));
        &lea        ($key,&DWP(32,$key,$rounds));
        &neg        ($rounds);
        &add        ($rounds,16);

    &set_label("${p}2_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
        &add        ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz        (&label("${p}2_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

# Same scheme as the 2x routine, operating on $inout0..$inout2.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &shl        ($rounds,4);
        &$movekey   ($rndkey1,&QWP(16,$key));
        &xorps      ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);
        &pxor       ($inout2,$rndkey0);
        &$movekey   ($rndkey0,&QWP(32,$key));
        &lea        ($key,&DWP(32,$key,$rounds));
        &neg        ($rounds);
        &add        ($rounds,16);

    &set_label("${p}3_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
        &add        ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz        (&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &$movekey   ($rndkey1,&QWP(16,$key));
        &shl        ($rounds,4);
        &xorps      ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);
        &pxor       ($inout2,$rndkey0);
        &pxor       ($inout3,$rndkey0);
        &$movekey   ($rndkey0,&QWP(32,$key));
        &lea        ($key,&DWP(32,$key,$rounds));
        &neg        ($rounds);
        &data_byte  (0x0f,0x1f,0x40,0x00);
        &add        ($rounds,16);

    &set_label("${p}4_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
        eval"&aes${p} ($inout3,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
        &add        ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        eval"&aes${p} ($inout3,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz        (&label("${p}4_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# 6x routine.  The first round on $inout0..$inout2 is interleaved with
# the initial key xor of the remaining blocks, after which control jumps
# to the "_aesni_${p}rypt6_inner" label in the middle of the loop body.
# "_aesni_${p}rypt6_enter" is declared as a static label at the point
# where the next round key is loaded.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &shl        ($rounds,4);
        &$movekey   ($rndkey1,&QWP(16,$key));
        &xorps      ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);     # pxor does better here
        &pxor       ($inout2,$rndkey0);
        eval"&aes${p} ($inout0,$rndkey1)";
        &pxor       ($inout3,$rndkey0);
        &pxor       ($inout4,$rndkey0);
        eval"&aes${p} ($inout1,$rndkey1)";
        &lea        ($key,&DWP(32,$key,$rounds));
        &neg        ($rounds);
        eval"&aes${p} ($inout2,$rndkey1)";
        &pxor       ($inout5,$rndkey0);
        &$movekey   ($rndkey0,&QWP(0,$key,$rounds));
        &add        ($rounds,16);
        &jmp        (&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
        eval"&aes${p} ($inout0,$rndkey1)";
        eval"&aes${p} ($inout1,$rndkey1)";
        eval"&aes${p} ($inout2,$rndkey1)";
    &set_label("_aesni_${p}rypt6_inner");
        eval"&aes${p} ($inout3,$rndkey1)";
        eval"&aes${p} ($inout4,$rndkey1)";
        eval"&aes${p} ($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter");
        &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
        &add        ($rounds,32);
        eval"&aes${p} ($inout0,$rndkey0)";
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        eval"&aes${p} ($inout3,$rndkey0)";
        eval"&aes${p} ($inout4,$rndkey0)";
        eval"&aes${p} ($inout5,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz        (&label("${p}6_loop"));

    eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p} ($inout4,$rndkey1)";
    eval"&aes${p} ($inout5,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    eval"&aes${p}last ($inout4,$rndkey0)";
    eval"&aes${p}last ($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The decrypt flavours are always generated; the encrypt flavours only
# when building under the "aesni" prefix.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; a zero result returns
# immediately.  enc == 0 selects the decrypt path.  Input is consumed
# six blocks (0x60 bytes) at a time, and a 1..5 block remainder is
# dispatched to the matching Nx routine.
&function_begin("aesni_ecb_encrypt");
        &mov        ($inp,&wparam(0));
        &mov        ($out,&wparam(1));
        &mov        ($len,&wparam(2));
        &mov        ($key,&wparam(3));
        &mov        ($rounds_,&wparam(4));      # enc flag, until $rounds is backed up
        &and        ($len,-16);
        &jz         (&label("ecb_ret"));
        &mov        ($rounds,&DWP(240,$key));
        &test       ($rounds_,$rounds_);
        &jz         (&label("ecb_decrypt"));

        &mov        ($key_,$key);               # backup $key
        &mov        ($rounds_,$rounds);         # backup $rounds
        &cmp        ($len,0x60);
        &jb         (&label("ecb_enc_tail"));

        &movdqu     ($inout0,&QWP(0,$inp));
        &movdqu     ($inout1,&QWP(0x10,$inp));
        &movdqu     ($inout2,&QWP(0x20,$inp));
        &movdqu     ($inout3,&QWP(0x30,$inp));
        &movdqu     ($inout4,&QWP(0x40,$inp));
        &movdqu     ($inout5,&QWP(0x50,$inp));
        &lea        ($inp,&DWP(0x60,$inp));
        &sub        ($len,0x60);
        &jmp        (&label("ecb_enc_loop6_enter"));

# Store the six finished blocks while loading the next six.
&set_label("ecb_enc_loop6",16);
        &movups     (&QWP(0,$out),$inout0);
        &movdqu     ($inout0,&QWP(0,$inp));
        &movups     (&QWP(0x10,$out),$inout1);
        &movdqu     ($inout1,&QWP(0x10,$inp));
        &movups     (&QWP(0x20,$out),$inout2);
        &movdqu     ($inout2,&QWP(0x20,$inp));
        &movups     (&QWP(0x30,$out),$inout3);
        &movdqu     ($inout3,&QWP(0x30,$inp));
        &movups     (&QWP(0x40,$out),$inout4);
        &movdqu     ($inout4,&QWP(0x40,$inp));
        &movups     (&QWP(0x50,$out),$inout5);
        &lea        ($out,&DWP(0x60,$out));
        &movdqu     ($inout5,&QWP(0x50,$inp));
        &lea        ($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

        &call       ("_aesni_encrypt6");

        &mov        ($key,$key_);               # restore $key
        &mov        ($rounds,$rounds_);         # restore $rounds
        &sub        ($len,0x60);
        &jnc        (&label("ecb_enc_loop6"));  # no borrow: six more blocks were loaded

        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        &movups     (&QWP(0x40,$out),$inout4);
        &movups     (&QWP(0x50,$out),$inout5);
        &lea        ($out,&DWP(0x60,$out));
        &add        ($len,0x60);                # undo the last sub: remaining byte count
        &jz         (&label("ecb_ret"));

# 0x10..0x50 bytes left: load blocks progressively and branch on count.
&set_label("ecb_enc_tail");
        &movups     ($inout0,&QWP(0,$inp));
        &cmp        ($len,0x20);
        &jb         (&label("ecb_enc_one"));
        &movups     ($inout1,&QWP(0x10,$inp));
        &je         (&label("ecb_enc_two"));
        &movups     ($inout2,&QWP(0x20,$inp));
        &cmp        ($len,0x40);
        &jb         (&label("ecb_enc_three"));
        &movups     ($inout3,&QWP(0x30,$inp));
        &je         (&label("ecb_enc_four"));
        &movups     ($inout4,&QWP(0x40,$inp));
        &xorps      ($inout5,$inout5);          # five blocks: sixth lane is zeroed and its result not stored
        &call       ("_aesni_encrypt6");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        &movups     (&QWP(0x40,$out),$inout4);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call   ("_aesni_encrypt1");        }
        &movups     (&QWP(0,$out),$inout0);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
        &call       ("_aesni_encrypt2");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
        &call       ("_aesni_encrypt3");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_enc_four",16);
        &call       ("_aesni_encrypt4");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        &jmp        (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
        &mov        ($key_,$key);               # backup $key
        &mov        ($rounds_,$rounds);         # backup $rounds
        &cmp        ($len,0x60);
        &jb         (&label("ecb_dec_tail"));

        &movdqu     ($inout0,&QWP(0,$inp));
        &movdqu     ($inout1,&QWP(0x10,$inp));
        &movdqu     ($inout2,&QWP(0x20,$inp));
        &movdqu     ($inout3,&QWP(0x30,$inp));
        &movdqu     ($inout4,&QWP(0x40,$inp));
        &movdqu     ($inout5,&QWP(0x50,$inp));
        &lea        ($inp,&DWP(0x60,$inp));
        &sub        ($len,0x60);
        &jmp        (&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
        &movups     (&QWP(0,$out),$inout0);
        &movdqu     ($inout0,&QWP(0,$inp));
        &movups     (&QWP(0x10,$out),$inout1);
        &movdqu     ($inout1,&QWP(0x10,$inp));
        &movups     (&QWP(0x20,$out),$inout2);
        &movdqu     ($inout2,&QWP(0x20,$inp));
        &movups     (&QWP(0x30,$out),$inout3);
        &movdqu     ($inout3,&QWP(0x30,$inp));
        &movups     (&QWP(0x40,$out),$inout4);
        &movdqu     ($inout4,&QWP(0x40,$inp));
        &movups     (&QWP(0x50,$out),$inout5);
        &lea        ($out,&DWP(0x60,$out));
        &movdqu     ($inout5,&QWP(0x50,$inp));
        &lea        ($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

        &call       ("_aesni_decrypt6");

        &mov        ($key,$key_);               # restore $key
        &mov        ($rounds,$rounds_);         # restore $rounds
        &sub        ($len,0x60);
        &jnc        (&label("ecb_dec_loop6"));

        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        &movups     (&QWP(0x40,$out),$inout4);
        &movups     (&QWP(0x50,$out),$inout5);
        &lea        ($out,&DWP(0x60,$out));
        &add        ($len,0x60);
        &jz         (&label("ecb_ret"));

&set_label("ecb_dec_tail");
        &movups     ($inout0,&QWP(0,$inp));
        &cmp        ($len,0x20);
        &jb         (&label("ecb_dec_one"));
        &movups     ($inout1,&QWP(0x10,$inp));
        &je         (&label("ecb_dec_two"));
        &movups     ($inout2,&QWP(0x20,$inp));
        &cmp        ($len,0x40);
        &jb         (&label("ecb_dec_three"));
        &movups     ($inout3,&QWP(0x30,$inp));
        &je         (&label("ecb_dec_four"));
        &movups     ($inout4,&QWP(0x40,$inp));
        &xorps      ($inout5,$inout5);
        &call       ("_aesni_decrypt6");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        &movups     (&QWP(0x40,$out),$inout4);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call   ("_aesni_decrypt1");        }
        &movups     (&QWP(0,$out),$inout0);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
        &call       ("_aesni_decrypt2");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
        &call       ("_aesni_decrypt3");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &jmp        (&label("ecb_ret"));

&set_label("ecb_dec_four",16);
        &call       ("_aesni_decrypt4");
        &movups     (&QWP(0,$out),$inout0);
        &movups     (&QWP(0x10,$out),$inout1);
        &movups     (&QWP(0x20,$out),$inout2);
        &movups     (&QWP(0x30,$out),$inout3);
        # falls through to ecb_ret

&set_label("ecb_ret");
        &pxor       ("xmm0","xmm0");            # clear register bank
        &pxor       ("xmm1","xmm1");
        &pxor       ("xmm2","xmm2");
        &pxor       ("xmm3","xmm3");
        &pxor       ("xmm4","xmm4");
        &pxor       ("xmm5","xmm5");
        &pxor       ("xmm6","xmm6");
        &pxor       ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec!
# Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both routines below read their six arguments through wparam(0..5):
# in, out, blocks, key, ivec, cmac.  The ivec is loaded once and never
# written back; the running CMAC is loaded from wparam(5) on entry and
# stored back to wparam(5) on exit.
#
# scratch frame (16-byte aligned):
#	0	pshufb mask that reverses all 16 bytes
#	16	counter increment vector (low dword 1, others 0)
#	48	saved %esp
#
# Key schedule addressing used by the inner loops: $key_ keeps the
# original key pointer, $key is advanced to key+32+16*rounds and
# $rounds_ holds 16-16*rounds, so the loop index starts at $rounds_
# and is bumped by 32 until it reaches zero.
#
{ my $cmac=$inout1;			# CMAC accumulator lives in $inout1

# Encrypt: for every block, E(counter) and E(cmac^input) are computed
# side by side with the same round keys; output is input^E(counter).
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer (temporarily)
	&mov	($rounds,&wparam(5));		# cmac pointer (temporarily)
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));	# rounds field of the key

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));		# keep original key pointer
	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);		# first counter block, as given
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);		# loop index start value
	&pshufb	($ivec,$inout3);		# byte-reversed copy for paddq

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&movups	($in0,&QWP(0,$inp));

	&xorps	($inout0,$rndkey0);		# counter^=key[0]
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($rndkey0,$in0);
	&xorps	($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add	($rounds,32);			# sets ZF for the jnz below
	&aesenc	($inout0,$rndkey0);
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz	(&label("ccm64_enc2_loop"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&paddq	($ivec,&QWP(16,"esp"));		# bump byte-reversed counter
	&dec	($len);				# ZF consumed by jnz at loop end
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);		# next counter, swapped back
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));		# restore original %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# hand back updated cmac

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");

# Decrypt: the first counter block is encrypted up front; after that
# each pass writes out input^E(counter), folds that output into the
# CMAC and encrypts the next counter together with the CMAC.  The last
# CMAC block is encrypted separately at ccm64_dec_break.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer (temporarily)
	&mov	($rounds,&wparam(5));		# cmac pointer (temporarily)
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));	# rounds field of the key

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&pshufb	($ivec,$inout3);		# byte-reversed copy for paddq
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);		# loop index start value
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);		# next counter, swapped back

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($in0,$rndkey0);
	&xorps	($inout0,$rndkey0);		# counter^=key[0]
	&xorps	($cmac,$in0);			# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add	($rounds,32);			# sets ZF for the jnz below
	&aesenc	($inout0,$rndkey0);
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz	(&label("ccm64_dec2_loop"));
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea	($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));	# reload rounds field
	&mov	($key,$key_);			# restore $key
	# Encrypt the final CMAC block on its own; the inline generator is
	# handed $cmac and $in0 (the last output block) explicitly.
	# NOTE(review): the non-inline fallback only forwards $cmac to &call;
	# whether _aesni_encrypt1 then works on $cmac and folds in $in0 the
	# way the inline path is asked to cannot be seen from here - verify.
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));		# restore original %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# hand back updated cmac

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec!
# (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp
#
# Arguments are read through wparam(0..4): in, out, blocks, key, ivec.
# A single block takes the ctr32_one_shortcut path and encrypts the
# ivec as given.  Otherwise six counter values are kept as two triplets
# (counter+0..2 and counter+3..5) in host byte order on the stack, and
# are byte-swapped into $rndkey0/$rndkey1 before being spread into the
# top dword of $inout0..5.  Six or more blocks go through ctr32_loop6;
# a remainder of 1..5 blocks is dispatched from ctr32_tail.

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer (temporarily)
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# save original %esp

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor	($inout0,$rndkey0);		# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor	($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout2,$rndkey0);
	&pxor	($inout3,$rndkey0);
	&aesenc	($inout0,$rndkey1);
	&pxor	($inout4,$rndkey0);
	&pxor	($inout5,$rndkey0);
	&aesenc	($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov	($rounds,$rounds_);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	&call	(&label("_aesni_encrypt6_enter"));

	# xor key stream with input and store, while preparing the
	# counters for the next pass in between
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);			# undo the last subtraction
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));	# key[0]
	&mov	($key,$key_);			# restore $key
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	# 1..5 blocks left: $inout5 is the counter-less ivec, which is
	# merged into as many counter vectors as are needed.  The je
	# branches reuse the flags of the preceding cmp.
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	# five blocks: run the 6x routine, only five results are stored
	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));		# restore original %esp
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

&function_begin("aesni_xts_encrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&and	("esp",-16);			# align stack

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
11491cb0ef41Sopenharmony_ci &mov (&DWP(16*6+12,"esp"),0); 11501cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save original $len 11511cb0ef41Sopenharmony_ci &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 11521cb0ef41Sopenharmony_ci 11531cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout0); 11541cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 11551cb0ef41Sopenharmony_ci &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 11561cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 11571cb0ef41Sopenharmony_ci 11581cb0ef41Sopenharmony_ci &and ($len,-16); 11591cb0ef41Sopenharmony_ci &mov ($key_,$key); # backup $key 11601cb0ef41Sopenharmony_ci &mov ($rounds_,$rounds); # backup $rounds 11611cb0ef41Sopenharmony_ci &sub ($len,16*6); 11621cb0ef41Sopenharmony_ci &jc (&label("xts_enc_short")); 11631cb0ef41Sopenharmony_ci 11641cb0ef41Sopenharmony_ci &shl ($rounds,4); 11651cb0ef41Sopenharmony_ci &mov ($rounds_,16); 11661cb0ef41Sopenharmony_ci &sub ($rounds_,$rounds); 11671cb0ef41Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); 11681cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_loop6")); 11691cb0ef41Sopenharmony_ci 11701cb0ef41Sopenharmony_ci&set_label("xts_enc_loop6",16); 11711cb0ef41Sopenharmony_ci for ($i=0;$i<4;$i++) { 11721cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 11731cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 11741cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$tweak); 11751cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 11761cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 11771cb0ef41Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 11781cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 11791cb0ef41Sopenharmony_ci } 11801cb0ef41Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 11811cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i++,"esp"),$tweak); 11821cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 11831cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(0,$key_)); 
11841cb0ef41Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 11851cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 11861cb0ef41Sopenharmony_ci &pxor ($inout5,$tweak); 11871cb0ef41Sopenharmony_ci 11881cb0ef41Sopenharmony_ci # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 11891cb0ef41Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 11901cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 11911cb0ef41Sopenharmony_ci &xorps ($inout0,$rndkey0); # input^=rndkey[0] 11921cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 11931cb0ef41Sopenharmony_ci &pxor ($inout1,$rndkey0); 11941cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 11951cb0ef41Sopenharmony_ci &pxor ($inout2,$rndkey0); 11961cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 11971cb0ef41Sopenharmony_ci &pxor ($inout3,$rndkey0); 11981cb0ef41Sopenharmony_ci &movdqu ($rndkey1,&QWP(16*5,$inp)); 11991cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); 12001cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 12011cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 12021cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 12031cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey1); 12041cb0ef41Sopenharmony_ci 12051cb0ef41Sopenharmony_ci &$movekey ($rndkey1,&QWP(16,$key_)); 12061cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 12071cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 12081cb0ef41Sopenharmony_ci &aesenc ($inout0,$rndkey1); 12091cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 12101cb0ef41Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 12111cb0ef41Sopenharmony_ci &aesenc ($inout1,$rndkey1); 12121cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); 12131cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(32,$key_)); 12141cb0ef41Sopenharmony_ci &aesenc ($inout2,$rndkey1); 12151cb0ef41Sopenharmony_ci &aesenc ($inout3,$rndkey1); 12161cb0ef41Sopenharmony_ci &aesenc 
($inout4,$rndkey1); 12171cb0ef41Sopenharmony_ci &aesenc ($inout5,$rndkey1); 12181cb0ef41Sopenharmony_ci &call (&label("_aesni_encrypt6_enter")); 12191cb0ef41Sopenharmony_ci 12201cb0ef41Sopenharmony_ci &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 12211cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 12221cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 12231cb0ef41Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 12241cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 12251cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 12261cb0ef41Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 12271cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 12281cb0ef41Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 12291cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 12301cb0ef41Sopenharmony_ci &xorps ($inout4,&QWP(16*4,"esp")); 12311cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 12321cb0ef41Sopenharmony_ci &xorps ($inout5,$tweak); 12331cb0ef41Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 12341cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 12351cb0ef41Sopenharmony_ci &movups (&QWP(16*5,$out),$inout5); 12361cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*6,$out)); 12371cb0ef41Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 12381cb0ef41Sopenharmony_ci 12391cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 12401cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 12411cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 12421cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 12431cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 12441cb0ef41Sopenharmony_ci 12451cb0ef41Sopenharmony_ci &sub ($len,16*6); 12461cb0ef41Sopenharmony_ci &jnc (&label("xts_enc_loop6")); 12471cb0ef41Sopenharmony_ci 12481cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key_)); # restore $rounds 12491cb0ef41Sopenharmony_ci &mov ($key,$key_); # 
restore $key 12501cb0ef41Sopenharmony_ci &mov ($rounds_,$rounds); 12511cb0ef41Sopenharmony_ci 12521cb0ef41Sopenharmony_ci&set_label("xts_enc_short"); 12531cb0ef41Sopenharmony_ci &add ($len,16*6); 12541cb0ef41Sopenharmony_ci &jz (&label("xts_enc_done6x")); 12551cb0ef41Sopenharmony_ci 12561cb0ef41Sopenharmony_ci &movdqa ($inout3,$tweak); # put aside previous tweak 12571cb0ef41Sopenharmony_ci &cmp ($len,0x20); 12581cb0ef41Sopenharmony_ci &jb (&label("xts_enc_one")); 12591cb0ef41Sopenharmony_ci 12601cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 12611cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 12621cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 12631cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 12641cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 12651cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 12661cb0ef41Sopenharmony_ci &je (&label("xts_enc_two")); 12671cb0ef41Sopenharmony_ci 12681cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 12691cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 12701cb0ef41Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 12711cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 12721cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 12731cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 12741cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 12751cb0ef41Sopenharmony_ci &cmp ($len,0x40); 12761cb0ef41Sopenharmony_ci &jb (&label("xts_enc_three")); 12771cb0ef41Sopenharmony_ci 12781cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 12791cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 12801cb0ef41Sopenharmony_ci &movdqa ($inout5,$tweak); # put aside previous tweak 12811cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 12821cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 12831cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 
12841cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 12851cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout3); 12861cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout4); 12871cb0ef41Sopenharmony_ci &je (&label("xts_enc_four")); 12881cb0ef41Sopenharmony_ci 12891cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout5); 12901cb0ef41Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 12911cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$tweak); 12921cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($inout0,1); 12931cb0ef41Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 12941cb0ef41Sopenharmony_ci &pxor ($inout5,$tweak); 12951cb0ef41Sopenharmony_ci 12961cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 12971cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 12981cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 12991cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 13001cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 13011cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 13021cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 13031cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 13041cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*5,$inp)); 13051cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 13061cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 13071cb0ef41Sopenharmony_ci &pxor ($inout4,$inout5); 13081cb0ef41Sopenharmony_ci 13091cb0ef41Sopenharmony_ci &call ("_aesni_encrypt6"); 13101cb0ef41Sopenharmony_ci 13111cb0ef41Sopenharmony_ci &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 13121cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 13131cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 13141cb0ef41Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 13151cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 13161cb0ef41Sopenharmony_ci &xorps 
($inout3,&QWP(16*3,"esp")); 13171cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 13181cb0ef41Sopenharmony_ci &xorps ($inout4,$tweak); 13191cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 13201cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 13211cb0ef41Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 13221cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*5,$out)); 13231cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_done")); 13241cb0ef41Sopenharmony_ci 13251cb0ef41Sopenharmony_ci&set_label("xts_enc_one",16); 13261cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 13271cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*1,$inp)); 13281cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 13291cb0ef41Sopenharmony_ci if ($inline) 13301cb0ef41Sopenharmony_ci { &aesni_inline_generate1("enc"); } 13311cb0ef41Sopenharmony_ci else 13321cb0ef41Sopenharmony_ci { &call ("_aesni_encrypt1"); } 13331cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 13341cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 13351cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*1,$out)); 13361cb0ef41Sopenharmony_ci 13371cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout3); # last tweak 13381cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_done")); 13391cb0ef41Sopenharmony_ci 13401cb0ef41Sopenharmony_ci&set_label("xts_enc_two",16); 13411cb0ef41Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 13421cb0ef41Sopenharmony_ci 13431cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 13441cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 13451cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*2,$inp)); 13461cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 13471cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 13481cb0ef41Sopenharmony_ci 13491cb0ef41Sopenharmony_ci &call ("_aesni_encrypt2"); 13501cb0ef41Sopenharmony_ci 13511cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 
13521cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 13531cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 13541cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 13551cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*2,$out)); 13561cb0ef41Sopenharmony_ci 13571cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 13581cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_done")); 13591cb0ef41Sopenharmony_ci 13601cb0ef41Sopenharmony_ci&set_label("xts_enc_three",16); 13611cb0ef41Sopenharmony_ci &movaps ($inout5,$tweak); # put aside last tweak 13621cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 13631cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 13641cb0ef41Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 13651cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*3,$inp)); 13661cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 13671cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 13681cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 13691cb0ef41Sopenharmony_ci 13701cb0ef41Sopenharmony_ci &call ("_aesni_encrypt3"); 13711cb0ef41Sopenharmony_ci 13721cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 13731cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 13741cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 13751cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 13761cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 13771cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 13781cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*3,$out)); 13791cb0ef41Sopenharmony_ci 13801cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout5); # last tweak 13811cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_done")); 13821cb0ef41Sopenharmony_ci 13831cb0ef41Sopenharmony_ci&set_label("xts_enc_four",16); 13841cb0ef41Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 13851cb0ef41Sopenharmony_ci 13861cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 
13871cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 13881cb0ef41Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 13891cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 13901cb0ef41Sopenharmony_ci &movups ($inout3,&QWP(16*3,$inp)); 13911cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*4,$inp)); 13921cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 13931cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 13941cb0ef41Sopenharmony_ci &xorps ($inout3,$inout4); 13951cb0ef41Sopenharmony_ci 13961cb0ef41Sopenharmony_ci &call ("_aesni_encrypt4"); 13971cb0ef41Sopenharmony_ci 13981cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 13991cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 14001cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 14011cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 14021cb0ef41Sopenharmony_ci &xorps ($inout3,$inout4); 14031cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 14041cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 14051cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 14061cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*4,$out)); 14071cb0ef41Sopenharmony_ci 14081cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 14091cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_done")); 14101cb0ef41Sopenharmony_ci 14111cb0ef41Sopenharmony_ci&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 14121cb0ef41Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 14131cb0ef41Sopenharmony_ci &and ($len,15); 14141cb0ef41Sopenharmony_ci &jz (&label("xts_enc_ret")); 14151cb0ef41Sopenharmony_ci &movdqa ($inout3,$tweak); 14161cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 14171cb0ef41Sopenharmony_ci &jmp (&label("xts_enc_steal")); 14181cb0ef41Sopenharmony_ci 14191cb0ef41Sopenharmony_ci&set_label("xts_enc_done",16); 14201cb0ef41Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 
14211cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 14221cb0ef41Sopenharmony_ci &and ($len,15); 14231cb0ef41Sopenharmony_ci &jz (&label("xts_enc_ret")); 14241cb0ef41Sopenharmony_ci 14251cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 14261cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 14271cb0ef41Sopenharmony_ci &pshufd ($inout3,$twtmp,0x13); 14281cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 14291cb0ef41Sopenharmony_ci &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 14301cb0ef41Sopenharmony_ci &pxor ($inout3,$tweak); 14311cb0ef41Sopenharmony_ci 14321cb0ef41Sopenharmony_ci&set_label("xts_enc_steal"); 14331cb0ef41Sopenharmony_ci &movz ($rounds,&BP(0,$inp)); 14341cb0ef41Sopenharmony_ci &movz ($key,&BP(-16,$out)); 14351cb0ef41Sopenharmony_ci &lea ($inp,&DWP(1,$inp)); 14361cb0ef41Sopenharmony_ci &mov (&BP(-16,$out),&LB($rounds)); 14371cb0ef41Sopenharmony_ci &mov (&BP(0,$out),&LB($key)); 14381cb0ef41Sopenharmony_ci &lea ($out,&DWP(1,$out)); 14391cb0ef41Sopenharmony_ci &sub ($len,1); 14401cb0ef41Sopenharmony_ci &jnz (&label("xts_enc_steal")); 14411cb0ef41Sopenharmony_ci 14421cb0ef41Sopenharmony_ci &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 14431cb0ef41Sopenharmony_ci &mov ($key,$key_); # restore $key 14441cb0ef41Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 14451cb0ef41Sopenharmony_ci 14461cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(-16,$out)); # load input 14471cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 14481cb0ef41Sopenharmony_ci if ($inline) 14491cb0ef41Sopenharmony_ci { &aesni_inline_generate1("enc"); } 14501cb0ef41Sopenharmony_ci else 14511cb0ef41Sopenharmony_ci { &call ("_aesni_encrypt1"); } 14521cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 14531cb0ef41Sopenharmony_ci &movups (&QWP(-16,$out),$inout0); # write output 14541cb0ef41Sopenharmony_ci 14551cb0ef41Sopenharmony_ci&set_label("xts_enc_ret"); 14561cb0ef41Sopenharmony_ci &pxor 
("xmm0","xmm0"); # clear register bank 14571cb0ef41Sopenharmony_ci &pxor ("xmm1","xmm1"); 14581cb0ef41Sopenharmony_ci &pxor ("xmm2","xmm2"); 14591cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 14601cb0ef41Sopenharmony_ci &pxor ("xmm3","xmm3"); 14611cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),"xmm0"); 14621cb0ef41Sopenharmony_ci &pxor ("xmm4","xmm4"); 14631cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),"xmm0"); 14641cb0ef41Sopenharmony_ci &pxor ("xmm5","xmm5"); 14651cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),"xmm0"); 14661cb0ef41Sopenharmony_ci &pxor ("xmm6","xmm6"); 14671cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),"xmm0"); 14681cb0ef41Sopenharmony_ci &pxor ("xmm7","xmm7"); 14691cb0ef41Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),"xmm0"); 14701cb0ef41Sopenharmony_ci &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 14711cb0ef41Sopenharmony_ci&function_end("aesni_xts_encrypt"); 14721cb0ef41Sopenharmony_ci 14731cb0ef41Sopenharmony_ci&function_begin("aesni_xts_decrypt"); 14741cb0ef41Sopenharmony_ci &mov ($key,&wparam(4)); # key2 14751cb0ef41Sopenharmony_ci &mov ($inp,&wparam(5)); # clear-text tweak 14761cb0ef41Sopenharmony_ci 14771cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); # key2->rounds 14781cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); 14791cb0ef41Sopenharmony_ci if ($inline) 14801cb0ef41Sopenharmony_ci { &aesni_inline_generate1("enc"); } 14811cb0ef41Sopenharmony_ci else 14821cb0ef41Sopenharmony_ci { &call ("_aesni_encrypt1"); } 14831cb0ef41Sopenharmony_ci 14841cb0ef41Sopenharmony_ci &mov ($inp,&wparam(0)); 14851cb0ef41Sopenharmony_ci &mov ($out,&wparam(1)); 14861cb0ef41Sopenharmony_ci &mov ($len,&wparam(2)); 14871cb0ef41Sopenharmony_ci &mov ($key,&wparam(3)); # key1 14881cb0ef41Sopenharmony_ci 14891cb0ef41Sopenharmony_ci &mov ($key_,"esp"); 14901cb0ef41Sopenharmony_ci &sub ("esp",16*7+8); 14911cb0ef41Sopenharmony_ci &and ("esp",-16); # align stack 14921cb0ef41Sopenharmony_ci 14931cb0ef41Sopenharmony_ci &xor 
($rounds_,$rounds_); # if(len%16) len-=16; 14941cb0ef41Sopenharmony_ci &test ($len,15); 14951cb0ef41Sopenharmony_ci &setnz (&LB($rounds_)); 14961cb0ef41Sopenharmony_ci &shl ($rounds_,4); 14971cb0ef41Sopenharmony_ci &sub ($len,$rounds_); 14981cb0ef41Sopenharmony_ci 14991cb0ef41Sopenharmony_ci &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 15001cb0ef41Sopenharmony_ci &mov (&DWP(16*6+4,"esp"),0); 15011cb0ef41Sopenharmony_ci &mov (&DWP(16*6+8,"esp"),1); 15021cb0ef41Sopenharmony_ci &mov (&DWP(16*6+12,"esp"),0); 15031cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save original $len 15041cb0ef41Sopenharmony_ci &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 15051cb0ef41Sopenharmony_ci 15061cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); # key1->rounds 15071cb0ef41Sopenharmony_ci &mov ($key_,$key); # backup $key 15081cb0ef41Sopenharmony_ci &mov ($rounds_,$rounds); # backup $rounds 15091cb0ef41Sopenharmony_ci 15101cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout0); 15111cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 15121cb0ef41Sopenharmony_ci &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 15131cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 15141cb0ef41Sopenharmony_ci 15151cb0ef41Sopenharmony_ci &and ($len,-16); 15161cb0ef41Sopenharmony_ci &sub ($len,16*6); 15171cb0ef41Sopenharmony_ci &jc (&label("xts_dec_short")); 15181cb0ef41Sopenharmony_ci 15191cb0ef41Sopenharmony_ci &shl ($rounds,4); 15201cb0ef41Sopenharmony_ci &mov ($rounds_,16); 15211cb0ef41Sopenharmony_ci &sub ($rounds_,$rounds); 15221cb0ef41Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); 15231cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_loop6")); 15241cb0ef41Sopenharmony_ci 15251cb0ef41Sopenharmony_ci&set_label("xts_dec_loop6",16); 15261cb0ef41Sopenharmony_ci for ($i=0;$i<4;$i++) { 15271cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 15281cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 15291cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$tweak); 
15301cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 15311cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 15321cb0ef41Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 15331cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 15341cb0ef41Sopenharmony_ci } 15351cb0ef41Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 15361cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i++,"esp"),$tweak); 15371cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 15381cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(0,$key_)); 15391cb0ef41Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 15401cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 15411cb0ef41Sopenharmony_ci &pxor ($inout5,$tweak); 15421cb0ef41Sopenharmony_ci 15431cb0ef41Sopenharmony_ci # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 15441cb0ef41Sopenharmony_ci &mov ($rounds,$rounds_); 15451cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 15461cb0ef41Sopenharmony_ci &xorps ($inout0,$rndkey0); # input^=rndkey[0] 15471cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 15481cb0ef41Sopenharmony_ci &pxor ($inout1,$rndkey0); 15491cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 15501cb0ef41Sopenharmony_ci &pxor ($inout2,$rndkey0); 15511cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 15521cb0ef41Sopenharmony_ci &pxor ($inout3,$rndkey0); 15531cb0ef41Sopenharmony_ci &movdqu ($rndkey1,&QWP(16*5,$inp)); 15541cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); 15551cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 15561cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 15571cb0ef41Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 15581cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey1); 15591cb0ef41Sopenharmony_ci 15601cb0ef41Sopenharmony_ci &$movekey ($rndkey1,&QWP(16,$key_)); 15611cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 
15621cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 15631cb0ef41Sopenharmony_ci &aesdec ($inout0,$rndkey1); 15641cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 15651cb0ef41Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 15661cb0ef41Sopenharmony_ci &aesdec ($inout1,$rndkey1); 15671cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); 15681cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(32,$key_)); 15691cb0ef41Sopenharmony_ci &aesdec ($inout2,$rndkey1); 15701cb0ef41Sopenharmony_ci &aesdec ($inout3,$rndkey1); 15711cb0ef41Sopenharmony_ci &aesdec ($inout4,$rndkey1); 15721cb0ef41Sopenharmony_ci &aesdec ($inout5,$rndkey1); 15731cb0ef41Sopenharmony_ci &call (&label("_aesni_decrypt6_enter")); 15741cb0ef41Sopenharmony_ci 15751cb0ef41Sopenharmony_ci &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 15761cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 15771cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 15781cb0ef41Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 15791cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 15801cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 15811cb0ef41Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 15821cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 15831cb0ef41Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 15841cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 15851cb0ef41Sopenharmony_ci &xorps ($inout4,&QWP(16*4,"esp")); 15861cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 15871cb0ef41Sopenharmony_ci &xorps ($inout5,$tweak); 15881cb0ef41Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 15891cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 15901cb0ef41Sopenharmony_ci &movups (&QWP(16*5,$out),$inout5); 15911cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*6,$out)); 15921cb0ef41Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 15931cb0ef41Sopenharmony_ci 15941cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 
15951cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 15961cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 15971cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 15981cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 15991cb0ef41Sopenharmony_ci 16001cb0ef41Sopenharmony_ci &sub ($len,16*6); 16011cb0ef41Sopenharmony_ci &jnc (&label("xts_dec_loop6")); 16021cb0ef41Sopenharmony_ci 16031cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key_)); # restore $rounds 16041cb0ef41Sopenharmony_ci &mov ($key,$key_); # restore $key 16051cb0ef41Sopenharmony_ci &mov ($rounds_,$rounds); 16061cb0ef41Sopenharmony_ci 16071cb0ef41Sopenharmony_ci&set_label("xts_dec_short"); 16081cb0ef41Sopenharmony_ci &add ($len,16*6); 16091cb0ef41Sopenharmony_ci &jz (&label("xts_dec_done6x")); 16101cb0ef41Sopenharmony_ci 16111cb0ef41Sopenharmony_ci &movdqa ($inout3,$tweak); # put aside previous tweak 16121cb0ef41Sopenharmony_ci &cmp ($len,0x20); 16131cb0ef41Sopenharmony_ci &jb (&label("xts_dec_one")); 16141cb0ef41Sopenharmony_ci 16151cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 16161cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 16171cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 16181cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 16191cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 16201cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 16211cb0ef41Sopenharmony_ci &je (&label("xts_dec_two")); 16221cb0ef41Sopenharmony_ci 16231cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 16241cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 16251cb0ef41Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 16261cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 16271cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 16281cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 16291cb0ef41Sopenharmony_ci &pxor 
($tweak,$twres); 16301cb0ef41Sopenharmony_ci &cmp ($len,0x40); 16311cb0ef41Sopenharmony_ci &jb (&label("xts_dec_three")); 16321cb0ef41Sopenharmony_ci 16331cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 16341cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 16351cb0ef41Sopenharmony_ci &movdqa ($inout5,$tweak); # put aside previous tweak 16361cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 16371cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 16381cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 16391cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 16401cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout3); 16411cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout4); 16421cb0ef41Sopenharmony_ci &je (&label("xts_dec_four")); 16431cb0ef41Sopenharmony_ci 16441cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout5); 16451cb0ef41Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 16461cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$tweak); 16471cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($inout0,1); 16481cb0ef41Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 16491cb0ef41Sopenharmony_ci &pxor ($inout5,$tweak); 16501cb0ef41Sopenharmony_ci 16511cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 16521cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 16531cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 16541cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 16551cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 16561cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 16571cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 16581cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 16591cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*5,$inp)); 16601cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 16611cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 
16621cb0ef41Sopenharmony_ci &pxor ($inout4,$inout5); 16631cb0ef41Sopenharmony_ci 16641cb0ef41Sopenharmony_ci &call ("_aesni_decrypt6"); 16651cb0ef41Sopenharmony_ci 16661cb0ef41Sopenharmony_ci &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 16671cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 16681cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 16691cb0ef41Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 16701cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 16711cb0ef41Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 16721cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 16731cb0ef41Sopenharmony_ci &xorps ($inout4,$tweak); 16741cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 16751cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 16761cb0ef41Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 16771cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*5,$out)); 16781cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_done")); 16791cb0ef41Sopenharmony_ci 16801cb0ef41Sopenharmony_ci&set_label("xts_dec_one",16); 16811cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 16821cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*1,$inp)); 16831cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 16841cb0ef41Sopenharmony_ci if ($inline) 16851cb0ef41Sopenharmony_ci { &aesni_inline_generate1("dec"); } 16861cb0ef41Sopenharmony_ci else 16871cb0ef41Sopenharmony_ci { &call ("_aesni_decrypt1"); } 16881cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 16891cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 16901cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*1,$out)); 16911cb0ef41Sopenharmony_ci 16921cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout3); # last tweak 16931cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_done")); 16941cb0ef41Sopenharmony_ci 16951cb0ef41Sopenharmony_ci&set_label("xts_dec_two",16); 16961cb0ef41Sopenharmony_ci &movaps ($inout4,$tweak); # put 
aside last tweak 16971cb0ef41Sopenharmony_ci 16981cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 16991cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 17001cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*2,$inp)); 17011cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 17021cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 17031cb0ef41Sopenharmony_ci 17041cb0ef41Sopenharmony_ci &call ("_aesni_decrypt2"); 17051cb0ef41Sopenharmony_ci 17061cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 17071cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 17081cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 17091cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 17101cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*2,$out)); 17111cb0ef41Sopenharmony_ci 17121cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 17131cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_done")); 17141cb0ef41Sopenharmony_ci 17151cb0ef41Sopenharmony_ci&set_label("xts_dec_three",16); 17161cb0ef41Sopenharmony_ci &movaps ($inout5,$tweak); # put aside last tweak 17171cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 17181cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 17191cb0ef41Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 17201cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*3,$inp)); 17211cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 17221cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 17231cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 17241cb0ef41Sopenharmony_ci 17251cb0ef41Sopenharmony_ci &call ("_aesni_decrypt3"); 17261cb0ef41Sopenharmony_ci 17271cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 17281cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 17291cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 17301cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 17311cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 
17321cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 17331cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*3,$out)); 17341cb0ef41Sopenharmony_ci 17351cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout5); # last tweak 17361cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_done")); 17371cb0ef41Sopenharmony_ci 17381cb0ef41Sopenharmony_ci&set_label("xts_dec_four",16); 17391cb0ef41Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 17401cb0ef41Sopenharmony_ci 17411cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 17421cb0ef41Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 17431cb0ef41Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 17441cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 17451cb0ef41Sopenharmony_ci &movups ($inout3,&QWP(16*3,$inp)); 17461cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*4,$inp)); 17471cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 17481cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 17491cb0ef41Sopenharmony_ci &xorps ($inout3,$inout4); 17501cb0ef41Sopenharmony_ci 17511cb0ef41Sopenharmony_ci &call ("_aesni_decrypt4"); 17521cb0ef41Sopenharmony_ci 17531cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 17541cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 17551cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 17561cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 17571cb0ef41Sopenharmony_ci &xorps ($inout3,$inout4); 17581cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 17591cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 17601cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 17611cb0ef41Sopenharmony_ci &lea ($out,&DWP(16*4,$out)); 17621cb0ef41Sopenharmony_ci 17631cb0ef41Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 17641cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_done")); 17651cb0ef41Sopenharmony_ci 17661cb0ef41Sopenharmony_ci&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 
17671cb0ef41Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 17681cb0ef41Sopenharmony_ci &and ($len,15); 17691cb0ef41Sopenharmony_ci &jz (&label("xts_dec_ret")); 17701cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 17711cb0ef41Sopenharmony_ci &jmp (&label("xts_dec_only_one_more")); 17721cb0ef41Sopenharmony_ci 17731cb0ef41Sopenharmony_ci&set_label("xts_dec_done",16); 17741cb0ef41Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 17751cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 17761cb0ef41Sopenharmony_ci &and ($len,15); 17771cb0ef41Sopenharmony_ci &jz (&label("xts_dec_ret")); 17781cb0ef41Sopenharmony_ci 17791cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 17801cb0ef41Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 17811cb0ef41Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 17821cb0ef41Sopenharmony_ci &pxor ($twtmp,$twtmp); 17831cb0ef41Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); 17841cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 17851cb0ef41Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 17861cb0ef41Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 17871cb0ef41Sopenharmony_ci &pxor ($tweak,$twres); 17881cb0ef41Sopenharmony_ci 17891cb0ef41Sopenharmony_ci&set_label("xts_dec_only_one_more"); 17901cb0ef41Sopenharmony_ci &pshufd ($inout3,$twtmp,0x13); 17911cb0ef41Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 17921cb0ef41Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 17931cb0ef41Sopenharmony_ci &pand ($inout3,$twmask); # isolate carry and residue 17941cb0ef41Sopenharmony_ci &pxor ($inout3,$tweak); 17951cb0ef41Sopenharmony_ci 17961cb0ef41Sopenharmony_ci &mov ($key,$key_); # restore $key 17971cb0ef41Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 17981cb0ef41Sopenharmony_ci 17991cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 
18001cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 18011cb0ef41Sopenharmony_ci if ($inline) 18021cb0ef41Sopenharmony_ci { &aesni_inline_generate1("dec"); } 18031cb0ef41Sopenharmony_ci else 18041cb0ef41Sopenharmony_ci { &call ("_aesni_decrypt1"); } 18051cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 18061cb0ef41Sopenharmony_ci &movups (&QWP(0,$out),$inout0); # write output 18071cb0ef41Sopenharmony_ci 18081cb0ef41Sopenharmony_ci&set_label("xts_dec_steal"); 18091cb0ef41Sopenharmony_ci &movz ($rounds,&BP(16,$inp)); 18101cb0ef41Sopenharmony_ci &movz ($key,&BP(0,$out)); 18111cb0ef41Sopenharmony_ci &lea ($inp,&DWP(1,$inp)); 18121cb0ef41Sopenharmony_ci &mov (&BP(0,$out),&LB($rounds)); 18131cb0ef41Sopenharmony_ci &mov (&BP(16,$out),&LB($key)); 18141cb0ef41Sopenharmony_ci &lea ($out,&DWP(1,$out)); 18151cb0ef41Sopenharmony_ci &sub ($len,1); 18161cb0ef41Sopenharmony_ci &jnz (&label("xts_dec_steal")); 18171cb0ef41Sopenharmony_ci 18181cb0ef41Sopenharmony_ci &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 18191cb0ef41Sopenharmony_ci &mov ($key,$key_); # restore $key 18201cb0ef41Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 18211cb0ef41Sopenharmony_ci 18221cb0ef41Sopenharmony_ci &movups ($inout0,&QWP(0,$out)); # load input 18231cb0ef41Sopenharmony_ci &xorps ($inout0,$inout4); # input^=tweak 18241cb0ef41Sopenharmony_ci if ($inline) 18251cb0ef41Sopenharmony_ci { &aesni_inline_generate1("dec"); } 18261cb0ef41Sopenharmony_ci else 18271cb0ef41Sopenharmony_ci { &call ("_aesni_decrypt1"); } 18281cb0ef41Sopenharmony_ci &xorps ($inout0,$inout4); # output^=tweak 18291cb0ef41Sopenharmony_ci &movups (&QWP(0,$out),$inout0); # write output 18301cb0ef41Sopenharmony_ci 18311cb0ef41Sopenharmony_ci&set_label("xts_dec_ret"); 18321cb0ef41Sopenharmony_ci &pxor ("xmm0","xmm0"); # clear register bank 18331cb0ef41Sopenharmony_ci &pxor ("xmm1","xmm1"); 18341cb0ef41Sopenharmony_ci &pxor ("xmm2","xmm2"); 18351cb0ef41Sopenharmony_ci &movdqa 
(&QWP(16*0,"esp"),"xmm0"); # clear stack 18361cb0ef41Sopenharmony_ci &pxor ("xmm3","xmm3"); 18371cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),"xmm0"); 18381cb0ef41Sopenharmony_ci &pxor ("xmm4","xmm4"); 18391cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),"xmm0"); 18401cb0ef41Sopenharmony_ci &pxor ("xmm5","xmm5"); 18411cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),"xmm0"); 18421cb0ef41Sopenharmony_ci &pxor ("xmm6","xmm6"); 18431cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),"xmm0"); 18441cb0ef41Sopenharmony_ci &pxor ("xmm7","xmm7"); 18451cb0ef41Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),"xmm0"); 18461cb0ef41Sopenharmony_ci &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 18471cb0ef41Sopenharmony_ci&function_end("aesni_xts_decrypt"); 18481cb0ef41Sopenharmony_ci} 18491cb0ef41Sopenharmony_ci 18501cb0ef41Sopenharmony_ci###################################################################### 18511cb0ef41Sopenharmony_ci# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 18521cb0ef41Sopenharmony_ci# const AES_KEY *key, unsigned int start_block_num, 18531cb0ef41Sopenharmony_ci# unsigned char offset_i[16], const unsigned char L_[][16], 18541cb0ef41Sopenharmony_ci# unsigned char checksum[16]); 18551cb0ef41Sopenharmony_ci# 18561cb0ef41Sopenharmony_ci{ 18571cb0ef41Sopenharmony_ci# offsets within stack frame 18581cb0ef41Sopenharmony_cimy $checksum = 16*6; 18591cb0ef41Sopenharmony_cimy ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); 18601cb0ef41Sopenharmony_ci 18611cb0ef41Sopenharmony_ci# reassigned registers 18621cb0ef41Sopenharmony_cimy ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); 18631cb0ef41Sopenharmony_ci# $l_, $blocks, $inp, $key are permanently allocated in registers; 18641cb0ef41Sopenharmony_ci# remaining non-volatile ones are offloaded to stack, which even 18651cb0ef41Sopenharmony_ci# stay invariant after written to stack. 
# aesni_ocb_encrypt: emit the 32-bit OCB encryption routine.
#
# Arguments are fetched with wparam(0..7) in this order: inp, out, blocks,
# key, start_block_num, &offset_i, L_, &checksum.
#
# The running offset lives in $rndkey0 and the running checksum in
# $rndkey1 between batches; both are written back through the offset_i
# and checksum pointers right before function_end.
#
# Register aliases in effect: $i1, $i3 and $i5 share registers with
# $rounds, $len and $out respectively, so those three values are
# re-loaded from the stack frame after L_ indices have been computed.
# $out is kept on the stack as the difference out-inp, hence every store
# is addressed as ($out,$inp).
#
# Layout of the emitted code:
#   prologue    load arguments, build an aligned stack frame;
#   (even)      if bit 0 of start_block_num is clear, one block is
#               processed on its own and $block is incremented;
#   grandloop   six blocks per iteration;
#   short       one to five trailing blocks, dispatched by byte count;
#   done        wipe registers and stack, restore %esp, store results.
&function_begin("aesni_ocb_encrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	&mov	($rounds,"esp");		# remember caller's %esp
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	&sub	($out,$inp);			# keep out as offset from inp
	&shl	($len,4);			# blocks -> bytes
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
	&mov	(&DWP($out_off,"esp"),$out);
	&mov	(&DWP($end_off,"esp"),$len);
	&mov	(&DWP($esp_off,"esp"),$rounds);

	&mov	($rounds,&DWP(240,$key));

	&test	($block,1);
	&jnz	(&label("odd"));

	# bit 0 of $block is clear: handle a single block first
	&bsf	($i3,$block);
	&add	($block,1);
	&shl	($i3,4);			# index -> byte offset into L_
	&movdqu	($inout5,&QWP(0,$l_,$i3));
	&mov	($i3,$key);			# put aside key

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16,$inp));

	&pxor	($inout5,$rndkey0);		# ^ last offset_i
	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout5);		# ^ offset_i

	&movdqa	($inout4,$rndkey1);		# park checksum across the call
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps	($inout0,$inout5);		# ^ offset_i
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout4);		# pass the checksum

	&movups	(&QWP(-16,$out,$inp),$inout0);	# store output

	&mov	($rounds,&DWP(240,$i3));
	&mov	($key,$i3);			# restore key
	&mov	($len,&DWP($end_off,"esp"));

&set_label("odd");
	&shl	($rounds,4);
	&mov	($out,16);
	&sub	($out,$rounds);			# twisted rounds
	&mov	(&DWP($key_off,"esp"),$key);
	&lea	($key,&DWP(32,$key,$rounds));	# end of key schedule
	&mov	(&DWP($rounds_off,"esp"),$out);

	&cmp	($inp,$len);
	&ja	(&label("short"));
	&jmp	(&label("grandloop"));

&set_label("grandloop",32);
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&lea	($i5,&DWP(5,$block));
	&add	($block,6);
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&bsf	($i5,$i5);
	&shl	($i1,4);
	&shl	($i3,4);
	&shl	($i5,4);
	&movdqu	($inout0,&QWP(0,$l_));
	&movdqu	($inout1,&QWP(0,$l_,$i1));
	&mov	($rounds,&DWP($rounds_off,"esp"));	# $i1 done, reload twisted rounds
	&movdqa	($inout2,$inout0);
	&movdqu	($inout3,&QWP(0,$l_,$i3));
	&movdqa	($inout4,$inout0);
	&movdqu	($inout5,&QWP(0,$l_,$i5));

	# chain the six offsets and park them at 16*0..16*5(%esp)
	&pxor	($inout0,$rndkey0);		# ^ last offset_i
	&pxor	($inout1,$inout0);
	&movdqa	(&QWP(16*0,"esp"),$inout0);
	&pxor	($inout2,$inout1);
	&movdqa	(&QWP(16*1,"esp"),$inout1);
	&pxor	($inout3,$inout2);
	&movdqa	(&QWP(16*2,"esp"),$inout2);
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*3,"esp"),$inout3);
	&pxor	($inout5,$inout4);
	&movdqa	(&QWP(16*4,"esp"),$inout4);
	&movdqa	(&QWP(16*5,"esp"),$inout5);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&movdqu	($inout5,&QWP(16*5,$inp));
	&lea	($inp,&DWP(16*6,$inp));

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$rndkey0);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$rndkey0);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$rndkey0);
	&pxor	($rndkey1,$inout4);
	&pxor	($inout4,$rndkey0);
	&pxor	($rndkey1,$inout5);
	&pxor	($inout5,$rndkey0);
	&movdqa	(&QWP($checksum,"esp"),$rndkey1);	# $rndkey1 is reused for round keys below

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout5,&QWP(16*5,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($inout1,$rndkey1);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	&mov	($out,&DWP($out_off,"esp"));	# $i5 done, reload out-inp
	&mov	($len,&DWP($end_off,"esp"));	# $i3 done, reload end marker
	&call	("_aesni_encrypt6_enter");

	&movdqa	($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout5,$rndkey0);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	&movdqu	(&QWP(-16*6,$out,$inp),$inout0);# store output
	&movdqu	(&QWP(-16*5,$out,$inp),$inout1);
	&movdqu	(&QWP(-16*4,$out,$inp),$inout2);
	&movdqu	(&QWP(-16*3,$out,$inp),$inout3);
	&movdqu	(&QWP(-16*2,$out,$inp),$inout4);
	&movdqu	(&QWP(-16*1,$out,$inp),$inout5);
	&cmp	($inp,$len);			# done yet?
	&jbe	(&label("grandloop"));

&set_label("short");
	&add	($len,16*6);
	&sub	($len,$inp);			# bytes left to process
	&jz	(&label("done"));

	&cmp	($len,16*2);
	&jb	(&label("one"));
	&je	(&label("two"));

	&cmp	($len,16*4);
	&jb	(&label("three"));
	&je	(&label("four"));

	# fall through: five blocks left, run them through the 6x code path
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&shl	($i1,4);
	&shl	($i3,4);
	&movdqu	($inout0,&QWP(0,$l_));
	&movdqu	($inout1,&QWP(0,$l_,$i1));
	&mov	($rounds,&DWP($rounds_off,"esp"));
	&movdqa	($inout2,$inout0);
	&movdqu	($inout3,&QWP(0,$l_,$i3));
	&movdqa	($inout4,$inout0);

	&pxor	($inout0,$rndkey0);		# ^ last offset_i
	&pxor	($inout1,$inout0);
	&movdqa	(&QWP(16*0,"esp"),$inout0);
	&pxor	($inout2,$inout1);
	&movdqa	(&QWP(16*1,"esp"),$inout1);
	&pxor	($inout3,$inout2);
	&movdqa	(&QWP(16*2,"esp"),$inout2);
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*3,"esp"),$inout3);
	&pxor	($inout5,$inout4);		# result unused: $inout5 is zeroed below
	&movdqa	(&QWP(16*4,"esp"),$inout4);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout5,$inout5);		# sixth lane carries no data

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$rndkey0);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$rndkey0);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$rndkey0);
	&pxor	($rndkey1,$inout4);
	&pxor	($inout4,$rndkey0);
	&movdqa	(&QWP($checksum,"esp"),$rndkey1);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($inout1,$rndkey1);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt6_enter");

	&movdqa	($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,$rndkey0);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	&movdqu	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movdqu	(&QWP(16*1,$out,$inp),$inout1);
	&movdqu	(&QWP(16*2,$out,$inp),$inout2);
	&movdqu	(&QWP(16*3,$out,$inp),$inout3);
	&movdqu	(&QWP(16*4,$out,$inp),$inout4);

	&jmp	(&label("done"));

&set_label("one",16);
	&movdqu	($inout5,&QWP(0,$l_));
	&mov	($key,&DWP($key_off,"esp"));	# restore key

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&mov	($rounds,&DWP(240,$key));

	&pxor	($inout5,$rndkey0);		# ^ last offset_i
	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout5);		# ^ offset_i

	&movdqa	($inout4,$rndkey1);		# park checksum across the call
	&mov	($out,&DWP($out_off,"esp"));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps	($inout0,$inout5);		# ^ offset_i
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout4);		# pass the checksum
	&movups	(&QWP(0,$out,$inp),$inout0);

	&jmp	(&label("done"));

&set_label("two",16);
	&lea	($i1,&DWP(1,$block));
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&bsf	($i1,$i1);
	&shl	($i1,4);
	&movdqu	($inout4,&QWP(0,$l_));
	&movdqu	($inout5,&QWP(0,$l_,$i1));

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&mov	($rounds,&DWP(240,$key));

	&pxor	($inout4,$rndkey0);		# ^ last offset_i
	&pxor	($inout5,$inout4);

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout4);		# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$inout5);

	&movdqa	($inout3,$rndkey1);		# park checksum across the call
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt2");

	&xorps	($inout0,$inout4);		# ^ offset_i
	&xorps	($inout1,$inout5);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout3);		# pass the checksum
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups	(&QWP(16*1,$out,$inp),$inout1);

	&jmp	(&label("done"));

&set_label("three",16);
	&lea	($i1,&DWP(1,$block));
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&bsf	($i1,$i1);
	&shl	($i1,4);
	&movdqu	($inout3,&QWP(0,$l_));
	&movdqu	($inout4,&QWP(0,$l_,$i1));
	&movdqa	($inout5,$inout3);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&mov	($rounds,&DWP(240,$key));

	&pxor	($inout3,$rndkey0);		# ^ last offset_i
	&pxor	($inout4,$inout3);
	&pxor	($inout5,$inout4);

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout3);		# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$inout4);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$inout5);

	&movdqa	(&QWP($checksum,"esp"),$rndkey1);
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt3");

	&xorps	($inout0,$inout3);		# ^ offset_i
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups	(&QWP(16*1,$out,$inp),$inout1);
	&movups	(&QWP(16*2,$out,$inp),$inout2);

	&jmp	(&label("done"));

&set_label("four",16);
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&shl	($i1,4);
	&shl	($i3,4);
	&movdqu	($inout2,&QWP(0,$l_));
	&movdqu	($inout3,&QWP(0,$l_,$i1));
	&movdqa	($inout4,$inout2);
	&movdqu	($inout5,&QWP(0,$l_,$i3));

	# first two offsets go to the stack, last two stay in $inout4/$inout5
	&pxor	($inout2,$rndkey0);		# ^ last offset_i
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&pxor	($inout3,$inout2);
	&movdqu	($inout1,&QWP(16*1,$inp));
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*0,"esp"),$inout2);
	&pxor	($inout5,$inout4);
	&movdqa	(&QWP(16*1,"esp"),$inout3);
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&mov	($rounds,&DWP(240,$key));

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$inout4);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$inout5);

	&movdqa	(&QWP($checksum,"esp"),$rndkey1);
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout4);
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&xorps	($inout3,$inout5);
	&movups	(&QWP(16*1,$out,$inp),$inout1);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movups	(&QWP(16*2,$out,$inp),$inout2);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups	(&QWP(16*3,$out,$inp),$inout3);

&set_label("done");
	&mov	($key,&DWP($esp_off,"esp"));	# saved %esp
	&pxor	($inout0,$inout0);		# clear register bank
	&pxor	($inout1,$inout1);
	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
	&pxor	($inout2,$inout2);
	&movdqa	(&QWP(16*1,"esp"),$inout0);
	&pxor	($inout3,$inout3);
	&movdqa	(&QWP(16*2,"esp"),$inout0);
	&pxor	($inout4,$inout4);
	&movdqa	(&QWP(16*3,"esp"),$inout0);
	&pxor	($inout5,$inout5);
	&movdqa	(&QWP(16*4,"esp"),$inout0);
	&movdqa	(&QWP(16*5,"esp"),$inout0);
	&movdqa	(&QWP(16*6,"esp"),$inout0);

	&lea	("esp",&DWP(0,$key));		# restore %esp
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum
	&movdqu	(&QWP(0,$rounds),$rndkey0);	# store final offset_i
	&pxor	($rndkey0,$rndkey0);
	&movdqu	(&QWP(0,$rounds_),$rndkey1);	# store final checksum
	&pxor	($rndkey1,$rndkey1);
&function_end("aesni_ocb_encrypt");

&function_begin("aesni_ocb_decrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	&mov	($rounds,"esp");
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	&sub	($out,$inp);
	&shl	($len,4);
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input -
16*6 23141cb0ef41Sopenharmony_ci &mov (&DWP($out_off,"esp"),$out); 23151cb0ef41Sopenharmony_ci &mov (&DWP($end_off,"esp"),$len); 23161cb0ef41Sopenharmony_ci &mov (&DWP($esp_off,"esp"),$rounds); 23171cb0ef41Sopenharmony_ci 23181cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 23191cb0ef41Sopenharmony_ci 23201cb0ef41Sopenharmony_ci &test ($block,1); 23211cb0ef41Sopenharmony_ci &jnz (&label("odd")); 23221cb0ef41Sopenharmony_ci 23231cb0ef41Sopenharmony_ci &bsf ($i3,$block); 23241cb0ef41Sopenharmony_ci &add ($block,1); 23251cb0ef41Sopenharmony_ci &shl ($i3,4); 23261cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i3)); 23271cb0ef41Sopenharmony_ci &mov ($i3,$key); # put aside key 23281cb0ef41Sopenharmony_ci 23291cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 23301cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16,$inp)); 23311cb0ef41Sopenharmony_ci 23321cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); # ^ last offset_i 23331cb0ef41Sopenharmony_ci &pxor ($inout0,$inout5); # ^ offset_i 23341cb0ef41Sopenharmony_ci 23351cb0ef41Sopenharmony_ci &movdqa ($inout4,$rndkey1); 23361cb0ef41Sopenharmony_ci if ($inline) 23371cb0ef41Sopenharmony_ci { &aesni_inline_generate1("dec"); } 23381cb0ef41Sopenharmony_ci else 23391cb0ef41Sopenharmony_ci { &call ("_aesni_decrypt1"); } 23401cb0ef41Sopenharmony_ci 23411cb0ef41Sopenharmony_ci &xorps ($inout0,$inout5); # ^ offset_i 23421cb0ef41Sopenharmony_ci &movaps ($rndkey1,$inout4); # pass the checksum 23431cb0ef41Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 23441cb0ef41Sopenharmony_ci &xorps ($rndkey1,$inout0); # checksum 23451cb0ef41Sopenharmony_ci &movups (&QWP(-16,$out,$inp),$inout0); # store output 23461cb0ef41Sopenharmony_ci 23471cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$i3)); 23481cb0ef41Sopenharmony_ci &mov ($key,$i3); # restore key 23491cb0ef41Sopenharmony_ci &mov ($len,&DWP($end_off,"esp")); 23501cb0ef41Sopenharmony_ci 23511cb0ef41Sopenharmony_ci&set_label("odd"); 
23521cb0ef41Sopenharmony_ci &shl ($rounds,4); 23531cb0ef41Sopenharmony_ci &mov ($out,16); 23541cb0ef41Sopenharmony_ci &sub ($out,$rounds); # twisted rounds 23551cb0ef41Sopenharmony_ci &mov (&DWP($key_off,"esp"),$key); 23561cb0ef41Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 23571cb0ef41Sopenharmony_ci &mov (&DWP($rounds_off,"esp"),$out); 23581cb0ef41Sopenharmony_ci 23591cb0ef41Sopenharmony_ci &cmp ($inp,$len); 23601cb0ef41Sopenharmony_ci &ja (&label("short")); 23611cb0ef41Sopenharmony_ci &jmp (&label("grandloop")); 23621cb0ef41Sopenharmony_ci 23631cb0ef41Sopenharmony_ci&set_label("grandloop",32); 23641cb0ef41Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 23651cb0ef41Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 23661cb0ef41Sopenharmony_ci &lea ($i5,&DWP(5,$block)); 23671cb0ef41Sopenharmony_ci &add ($block,6); 23681cb0ef41Sopenharmony_ci &bsf ($i1,$i1); 23691cb0ef41Sopenharmony_ci &bsf ($i3,$i3); 23701cb0ef41Sopenharmony_ci &bsf ($i5,$i5); 23711cb0ef41Sopenharmony_ci &shl ($i1,4); 23721cb0ef41Sopenharmony_ci &shl ($i3,4); 23731cb0ef41Sopenharmony_ci &shl ($i5,4); 23741cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(0,$l_)); 23751cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(0,$l_,$i1)); 23761cb0ef41Sopenharmony_ci &mov ($rounds,&DWP($rounds_off,"esp")); 23771cb0ef41Sopenharmony_ci &movdqa ($inout2,$inout0); 23781cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i3)); 23791cb0ef41Sopenharmony_ci &movdqa ($inout4,$inout0); 23801cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i5)); 23811cb0ef41Sopenharmony_ci 23821cb0ef41Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ last offset_i 23831cb0ef41Sopenharmony_ci &pxor ($inout1,$inout0); 23841cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); 23851cb0ef41Sopenharmony_ci &pxor ($inout2,$inout1); 23861cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout1); 23871cb0ef41Sopenharmony_ci &pxor ($inout3,$inout2); 23881cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout2); 
23891cb0ef41Sopenharmony_ci &pxor ($inout4,$inout3); 23901cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout3); 23911cb0ef41Sopenharmony_ci &pxor ($inout5,$inout4); 23921cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout4); 23931cb0ef41Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),$inout5); 23941cb0ef41Sopenharmony_ci 23951cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 23961cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 23971cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 23981cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 23991cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 24001cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 24011cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(16*5,$inp)); 24021cb0ef41Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 24031cb0ef41Sopenharmony_ci 24041cb0ef41Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 24051cb0ef41Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ roundkey[0] 24061cb0ef41Sopenharmony_ci &pxor ($inout1,$rndkey0); 24071cb0ef41Sopenharmony_ci &pxor ($inout2,$rndkey0); 24081cb0ef41Sopenharmony_ci &pxor ($inout3,$rndkey0); 24091cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); 24101cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); 24111cb0ef41Sopenharmony_ci 24121cb0ef41Sopenharmony_ci &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 24131cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 24141cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 24151cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 24161cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 24171cb0ef41Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 24181cb0ef41Sopenharmony_ci &pxor ($inout5,&QWP(16*5,"esp")); 24191cb0ef41Sopenharmony_ci 24201cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 24211cb0ef41Sopenharmony_ci &aesdec ($inout0,$rndkey1); 24221cb0ef41Sopenharmony_ci &aesdec ($inout1,$rndkey1); 
24231cb0ef41Sopenharmony_ci &aesdec ($inout2,$rndkey1); 24241cb0ef41Sopenharmony_ci &aesdec ($inout3,$rndkey1); 24251cb0ef41Sopenharmony_ci &aesdec ($inout4,$rndkey1); 24261cb0ef41Sopenharmony_ci &aesdec ($inout5,$rndkey1); 24271cb0ef41Sopenharmony_ci 24281cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 24291cb0ef41Sopenharmony_ci &mov ($len,&DWP($end_off,"esp")); 24301cb0ef41Sopenharmony_ci &call ("_aesni_decrypt6_enter"); 24311cb0ef41Sopenharmony_ci 24321cb0ef41Sopenharmony_ci &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 24331cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 24341cb0ef41Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp")); 24351cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 24361cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 24371cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 24381cb0ef41Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 24391cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); 24401cb0ef41Sopenharmony_ci 24411cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 24421cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 24431cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout1); 24441cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*5,$out,$inp),$inout1); 24451cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout2); 24461cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*4,$out,$inp),$inout2); 24471cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout3); 24481cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*3,$out,$inp),$inout3); 24491cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout4); 24501cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*2,$out,$inp),$inout4); 24511cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout5); 24521cb0ef41Sopenharmony_ci &movdqu (&QWP(-16*1,$out,$inp),$inout5); 24531cb0ef41Sopenharmony_ci &cmp ($inp,$len); # done yet? 
24541cb0ef41Sopenharmony_ci &jbe (&label("grandloop")); 24551cb0ef41Sopenharmony_ci 24561cb0ef41Sopenharmony_ci&set_label("short"); 24571cb0ef41Sopenharmony_ci &add ($len,16*6); 24581cb0ef41Sopenharmony_ci &sub ($len,$inp); 24591cb0ef41Sopenharmony_ci &jz (&label("done")); 24601cb0ef41Sopenharmony_ci 24611cb0ef41Sopenharmony_ci &cmp ($len,16*2); 24621cb0ef41Sopenharmony_ci &jb (&label("one")); 24631cb0ef41Sopenharmony_ci &je (&label("two")); 24641cb0ef41Sopenharmony_ci 24651cb0ef41Sopenharmony_ci &cmp ($len,16*4); 24661cb0ef41Sopenharmony_ci &jb (&label("three")); 24671cb0ef41Sopenharmony_ci &je (&label("four")); 24681cb0ef41Sopenharmony_ci 24691cb0ef41Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 24701cb0ef41Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 24711cb0ef41Sopenharmony_ci &bsf ($i1,$i1); 24721cb0ef41Sopenharmony_ci &bsf ($i3,$i3); 24731cb0ef41Sopenharmony_ci &shl ($i1,4); 24741cb0ef41Sopenharmony_ci &shl ($i3,4); 24751cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(0,$l_)); 24761cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(0,$l_,$i1)); 24771cb0ef41Sopenharmony_ci &mov ($rounds,&DWP($rounds_off,"esp")); 24781cb0ef41Sopenharmony_ci &movdqa ($inout2,$inout0); 24791cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i3)); 24801cb0ef41Sopenharmony_ci &movdqa ($inout4,$inout0); 24811cb0ef41Sopenharmony_ci 24821cb0ef41Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ last offset_i 24831cb0ef41Sopenharmony_ci &pxor ($inout1,$inout0); 24841cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); 24851cb0ef41Sopenharmony_ci &pxor ($inout2,$inout1); 24861cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout1); 24871cb0ef41Sopenharmony_ci &pxor ($inout3,$inout2); 24881cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout2); 24891cb0ef41Sopenharmony_ci &pxor ($inout4,$inout3); 24901cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout3); 24911cb0ef41Sopenharmony_ci &pxor ($inout5,$inout4); 24921cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout4); 
24931cb0ef41Sopenharmony_ci 24941cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 24951cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 24961cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 24971cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 24981cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 24991cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 25001cb0ef41Sopenharmony_ci &pxor ($inout5,$inout5); 25011cb0ef41Sopenharmony_ci 25021cb0ef41Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 25031cb0ef41Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ roundkey[0] 25041cb0ef41Sopenharmony_ci &pxor ($inout1,$rndkey0); 25051cb0ef41Sopenharmony_ci &pxor ($inout2,$rndkey0); 25061cb0ef41Sopenharmony_ci &pxor ($inout3,$rndkey0); 25071cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); 25081cb0ef41Sopenharmony_ci 25091cb0ef41Sopenharmony_ci &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 25101cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 25111cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 25121cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 25131cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 25141cb0ef41Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 25151cb0ef41Sopenharmony_ci 25161cb0ef41Sopenharmony_ci &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 25171cb0ef41Sopenharmony_ci &aesdec ($inout0,$rndkey1); 25181cb0ef41Sopenharmony_ci &aesdec ($inout1,$rndkey1); 25191cb0ef41Sopenharmony_ci &aesdec ($inout2,$rndkey1); 25201cb0ef41Sopenharmony_ci &aesdec ($inout3,$rndkey1); 25211cb0ef41Sopenharmony_ci &aesdec ($inout4,$rndkey1); 25221cb0ef41Sopenharmony_ci &aesdec ($inout5,$rndkey1); 25231cb0ef41Sopenharmony_ci 25241cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 25251cb0ef41Sopenharmony_ci &call ("_aesni_decrypt6_enter"); 25261cb0ef41Sopenharmony_ci 25271cb0ef41Sopenharmony_ci &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 
25281cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 25291cb0ef41Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp")); 25301cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 25311cb0ef41Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 25321cb0ef41Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 25331cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); 25341cb0ef41Sopenharmony_ci 25351cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 25361cb0ef41Sopenharmony_ci &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 25371cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout1); 25381cb0ef41Sopenharmony_ci &movdqu (&QWP(16*1,$out,$inp),$inout1); 25391cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout2); 25401cb0ef41Sopenharmony_ci &movdqu (&QWP(16*2,$out,$inp),$inout2); 25411cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout3); 25421cb0ef41Sopenharmony_ci &movdqu (&QWP(16*3,$out,$inp),$inout3); 25431cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout4); 25441cb0ef41Sopenharmony_ci &movdqu (&QWP(16*4,$out,$inp),$inout4); 25451cb0ef41Sopenharmony_ci 25461cb0ef41Sopenharmony_ci &jmp (&label("done")); 25471cb0ef41Sopenharmony_ci 25481cb0ef41Sopenharmony_ci&set_label("one",16); 25491cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_)); 25501cb0ef41Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 25511cb0ef41Sopenharmony_ci 25521cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 25531cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 25541cb0ef41Sopenharmony_ci 25551cb0ef41Sopenharmony_ci &pxor ($inout5,$rndkey0); # ^ last offset_i 25561cb0ef41Sopenharmony_ci &pxor ($inout0,$inout5); # ^ offset_i 25571cb0ef41Sopenharmony_ci 25581cb0ef41Sopenharmony_ci &movdqa ($inout4,$rndkey1); 25591cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 25601cb0ef41Sopenharmony_ci if ($inline) 25611cb0ef41Sopenharmony_ci { &aesni_inline_generate1("dec"); } 25621cb0ef41Sopenharmony_ci else 25631cb0ef41Sopenharmony_ci { &call 
("_aesni_decrypt1"); } 25641cb0ef41Sopenharmony_ci 25651cb0ef41Sopenharmony_ci &xorps ($inout0,$inout5); # ^ offset_i 25661cb0ef41Sopenharmony_ci &movaps ($rndkey1,$inout4); # pass the checksum 25671cb0ef41Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 25681cb0ef41Sopenharmony_ci &xorps ($rndkey1,$inout0); # checksum 25691cb0ef41Sopenharmony_ci &movups (&QWP(0,$out,$inp),$inout0); 25701cb0ef41Sopenharmony_ci 25711cb0ef41Sopenharmony_ci &jmp (&label("done")); 25721cb0ef41Sopenharmony_ci 25731cb0ef41Sopenharmony_ci&set_label("two",16); 25741cb0ef41Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 25751cb0ef41Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 25761cb0ef41Sopenharmony_ci &bsf ($i1,$i1); 25771cb0ef41Sopenharmony_ci &shl ($i1,4); 25781cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(0,$l_)); 25791cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i1)); 25801cb0ef41Sopenharmony_ci 25811cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 25821cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 25831cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 25841cb0ef41Sopenharmony_ci 25851cb0ef41Sopenharmony_ci &movdqa ($inout3,$rndkey1); 25861cb0ef41Sopenharmony_ci &pxor ($inout4,$rndkey0); # ^ last offset_i 25871cb0ef41Sopenharmony_ci &pxor ($inout5,$inout4); 25881cb0ef41Sopenharmony_ci 25891cb0ef41Sopenharmony_ci &pxor ($inout0,$inout4); # ^ offset_i 25901cb0ef41Sopenharmony_ci &pxor ($inout1,$inout5); 25911cb0ef41Sopenharmony_ci 25921cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 25931cb0ef41Sopenharmony_ci &call ("_aesni_decrypt2"); 25941cb0ef41Sopenharmony_ci 25951cb0ef41Sopenharmony_ci &xorps ($inout0,$inout4); # ^ offset_i 25961cb0ef41Sopenharmony_ci &xorps ($inout1,$inout5); 25971cb0ef41Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 25981cb0ef41Sopenharmony_ci &xorps ($inout3,$inout0); # checksum 25991cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out,$inp),$inout0); # store output 
26001cb0ef41Sopenharmony_ci &xorps ($inout3,$inout1); 26011cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 26021cb0ef41Sopenharmony_ci &movaps ($rndkey1,$inout3); # pass the checksum 26031cb0ef41Sopenharmony_ci 26041cb0ef41Sopenharmony_ci &jmp (&label("done")); 26051cb0ef41Sopenharmony_ci 26061cb0ef41Sopenharmony_ci&set_label("three",16); 26071cb0ef41Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 26081cb0ef41Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 26091cb0ef41Sopenharmony_ci &bsf ($i1,$i1); 26101cb0ef41Sopenharmony_ci &shl ($i1,4); 26111cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_)); 26121cb0ef41Sopenharmony_ci &movdqu ($inout4,&QWP(0,$l_,$i1)); 26131cb0ef41Sopenharmony_ci &movdqa ($inout5,$inout3); 26141cb0ef41Sopenharmony_ci 26151cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 26161cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 26171cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 26181cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 26191cb0ef41Sopenharmony_ci 26201cb0ef41Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 26211cb0ef41Sopenharmony_ci &pxor ($inout3,$rndkey0); # ^ last offset_i 26221cb0ef41Sopenharmony_ci &pxor ($inout4,$inout3); 26231cb0ef41Sopenharmony_ci &pxor ($inout5,$inout4); 26241cb0ef41Sopenharmony_ci 26251cb0ef41Sopenharmony_ci &pxor ($inout0,$inout3); # ^ offset_i 26261cb0ef41Sopenharmony_ci &pxor ($inout1,$inout4); 26271cb0ef41Sopenharmony_ci &pxor ($inout2,$inout5); 26281cb0ef41Sopenharmony_ci 26291cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 26301cb0ef41Sopenharmony_ci &call ("_aesni_decrypt3"); 26311cb0ef41Sopenharmony_ci 26321cb0ef41Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 26331cb0ef41Sopenharmony_ci &xorps ($inout0,$inout3); # ^ offset_i 26341cb0ef41Sopenharmony_ci &xorps ($inout1,$inout4); 26351cb0ef41Sopenharmony_ci &xorps ($inout2,$inout5); 26361cb0ef41Sopenharmony_ci &movups 
(&QWP(16*0,$out,$inp),$inout0); # store output 26371cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 26381cb0ef41Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 26391cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 26401cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout1); 26411cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out,$inp),$inout2); 26421cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout2); 26431cb0ef41Sopenharmony_ci 26441cb0ef41Sopenharmony_ci &jmp (&label("done")); 26451cb0ef41Sopenharmony_ci 26461cb0ef41Sopenharmony_ci&set_label("four",16); 26471cb0ef41Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 26481cb0ef41Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 26491cb0ef41Sopenharmony_ci &bsf ($i1,$i1); 26501cb0ef41Sopenharmony_ci &bsf ($i3,$i3); 26511cb0ef41Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 26521cb0ef41Sopenharmony_ci &shl ($i1,4); 26531cb0ef41Sopenharmony_ci &shl ($i3,4); 26541cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(0,$l_)); 26551cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i1)); 26561cb0ef41Sopenharmony_ci &movdqa ($inout4,$inout2); 26571cb0ef41Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i3)); 26581cb0ef41Sopenharmony_ci 26591cb0ef41Sopenharmony_ci &pxor ($inout2,$rndkey0); # ^ last offset_i 26601cb0ef41Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 26611cb0ef41Sopenharmony_ci &pxor ($inout3,$inout2); 26621cb0ef41Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 26631cb0ef41Sopenharmony_ci &pxor ($inout4,$inout3); 26641cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout2); 26651cb0ef41Sopenharmony_ci &pxor ($inout5,$inout4); 26661cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout3); 26671cb0ef41Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 26681cb0ef41Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 26691cb0ef41Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 26701cb0ef41Sopenharmony_ci 26711cb0ef41Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 
26721cb0ef41Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 26731cb0ef41Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 26741cb0ef41Sopenharmony_ci &pxor ($inout2,$inout4); 26751cb0ef41Sopenharmony_ci &pxor ($inout3,$inout5); 26761cb0ef41Sopenharmony_ci 26771cb0ef41Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 26781cb0ef41Sopenharmony_ci &call ("_aesni_decrypt4"); 26791cb0ef41Sopenharmony_ci 26801cb0ef41Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 26811cb0ef41Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 26821cb0ef41Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 26831cb0ef41Sopenharmony_ci &xorps ($inout2,$inout4); 26841cb0ef41Sopenharmony_ci &movups (&QWP(16*0,$out,$inp),$inout0); # store output 26851cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 26861cb0ef41Sopenharmony_ci &xorps ($inout3,$inout5); 26871cb0ef41Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 26881cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout1); 26891cb0ef41Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 26901cb0ef41Sopenharmony_ci &movups (&QWP(16*2,$out,$inp),$inout2); 26911cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout2); 26921cb0ef41Sopenharmony_ci &movups (&QWP(16*3,$out,$inp),$inout3); 26931cb0ef41Sopenharmony_ci &pxor ($rndkey1,$inout3); 26941cb0ef41Sopenharmony_ci 26951cb0ef41Sopenharmony_ci&set_label("done"); 26961cb0ef41Sopenharmony_ci &mov ($key,&DWP($esp_off,"esp")); 26971cb0ef41Sopenharmony_ci &pxor ($inout0,$inout0); # clear register bank 26981cb0ef41Sopenharmony_ci &pxor ($inout1,$inout1); 26991cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 27001cb0ef41Sopenharmony_ci &pxor ($inout2,$inout2); 27011cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout0); 27021cb0ef41Sopenharmony_ci &pxor ($inout3,$inout3); 27031cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout0); 27041cb0ef41Sopenharmony_ci &pxor ($inout4,$inout4); 
27051cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout0); 27061cb0ef41Sopenharmony_ci &pxor ($inout5,$inout5); 27071cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout0); 27081cb0ef41Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),$inout0); 27091cb0ef41Sopenharmony_ci &movdqa (&QWP(16*6,"esp"),$inout0); 27101cb0ef41Sopenharmony_ci 27111cb0ef41Sopenharmony_ci &lea ("esp",&DWP(0,$key)); 27121cb0ef41Sopenharmony_ci &mov ($rounds,&wparam(5)); # &offset_i 27131cb0ef41Sopenharmony_ci &mov ($rounds_,&wparam(7)); # &checksum 27141cb0ef41Sopenharmony_ci &movdqu (&QWP(0,$rounds),$rndkey0); 27151cb0ef41Sopenharmony_ci &pxor ($rndkey0,$rndkey0); 27161cb0ef41Sopenharmony_ci &movdqu (&QWP(0,$rounds_),$rndkey1); 27171cb0ef41Sopenharmony_ci &pxor ($rndkey1,$rndkey1); 27181cb0ef41Sopenharmony_ci&function_end("aesni_ocb_decrypt"); 27191cb0ef41Sopenharmony_ci} 27201cb0ef41Sopenharmony_ci} 27211cb0ef41Sopenharmony_ci 27221cb0ef41Sopenharmony_ci###################################################################### 27231cb0ef41Sopenharmony_ci# void $PREFIX_cbc_encrypt (const void *inp, void *out, 27241cb0ef41Sopenharmony_ci# size_t length, const AES_KEY *key, 27251cb0ef41Sopenharmony_ci# unsigned char *ivp,const int enc); 27261cb0ef41Sopenharmony_ci&function_begin("${PREFIX}_cbc_encrypt"); 27271cb0ef41Sopenharmony_ci &mov ($inp,&wparam(0)); 27281cb0ef41Sopenharmony_ci &mov ($rounds_,"esp"); 27291cb0ef41Sopenharmony_ci &mov ($out,&wparam(1)); 27301cb0ef41Sopenharmony_ci &sub ($rounds_,24); 27311cb0ef41Sopenharmony_ci &mov ($len,&wparam(2)); 27321cb0ef41Sopenharmony_ci &and ($rounds_,-16); 27331cb0ef41Sopenharmony_ci &mov ($key,&wparam(3)); 27341cb0ef41Sopenharmony_ci &mov ($key_,&wparam(4)); 27351cb0ef41Sopenharmony_ci &test ($len,$len); 27361cb0ef41Sopenharmony_ci &jz (&label("cbc_abort")); 27371cb0ef41Sopenharmony_ci 27381cb0ef41Sopenharmony_ci &cmp (&wparam(5),0); 27391cb0ef41Sopenharmony_ci &xchg ($rounds_,"esp"); # alloca 27401cb0ef41Sopenharmony_ci &movups 
($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	# Flags still come from the earlier cmp of wparam(5) with 0;
	# the mov/movups/xchg in between do not modify them.
	&je	(&label("cbc_decrypt"));

	# ---- encryption: one block at a time, chaining value in $inout0
	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(0,$out),$inout0);	# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	# Borrow: fewer than 16 bytes were left.  Undo the subtraction;
	# a non-zero remainder is a partial final block.
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);
	&pxor	($inout0,$inout0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_enc_tail");
	# Partial final block: copy the $len leftover bytes to the
	# output, pad with 16-$len zero bytes, then encrypt that block
	# in place by re-entering the loop with $inp == $out and
	# $len == 0.  Presumably $inp/$out are %esi/%edi as the string
	# instructions require (register mapping not visible here).
	&mov	("ecx",$len);		# zaps $rounds
	&data_word(0xA4F3F689);		# rep movsb
	&mov	("ecx",16);		# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");		# zaps $len
	&data_word(0xAAF3F689);		# rep stosb
	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($inp,$out);		# $inp and $out are the same
	&mov	($key,$key_);		# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
	# ---- decryption: six blocks per iteration while more than 0x50
	# bytes remain; $len is kept biased by -0x50 inside the loop.
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

&set_label("cbc_dec_loop6",16);
	# The sixth output block of the previous iteration is stored
	# here, after its ciphertext was picked up as the next IV.
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	# Each decrypted block is XORed with the preceding ciphertext
	# block, which is re-read from the input; the first one uses the
	# IV saved on the stack.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	# Loop exit: the sixth block is still in $inout5 and the next IV
	# in $rndkey0.  Remove the bias from $len; if nothing beyond the
	# pending block is left, go straight to the collection point.
	&movaps	($inout0,$inout5);
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_clear_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
	# Up to 0x50 bytes remain: load blocks one by one and dispatch
	# on the length.  $in0/$in1 keep copies of the first two
	# ciphertext blocks for the chaining XOR.
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	# Five blocks: run them through the 6x routine with the sixth
	# lane zeroed.  $inout0 is loaded again from the input before
	# the call.
	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&pxor	($inout3,$inout3);
	&lea	($out,&DWP(0x40,$out));
	# The last block is handed to the collection point in $inout0.
	&movaps	($inout0,$inout4);
	&pxor	($inout4,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&call	("_aesni_decrypt2");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&pxor	($inout3,$inout3);
	&sub	($len,0x40);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_clear_tail_collected",16);
	# Entered from the 6x loop exit: wipe the lanes that still hold
	# decrypted data.
	&pxor	($inout1,$inout1);
	&pxor	($inout2,$inout2);
	&pxor	($inout3,$inout3);
	&pxor	($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
	# Collection point: the final decrypted block is in $inout0 and
	# not stored yet.  If the remaining length is not a multiple of
	# 16, take the partial path.
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_dec_tail_partial",16);
	# Spill the final block to the stack and copy 16-($len&15) bytes
	# of it to the output with rep movsb.
	&movaps	(&QWP(0,"esp"),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);		# rep movsb
	&movdqa	(&QWP(0,"esp"),$inout0);

&set_label("cbc_ret");
	# Common exit: restore %esp first so that &wparam(4) addresses
	# the caller's ivp again, then write the updated IV back.
	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
	&mov	($key_,&wparam(4));
	&pxor	($inout0,$inout0);
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,$key_),$ivec);	# output IV
	&pxor	($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not 128, 192 or 256
#	$rounds	number of rounds minus one (9, 11 or 13); the same value
#		is stored at byte offset 240 from the start of *key
#
# Two code paths exist per key size. The classic one is built around
# aeskeygenassist; the "_alt" one uses pshufb+aesenclast and is taken
# when, of the AVX and XOP capability bits, only AVX is set.
# On every exit that has loaded key material, the xmm registers that
# held it are zeroed.

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# position-independent way to get address of key_const into ebx
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

# ---- 128-bit key, aeskeygenassist path --------------------------------
&set_label("10rounds",16);
	&cmp		("ebp",1<<28);		# AVX set, XOP clear?
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

# key_128 stores the previous round key and advances $key before mixing;
# key_128_cold skips the store (round 0 was already written by caller).
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

# ---- 128-bit key, pshufb+aesenclast path ------------------------------
&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# byte-shuffle mask
	&mov		($rounds,8);			# loop counter
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# starts at 1, doubled per round
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");	# round 0

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# last two round keys use the 0x1b constant row (and its double)
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

# ---- 192-bit key, aeskeygenassist path --------------------------------
&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp		("ebp",1<<28);		# AVX set, XOP clear?
	&je		(&label("12rounds_alt"));

	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);

	&jmp	(&label("good_key"));

# key_192a stores one 16-byte block and advances $key by 16;
# key_192b stores two blocks and advances by 32, then joins the
# shared mixing code at key_192b_warm.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

# ---- 192-bit key, pshufb+aesenclast path ------------------------------
&set_label("12rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x10,"ebx"));	# byte-shuffle mask
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# starts at 1, doubled per round
	&mov		($rounds,8);			# loop counter
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
	&movq		(&QWP(0,$key),"xmm2");
	&movdqa		("xmm1","xmm2");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(24,$key));	# 192 bits of schedule per iteration

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");

	&pshufd		("xmm3","xmm0",0xff);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");

	&pxor		("xmm0","xmm2");
	&pxor		("xmm2","xmm3");
	&movdqu		(&QWP(-16,$key),"xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key192"));

	&mov		($rounds,11);
	&mov		(&DWP(32,$key),$rounds);

	&jmp	(&label("good_key"));

# ---- 256-bit key, aeskeygenassist path --------------------------------
&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);		# AVX set, XOP clear?
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

# key_256a stores xmm2 and derives the next even round key in xmm0;
# key_256b stores xmm0 and derives the next odd round key in xmm2.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

# ---- 256-bit key, pshufb+aesenclast path ------------------------------
&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# byte-shuffle mask
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# starts at 1, doubled per round
	&mov		($rounds,7);			# loop counter
	&movdqu		(&QWP(-32,$key),"xmm0");	# round 0
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");	# round 1

&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));	# last even key needs no odd partner

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");	# zero round key: only the aesenclast byte transform is applied

	&movdqa		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

# ---- common exits -----------------------------------------------------
&set_label("good_key");
	&pxor		("xmm0","xmm0");	# scrub key material from registers
	&pxor		("xmm1","xmm1");
	&pxor		("xmm2","xmm2");
	&pxor		("xmm3","xmm3");
	&pxor		("xmm4","xmm4");
	&pxor		("xmm5","xmm5");
	&xor		("eax","eax");		# return 0
	&pop		("ebx");
	&pop		("ebp");
	&ret		();

&set_label("bad_pointer",4);
	&mov		("eax",-1);
	&pop		("ebx");
	&pop		("ebp");
	&ret		();
&set_label("bad_keybits",4);
	&pxor		("xmm0","xmm0");	# first 128 bits of userKey were already loaded
	&mov		("eax",-2);
	&pop		("ebx");
	&pop		("ebp");
	&ret		();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
# Loads the three stack arguments into the registers expected by
# _aesni_set_encrypt_key and returns its result in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
# Builds the encryption schedule, then converts it in place: the order
# of the round keys is reversed and aesimc is applied to every round key
# except the first and the last. A non-zero result from
# _aesni_set_encrypt_key is returned unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));
	&shl	($rounds,4);		# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor		("xmm0","xmm0");	# scrub key material from registers
	&pxor		("xmm1","xmm1");
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");

# Constants for the "_alt" key-schedule paths, addressed relative to "ebx":
#	0x00	pshufb mask used by the 128- and 256-bit paths
#	0x10	pshufb mask used by the 192-bit path
#	0x20	1 in every dword, loaded into xmm4 and doubled each round
#	0x30	0x1b in every dword, loaded for the last two 128-bit rounds
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";