#! /usr/bin/env perl
# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# November 2015
#
# Add aesni_ocb_[en|de]crypt.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable both from this script's own directory and
# from ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, if present, names the output file.
# Use three-argument open so the name is never parsed for a mode, and
# fail loudly instead of silently writing the generated code to the
# inherited STDOUT when the file cannot be created.
$output = pop;
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# First remaining argument is handed to x86asm.pl (presumably the
# assembler flavour - see x86asm.pl).
&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys. Both branches currently select
# movups.
if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
#
# The instructions are emitted as raw bytes via data_byte. Only the
# register-to-register form with xmm0-xmm7 operands can be encoded
# here; anything else is a bug in the caller, so die instead of
# silently emitting nothing.

# aeskeygenassist dst,src,imm: 66 0F 3A DF <ModRM> <imm>
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'";	}
}
# Common encoder for the two-operand instructions below:
# 66 0F 38 <opcodelet> <ModRM>, ModRM = 0xc0|dst<<3|src.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
    else
    {	die "aescommon: unsupported operands '$dst','$src'";	}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# $p is "enc" or "dec"; $inout defaults to $inout0. When $ivec is
# given it is xored with round key 0 and then into $inout, otherwise
# round key 0 is xored into $inout directly. The emitted code advances
# $key and counts $rounds down to zero. $sn makes the loop label
# unique per expansion.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # Resolve aes[enc|dec] and aes[enc|dec]last once, as code refs, so
  # that a failure inside them is not swallowed by a string eval.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    &$aeslast		($inout,$rndkey1);
}}

# Emits _aesni_[en|de]crypt1 as a subroutine. $rounds is compared with
# 11: below jumps to the "${p}128" label, equal jumps to "${p}192",
# above falls through the extra rounds first. The lea instructions
# between cmp and je leave the flags untouched.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x10,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x30,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x50,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x60,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x70,$key));
	&$aes		($inout,$rndkey1);
	&$aeslast	($inout,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Loads one 16-byte block from inp, takes the round count from offset
# 240 of *key, encrypts and stores the result to out. xmm0-xmm2 are
# zeroed before returning.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Mirror image of $PREFIX_encrypt using the "dec" primitive.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
#
# In each generator below $p is "enc" or "dec". aes[enc|dec] and
# aes[enc|dec]last are resolved once into code references rather than
# being invoked through string eval, so that an error raised while
# encoding an instruction aborts generation instead of silently
# dropping the instruction.

# Emits _aesni_[en|de]crypt2: processes $inout0 and $inout1.
sub aesni_generate2
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt2");
}

# Emits _aesni_[en|de]crypt3: processes $inout0..$inout2.
sub aesni_generate3
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}4_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt4");
}

# Emits _aesni_[en|de]crypt6: processes $inout0..$inout5. The first
# round of the first three blocks is interleaved with the initial key
# xor, after which control jumps into the middle of the loop body at
# the "_inner" label. "_aesni_${p}rypt6_enter" is declared as a static
# label.
sub aesni_generate6
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
    &set_label("_aesni_${p}rypt6_inner");
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$aes		($inout4,$rndkey0);
	&$aes		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}6_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&$aeslast	($inout4,$rndkey0);
	&$aeslast	($inout5,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The "enc" variants are generated only when $PREFIX is "aesni"; the
# "dec" variants are generated unconditionally.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is done if the
# result is zero. A zero enc selects the decrypt path. Data is
# processed 0x60 bytes (six blocks) at a time, with the remaining one
# to five blocks dispatched to the 1x/2x/3x/4x/6x subroutines. All
# eight XMM registers are zeroed on exit.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

# Store the previous six results while loading the next six inputs.
&set_label("ecb_enc_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

# One to five blocks remain; load only as many as are present.
&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

# Last case falls through into ecb_ret.
&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec!
# Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both routines below share one stack frame layout (offsets from the
# 16-byte aligned %esp):
#	0	pshufb byte-swap control mask
#	16	counter increment vector: 1,0,0,0
#	48	saved original %esp
#
# $cmac is a lexical alias for $inout1, valid for both functions in
# this scope.
{ my $cmac=$inout1;

# aesni_ccm64_encrypt_blocks
#
# Arguments by &wparam index: 0 in, 1 out, 2 blocks, 3 key, 4 ivec,
# 5 cmac. For every 16-byte block the counter block and the running
# CMAC are pushed through the same key schedule in one interleaved
# pass; the encrypted counter is then XORed into the input block.
# The updated CMAC is stored back through argument 5, *ivec is only
# read.
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, register reused below
	&mov	($rounds,&wparam(5));		# cmac pointer, register reused below
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	# "twist" the round count: $key ends up at key+32+16*rounds and
	# $rounds_ at 16-16*rounds, so the inner loop can index the key
	# schedule with a negative offset that counts up to zero.
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));		# $key_ keeps the schedule base
	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);		# byte-swapped copy, incremented with paddq

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&movups	($in0,&QWP(0,$inp));

	&xorps	($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($rndkey0,$in0);
	&xorps	($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add	($rounds,32);			# sets the flags tested by jnz below
	&aesenc	($inout0,$rndkey0);
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz	(&label("ccm64_enc2_loop"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&paddq	($ivec,&QWP(16,"esp"));		# advance byte-swapped counter
	# the zero flag produced here is consumed by the jnz that closes
	# the outer loop; the SSE/AES/lea instructions in between leave
	# EFLAGS untouched
	&dec	($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);		# next counter block, swapped back
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	# %esp is restored before &wparam(5) is used again
	# (presumably &wparam is %esp-relative - it is defined elsewhere)
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");

# aesni_ccm64_decrypt_blocks
#
# Same arguments and stack frame as aesni_ccm64_encrypt_blocks. The
# first counter block is encrypted up front; after that each outer
# iteration emits one output block and, unless it was the last one,
# folds that output into the CMAC while encrypting the next counter
# block in the same interleaved pass. The CMAC update for the final
# block is done separately at ccm64_dec_break.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, register reused below
	&mov	($rounds,&wparam(5));		# cmac pointer, register reused below
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&pshufb	($ivec,$inout3);
	# encrypt the first counter block (presumably in $inout0 - the
	# single-block helper is defined outside this chunk)
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	# twist the round count as in the encrypt routine: $key becomes
	# key+32+16*rounds, $rounds_ becomes 16-16*rounds
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($in0,$rndkey0);
	&xorps	($inout0,$rndkey0);
	&xorps	($cmac,$in0);			# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add	($rounds,32);			# sets the flags tested by jnz below
	&aesenc	($inout0,$rndkey0);
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz	(&label("ccm64_dec2_loop"));
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea	($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));	# un-twisted round count
	&mov	($key,$key_);
	# final CMAC update using the last output block held in $in0.
	# NOTE(review): both helpers are defined outside this chunk;
	# confirm that the non-inline fallback really processes $cmac
	# combined with $in0 the way the inline call is asked to.
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec!
# (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp
#
# Arguments by &wparam index: 0 in, 1 out, 2 blocks, 3 key, 4 ivec.
# Control flow: a single block takes the ctr32_one_shortcut path; six
# or more blocks go through ctr32_loop6 six at a time; a remainder of
# one to five blocks is handled at ctr32_tail.

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, register reused below
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# save original %esp

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters:
	# $rndkey0 gets counter+0..2, $rndkey1 gets counter+3..5
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor	($inout0,$rndkey0);		# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor	($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout2,$rndkey0);
	&pxor	($inout3,$rndkey0);
	&aesenc	($inout0,$rndkey1);
	&pxor	($inout4,$rndkey0);
	&pxor	($inout5,$rndkey0);
	&aesenc	($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov	($rounds,$rounds_);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	# the first round was issued above, so enter the shared 6-block
	# routine at its inner entry label (defined outside this chunk)
	&call	(&label("_aesni_encrypt6_enter"));

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);			# undo the bias, leaves 0..5 blocks
	&jz	(&label("ctr32_ret"));
	# the slot at 32(%esp) holds ivec^key[0]; XORing key[0] back in
	# yields the plain counter-less ivec expected by the tail code
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	# dispatch on the remaining block count; each cmp feeds two
	# branches because the pshufd/por in between leave EFLAGS alone
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	# five blocks left: run the 6-block routine, only five results
	# are stored
	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	# exactly one block requested: use the caller's ivec as is
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

&function_begin("aesni_xts_encrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&and	("esp",-16);			# align stack

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
1149e1051a39Sopenharmony_ci &mov (&DWP(16*6+12,"esp"),0); 1150e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1151e1051a39Sopenharmony_ci &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1152e1051a39Sopenharmony_ci 1153e1051a39Sopenharmony_ci &movdqa ($tweak,$inout0); 1154e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1155e1051a39Sopenharmony_ci &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1156e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1157e1051a39Sopenharmony_ci 1158e1051a39Sopenharmony_ci &and ($len,-16); 1159e1051a39Sopenharmony_ci &mov ($key_,$key); # backup $key 1160e1051a39Sopenharmony_ci &mov ($rounds_,$rounds); # backup $rounds 1161e1051a39Sopenharmony_ci &sub ($len,16*6); 1162e1051a39Sopenharmony_ci &jc (&label("xts_enc_short")); 1163e1051a39Sopenharmony_ci 1164e1051a39Sopenharmony_ci &shl ($rounds,4); 1165e1051a39Sopenharmony_ci &mov ($rounds_,16); 1166e1051a39Sopenharmony_ci &sub ($rounds_,$rounds); 1167e1051a39Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); 1168e1051a39Sopenharmony_ci &jmp (&label("xts_enc_loop6")); 1169e1051a39Sopenharmony_ci 1170e1051a39Sopenharmony_ci&set_label("xts_enc_loop6",16); 1171e1051a39Sopenharmony_ci for ($i=0;$i<4;$i++) { 1172e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1173e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1174e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$tweak); 1175e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1176e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1177e1051a39Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1178e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1179e1051a39Sopenharmony_ci } 1180e1051a39Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 1181e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i++,"esp"),$tweak); 1182e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1183e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(0,$key_)); 
1184e1051a39Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 1185e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 1186e1051a39Sopenharmony_ci &pxor ($inout5,$tweak); 1187e1051a39Sopenharmony_ci 1188e1051a39Sopenharmony_ci # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1189e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 1190e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 1191e1051a39Sopenharmony_ci &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1192e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 1193e1051a39Sopenharmony_ci &pxor ($inout1,$rndkey0); 1194e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 1195e1051a39Sopenharmony_ci &pxor ($inout2,$rndkey0); 1196e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 1197e1051a39Sopenharmony_ci &pxor ($inout3,$rndkey0); 1198e1051a39Sopenharmony_ci &movdqu ($rndkey1,&QWP(16*5,$inp)); 1199e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); 1200e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 1201e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1202e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1203e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey1); 1204e1051a39Sopenharmony_ci 1205e1051a39Sopenharmony_ci &$movekey ($rndkey1,&QWP(16,$key_)); 1206e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 1207e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 1208e1051a39Sopenharmony_ci &aesenc ($inout0,$rndkey1); 1209e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 1210e1051a39Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 1211e1051a39Sopenharmony_ci &aesenc ($inout1,$rndkey1); 1212e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); 1213e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(32,$key_)); 1214e1051a39Sopenharmony_ci &aesenc ($inout2,$rndkey1); 1215e1051a39Sopenharmony_ci &aesenc ($inout3,$rndkey1); 1216e1051a39Sopenharmony_ci &aesenc 
($inout4,$rndkey1); 1217e1051a39Sopenharmony_ci &aesenc ($inout5,$rndkey1); 1218e1051a39Sopenharmony_ci &call (&label("_aesni_encrypt6_enter")); 1219e1051a39Sopenharmony_ci 1220e1051a39Sopenharmony_ci &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1221e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1222e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1223e1051a39Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1224e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1225e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1226e1051a39Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 1227e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1228e1051a39Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 1229e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1230e1051a39Sopenharmony_ci &xorps ($inout4,&QWP(16*4,"esp")); 1231e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1232e1051a39Sopenharmony_ci &xorps ($inout5,$tweak); 1233e1051a39Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 1234e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1235e1051a39Sopenharmony_ci &movups (&QWP(16*5,$out),$inout5); 1236e1051a39Sopenharmony_ci &lea ($out,&DWP(16*6,$out)); 1237e1051a39Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1238e1051a39Sopenharmony_ci 1239e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1240e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1241e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1242e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1243e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1244e1051a39Sopenharmony_ci 1245e1051a39Sopenharmony_ci &sub ($len,16*6); 1246e1051a39Sopenharmony_ci &jnc (&label("xts_enc_loop6")); 1247e1051a39Sopenharmony_ci 1248e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1249e1051a39Sopenharmony_ci &mov ($key,$key_); # 
restore $key 1250e1051a39Sopenharmony_ci &mov ($rounds_,$rounds); 1251e1051a39Sopenharmony_ci 1252e1051a39Sopenharmony_ci&set_label("xts_enc_short"); 1253e1051a39Sopenharmony_ci &add ($len,16*6); 1254e1051a39Sopenharmony_ci &jz (&label("xts_enc_done6x")); 1255e1051a39Sopenharmony_ci 1256e1051a39Sopenharmony_ci &movdqa ($inout3,$tweak); # put aside previous tweak 1257e1051a39Sopenharmony_ci &cmp ($len,0x20); 1258e1051a39Sopenharmony_ci &jb (&label("xts_enc_one")); 1259e1051a39Sopenharmony_ci 1260e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1261e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1262e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1263e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1264e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1265e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1266e1051a39Sopenharmony_ci &je (&label("xts_enc_two")); 1267e1051a39Sopenharmony_ci 1268e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1269e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1270e1051a39Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 1271e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1272e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1273e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1274e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1275e1051a39Sopenharmony_ci &cmp ($len,0x40); 1276e1051a39Sopenharmony_ci &jb (&label("xts_enc_three")); 1277e1051a39Sopenharmony_ci 1278e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1279e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1280e1051a39Sopenharmony_ci &movdqa ($inout5,$tweak); # put aside previous tweak 1281e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1282e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1283e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 
1284e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1285e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout3); 1286e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout4); 1287e1051a39Sopenharmony_ci &je (&label("xts_enc_four")); 1288e1051a39Sopenharmony_ci 1289e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout5); 1290e1051a39Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 1291e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$tweak); 1292e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($inout0,1); 1293e1051a39Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 1294e1051a39Sopenharmony_ci &pxor ($inout5,$tweak); 1295e1051a39Sopenharmony_ci 1296e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1297e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 1298e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 1299e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1300e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 1301e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 1302e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 1303e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 1304e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*5,$inp)); 1305e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 1306e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1307e1051a39Sopenharmony_ci &pxor ($inout4,$inout5); 1308e1051a39Sopenharmony_ci 1309e1051a39Sopenharmony_ci &call ("_aesni_encrypt6"); 1310e1051a39Sopenharmony_ci 1311e1051a39Sopenharmony_ci &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1312e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1313e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1314e1051a39Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 1315e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1316e1051a39Sopenharmony_ci &xorps 
($inout3,&QWP(16*3,"esp")); 1317e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1318e1051a39Sopenharmony_ci &xorps ($inout4,$tweak); 1319e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1320e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1321e1051a39Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 1322e1051a39Sopenharmony_ci &lea ($out,&DWP(16*5,$out)); 1323e1051a39Sopenharmony_ci &jmp (&label("xts_enc_done")); 1324e1051a39Sopenharmony_ci 1325e1051a39Sopenharmony_ci&set_label("xts_enc_one",16); 1326e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1327e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*1,$inp)); 1328e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1329e1051a39Sopenharmony_ci if ($inline) 1330e1051a39Sopenharmony_ci { &aesni_inline_generate1("enc"); } 1331e1051a39Sopenharmony_ci else 1332e1051a39Sopenharmony_ci { &call ("_aesni_encrypt1"); } 1333e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1334e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1335e1051a39Sopenharmony_ci &lea ($out,&DWP(16*1,$out)); 1336e1051a39Sopenharmony_ci 1337e1051a39Sopenharmony_ci &movdqa ($tweak,$inout3); # last tweak 1338e1051a39Sopenharmony_ci &jmp (&label("xts_enc_done")); 1339e1051a39Sopenharmony_ci 1340e1051a39Sopenharmony_ci&set_label("xts_enc_two",16); 1341e1051a39Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 1342e1051a39Sopenharmony_ci 1343e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1344e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1345e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*2,$inp)); 1346e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1347e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1348e1051a39Sopenharmony_ci 1349e1051a39Sopenharmony_ci &call ("_aesni_encrypt2"); 1350e1051a39Sopenharmony_ci 1351e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 
1352e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1353e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1354e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1355e1051a39Sopenharmony_ci &lea ($out,&DWP(16*2,$out)); 1356e1051a39Sopenharmony_ci 1357e1051a39Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 1358e1051a39Sopenharmony_ci &jmp (&label("xts_enc_done")); 1359e1051a39Sopenharmony_ci 1360e1051a39Sopenharmony_ci&set_label("xts_enc_three",16); 1361e1051a39Sopenharmony_ci &movaps ($inout5,$tweak); # put aside last tweak 1362e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1363e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1364e1051a39Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 1365e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*3,$inp)); 1366e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1367e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1368e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1369e1051a39Sopenharmony_ci 1370e1051a39Sopenharmony_ci &call ("_aesni_encrypt3"); 1371e1051a39Sopenharmony_ci 1372e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1373e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1374e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1375e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1376e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1377e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1378e1051a39Sopenharmony_ci &lea ($out,&DWP(16*3,$out)); 1379e1051a39Sopenharmony_ci 1380e1051a39Sopenharmony_ci &movdqa ($tweak,$inout5); # last tweak 1381e1051a39Sopenharmony_ci &jmp (&label("xts_enc_done")); 1382e1051a39Sopenharmony_ci 1383e1051a39Sopenharmony_ci&set_label("xts_enc_four",16); 1384e1051a39Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 1385e1051a39Sopenharmony_ci 1386e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 
1387e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1388e1051a39Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 1389e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1390e1051a39Sopenharmony_ci &movups ($inout3,&QWP(16*3,$inp)); 1391e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*4,$inp)); 1392e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1393e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1394e1051a39Sopenharmony_ci &xorps ($inout3,$inout4); 1395e1051a39Sopenharmony_ci 1396e1051a39Sopenharmony_ci &call ("_aesni_encrypt4"); 1397e1051a39Sopenharmony_ci 1398e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1399e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1400e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1401e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1402e1051a39Sopenharmony_ci &xorps ($inout3,$inout4); 1403e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1404e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1405e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1406e1051a39Sopenharmony_ci &lea ($out,&DWP(16*4,$out)); 1407e1051a39Sopenharmony_ci 1408e1051a39Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 1409e1051a39Sopenharmony_ci &jmp (&label("xts_enc_done")); 1410e1051a39Sopenharmony_ci 1411e1051a39Sopenharmony_ci&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1412e1051a39Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1413e1051a39Sopenharmony_ci &and ($len,15); 1414e1051a39Sopenharmony_ci &jz (&label("xts_enc_ret")); 1415e1051a39Sopenharmony_ci &movdqa ($inout3,$tweak); 1416e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1417e1051a39Sopenharmony_ci &jmp (&label("xts_enc_steal")); 1418e1051a39Sopenharmony_ci 1419e1051a39Sopenharmony_ci&set_label("xts_enc_done",16); 1420e1051a39Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 
1421e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1422e1051a39Sopenharmony_ci &and ($len,15); 1423e1051a39Sopenharmony_ci &jz (&label("xts_enc_ret")); 1424e1051a39Sopenharmony_ci 1425e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1426e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1427e1051a39Sopenharmony_ci &pshufd ($inout3,$twtmp,0x13); 1428e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1429e1051a39Sopenharmony_ci &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1430e1051a39Sopenharmony_ci &pxor ($inout3,$tweak); 1431e1051a39Sopenharmony_ci 1432e1051a39Sopenharmony_ci&set_label("xts_enc_steal"); 1433e1051a39Sopenharmony_ci &movz ($rounds,&BP(0,$inp)); 1434e1051a39Sopenharmony_ci &movz ($key,&BP(-16,$out)); 1435e1051a39Sopenharmony_ci &lea ($inp,&DWP(1,$inp)); 1436e1051a39Sopenharmony_ci &mov (&BP(-16,$out),&LB($rounds)); 1437e1051a39Sopenharmony_ci &mov (&BP(0,$out),&LB($key)); 1438e1051a39Sopenharmony_ci &lea ($out,&DWP(1,$out)); 1439e1051a39Sopenharmony_ci &sub ($len,1); 1440e1051a39Sopenharmony_ci &jnz (&label("xts_enc_steal")); 1441e1051a39Sopenharmony_ci 1442e1051a39Sopenharmony_ci &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1443e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 1444e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 1445e1051a39Sopenharmony_ci 1446e1051a39Sopenharmony_ci &movups ($inout0,&QWP(-16,$out)); # load input 1447e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1448e1051a39Sopenharmony_ci if ($inline) 1449e1051a39Sopenharmony_ci { &aesni_inline_generate1("enc"); } 1450e1051a39Sopenharmony_ci else 1451e1051a39Sopenharmony_ci { &call ("_aesni_encrypt1"); } 1452e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1453e1051a39Sopenharmony_ci &movups (&QWP(-16,$out),$inout0); # write output 1454e1051a39Sopenharmony_ci 1455e1051a39Sopenharmony_ci&set_label("xts_enc_ret"); 1456e1051a39Sopenharmony_ci &pxor 
("xmm0","xmm0"); # clear register bank 1457e1051a39Sopenharmony_ci &pxor ("xmm1","xmm1"); 1458e1051a39Sopenharmony_ci &pxor ("xmm2","xmm2"); 1459e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1460e1051a39Sopenharmony_ci &pxor ("xmm3","xmm3"); 1461e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),"xmm0"); 1462e1051a39Sopenharmony_ci &pxor ("xmm4","xmm4"); 1463e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),"xmm0"); 1464e1051a39Sopenharmony_ci &pxor ("xmm5","xmm5"); 1465e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),"xmm0"); 1466e1051a39Sopenharmony_ci &pxor ("xmm6","xmm6"); 1467e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),"xmm0"); 1468e1051a39Sopenharmony_ci &pxor ("xmm7","xmm7"); 1469e1051a39Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),"xmm0"); 1470e1051a39Sopenharmony_ci &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1471e1051a39Sopenharmony_ci&function_end("aesni_xts_encrypt"); 1472e1051a39Sopenharmony_ci 1473e1051a39Sopenharmony_ci&function_begin("aesni_xts_decrypt"); 1474e1051a39Sopenharmony_ci &mov ($key,&wparam(4)); # key2 1475e1051a39Sopenharmony_ci &mov ($inp,&wparam(5)); # clear-text tweak 1476e1051a39Sopenharmony_ci 1477e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); # key2->rounds 1478e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); 1479e1051a39Sopenharmony_ci if ($inline) 1480e1051a39Sopenharmony_ci { &aesni_inline_generate1("enc"); } 1481e1051a39Sopenharmony_ci else 1482e1051a39Sopenharmony_ci { &call ("_aesni_encrypt1"); } 1483e1051a39Sopenharmony_ci 1484e1051a39Sopenharmony_ci &mov ($inp,&wparam(0)); 1485e1051a39Sopenharmony_ci &mov ($out,&wparam(1)); 1486e1051a39Sopenharmony_ci &mov ($len,&wparam(2)); 1487e1051a39Sopenharmony_ci &mov ($key,&wparam(3)); # key1 1488e1051a39Sopenharmony_ci 1489e1051a39Sopenharmony_ci &mov ($key_,"esp"); 1490e1051a39Sopenharmony_ci &sub ("esp",16*7+8); 1491e1051a39Sopenharmony_ci &and ("esp",-16); # align stack 1492e1051a39Sopenharmony_ci 1493e1051a39Sopenharmony_ci &xor 
($rounds_,$rounds_); # if(len%16) len-=16; 1494e1051a39Sopenharmony_ci &test ($len,15); 1495e1051a39Sopenharmony_ci &setnz (&LB($rounds_)); 1496e1051a39Sopenharmony_ci &shl ($rounds_,4); 1497e1051a39Sopenharmony_ci &sub ($len,$rounds_); 1498e1051a39Sopenharmony_ci 1499e1051a39Sopenharmony_ci &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1500e1051a39Sopenharmony_ci &mov (&DWP(16*6+4,"esp"),0); 1501e1051a39Sopenharmony_ci &mov (&DWP(16*6+8,"esp"),1); 1502e1051a39Sopenharmony_ci &mov (&DWP(16*6+12,"esp"),0); 1503e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1504e1051a39Sopenharmony_ci &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1505e1051a39Sopenharmony_ci 1506e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); # key1->rounds 1507e1051a39Sopenharmony_ci &mov ($key_,$key); # backup $key 1508e1051a39Sopenharmony_ci &mov ($rounds_,$rounds); # backup $rounds 1509e1051a39Sopenharmony_ci 1510e1051a39Sopenharmony_ci &movdqa ($tweak,$inout0); 1511e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1512e1051a39Sopenharmony_ci &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1513e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1514e1051a39Sopenharmony_ci 1515e1051a39Sopenharmony_ci &and ($len,-16); 1516e1051a39Sopenharmony_ci &sub ($len,16*6); 1517e1051a39Sopenharmony_ci &jc (&label("xts_dec_short")); 1518e1051a39Sopenharmony_ci 1519e1051a39Sopenharmony_ci &shl ($rounds,4); 1520e1051a39Sopenharmony_ci &mov ($rounds_,16); 1521e1051a39Sopenharmony_ci &sub ($rounds_,$rounds); 1522e1051a39Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); 1523e1051a39Sopenharmony_ci &jmp (&label("xts_dec_loop6")); 1524e1051a39Sopenharmony_ci 1525e1051a39Sopenharmony_ci&set_label("xts_dec_loop6",16); 1526e1051a39Sopenharmony_ci for ($i=0;$i<4;$i++) { 1527e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1528e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1529e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$tweak); 
1530e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1531e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1532e1051a39Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1533e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1534e1051a39Sopenharmony_ci } 1535e1051a39Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 1536e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i++,"esp"),$tweak); 1537e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1538e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(0,$key_)); 1539e1051a39Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 1540e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 1541e1051a39Sopenharmony_ci &pxor ($inout5,$tweak); 1542e1051a39Sopenharmony_ci 1543e1051a39Sopenharmony_ci # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1544e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); 1545e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 1546e1051a39Sopenharmony_ci &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1547e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 1548e1051a39Sopenharmony_ci &pxor ($inout1,$rndkey0); 1549e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 1550e1051a39Sopenharmony_ci &pxor ($inout2,$rndkey0); 1551e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 1552e1051a39Sopenharmony_ci &pxor ($inout3,$rndkey0); 1553e1051a39Sopenharmony_ci &movdqu ($rndkey1,&QWP(16*5,$inp)); 1554e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); 1555e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 1556e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1557e1051a39Sopenharmony_ci &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1558e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey1); 1559e1051a39Sopenharmony_ci 1560e1051a39Sopenharmony_ci &$movekey ($rndkey1,&QWP(16,$key_)); 1561e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 
1562e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 1563e1051a39Sopenharmony_ci &aesdec ($inout0,$rndkey1); 1564e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 1565e1051a39Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 1566e1051a39Sopenharmony_ci &aesdec ($inout1,$rndkey1); 1567e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); 1568e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(32,$key_)); 1569e1051a39Sopenharmony_ci &aesdec ($inout2,$rndkey1); 1570e1051a39Sopenharmony_ci &aesdec ($inout3,$rndkey1); 1571e1051a39Sopenharmony_ci &aesdec ($inout4,$rndkey1); 1572e1051a39Sopenharmony_ci &aesdec ($inout5,$rndkey1); 1573e1051a39Sopenharmony_ci &call (&label("_aesni_decrypt6_enter")); 1574e1051a39Sopenharmony_ci 1575e1051a39Sopenharmony_ci &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1576e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1577e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1578e1051a39Sopenharmony_ci &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1579e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1580e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1581e1051a39Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 1582e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1583e1051a39Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 1584e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1585e1051a39Sopenharmony_ci &xorps ($inout4,&QWP(16*4,"esp")); 1586e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1587e1051a39Sopenharmony_ci &xorps ($inout5,$tweak); 1588e1051a39Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 1589e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1590e1051a39Sopenharmony_ci &movups (&QWP(16*5,$out),$inout5); 1591e1051a39Sopenharmony_ci &lea ($out,&DWP(16*6,$out)); 1592e1051a39Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1593e1051a39Sopenharmony_ci 1594e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 
1595e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1596e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1597e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1598e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1599e1051a39Sopenharmony_ci 1600e1051a39Sopenharmony_ci &sub ($len,16*6); 1601e1051a39Sopenharmony_ci &jnc (&label("xts_dec_loop6")); 1602e1051a39Sopenharmony_ci 1603e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1604e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 1605e1051a39Sopenharmony_ci &mov ($rounds_,$rounds); 1606e1051a39Sopenharmony_ci 1607e1051a39Sopenharmony_ci&set_label("xts_dec_short"); 1608e1051a39Sopenharmony_ci &add ($len,16*6); 1609e1051a39Sopenharmony_ci &jz (&label("xts_dec_done6x")); 1610e1051a39Sopenharmony_ci 1611e1051a39Sopenharmony_ci &movdqa ($inout3,$tweak); # put aside previous tweak 1612e1051a39Sopenharmony_ci &cmp ($len,0x20); 1613e1051a39Sopenharmony_ci &jb (&label("xts_dec_one")); 1614e1051a39Sopenharmony_ci 1615e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1616e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1617e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1618e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1619e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1620e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1621e1051a39Sopenharmony_ci &je (&label("xts_dec_two")); 1622e1051a39Sopenharmony_ci 1623e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1624e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1625e1051a39Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 1626e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1627e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1628e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1629e1051a39Sopenharmony_ci &pxor 
($tweak,$twres); 1630e1051a39Sopenharmony_ci &cmp ($len,0x40); 1631e1051a39Sopenharmony_ci &jb (&label("xts_dec_three")); 1632e1051a39Sopenharmony_ci 1633e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1634e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1635e1051a39Sopenharmony_ci &movdqa ($inout5,$tweak); # put aside previous tweak 1636e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1637e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1638e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1639e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1640e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout3); 1641e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout4); 1642e1051a39Sopenharmony_ci &je (&label("xts_dec_four")); 1643e1051a39Sopenharmony_ci 1644e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout5); 1645e1051a39Sopenharmony_ci &pshufd ($inout5,$twtmp,0x13); 1646e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$tweak); 1647e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($inout0,1); 1648e1051a39Sopenharmony_ci &pand ($inout5,$twmask); # isolate carry and residue 1649e1051a39Sopenharmony_ci &pxor ($inout5,$tweak); 1650e1051a39Sopenharmony_ci 1651e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1652e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 1653e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 1654e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1655e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 1656e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 1657e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 1658e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 1659e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*5,$inp)); 1660e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 1661e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 
1662e1051a39Sopenharmony_ci &pxor ($inout4,$inout5); 1663e1051a39Sopenharmony_ci 1664e1051a39Sopenharmony_ci &call ("_aesni_decrypt6"); 1665e1051a39Sopenharmony_ci 1666e1051a39Sopenharmony_ci &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1667e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1668e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1669e1051a39Sopenharmony_ci &xorps ($inout2,&QWP(16*2,"esp")); 1670e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1671e1051a39Sopenharmony_ci &xorps ($inout3,&QWP(16*3,"esp")); 1672e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1673e1051a39Sopenharmony_ci &xorps ($inout4,$tweak); 1674e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1675e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1676e1051a39Sopenharmony_ci &movups (&QWP(16*4,$out),$inout4); 1677e1051a39Sopenharmony_ci &lea ($out,&DWP(16*5,$out)); 1678e1051a39Sopenharmony_ci &jmp (&label("xts_dec_done")); 1679e1051a39Sopenharmony_ci 1680e1051a39Sopenharmony_ci&set_label("xts_dec_one",16); 1681e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1682e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*1,$inp)); 1683e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1684e1051a39Sopenharmony_ci if ($inline) 1685e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 1686e1051a39Sopenharmony_ci else 1687e1051a39Sopenharmony_ci { &call ("_aesni_decrypt1"); } 1688e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1689e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1690e1051a39Sopenharmony_ci &lea ($out,&DWP(16*1,$out)); 1691e1051a39Sopenharmony_ci 1692e1051a39Sopenharmony_ci &movdqa ($tweak,$inout3); # last tweak 1693e1051a39Sopenharmony_ci &jmp (&label("xts_dec_done")); 1694e1051a39Sopenharmony_ci 1695e1051a39Sopenharmony_ci&set_label("xts_dec_two",16); 1696e1051a39Sopenharmony_ci &movaps ($inout4,$tweak); # put 
aside last tweak 1697e1051a39Sopenharmony_ci 1698e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1699e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1700e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*2,$inp)); 1701e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1702e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1703e1051a39Sopenharmony_ci 1704e1051a39Sopenharmony_ci &call ("_aesni_decrypt2"); 1705e1051a39Sopenharmony_ci 1706e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1707e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1708e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1709e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1710e1051a39Sopenharmony_ci &lea ($out,&DWP(16*2,$out)); 1711e1051a39Sopenharmony_ci 1712e1051a39Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 1713e1051a39Sopenharmony_ci &jmp (&label("xts_dec_done")); 1714e1051a39Sopenharmony_ci 1715e1051a39Sopenharmony_ci&set_label("xts_dec_three",16); 1716e1051a39Sopenharmony_ci &movaps ($inout5,$tweak); # put aside last tweak 1717e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1718e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1719e1051a39Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 1720e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*3,$inp)); 1721e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1722e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1723e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1724e1051a39Sopenharmony_ci 1725e1051a39Sopenharmony_ci &call ("_aesni_decrypt3"); 1726e1051a39Sopenharmony_ci 1727e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1728e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 1729e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1730e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1731e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 
1732e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1733e1051a39Sopenharmony_ci &lea ($out,&DWP(16*3,$out)); 1734e1051a39Sopenharmony_ci 1735e1051a39Sopenharmony_ci &movdqa ($tweak,$inout5); # last tweak 1736e1051a39Sopenharmony_ci &jmp (&label("xts_dec_done")); 1737e1051a39Sopenharmony_ci 1738e1051a39Sopenharmony_ci&set_label("xts_dec_four",16); 1739e1051a39Sopenharmony_ci &movaps ($inout4,$tweak); # put aside last tweak 1740e1051a39Sopenharmony_ci 1741e1051a39Sopenharmony_ci &movups ($inout0,&QWP(16*0,$inp)); # load input 1742e1051a39Sopenharmony_ci &movups ($inout1,&QWP(16*1,$inp)); 1743e1051a39Sopenharmony_ci &movups ($inout2,&QWP(16*2,$inp)); 1744e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1745e1051a39Sopenharmony_ci &movups ($inout3,&QWP(16*3,$inp)); 1746e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*4,$inp)); 1747e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1748e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1749e1051a39Sopenharmony_ci &xorps ($inout3,$inout4); 1750e1051a39Sopenharmony_ci 1751e1051a39Sopenharmony_ci &call ("_aesni_decrypt4"); 1752e1051a39Sopenharmony_ci 1753e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1754e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 1755e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 1756e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out),$inout0); # write output 1757e1051a39Sopenharmony_ci &xorps ($inout3,$inout4); 1758e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out),$inout1); 1759e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out),$inout2); 1760e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out),$inout3); 1761e1051a39Sopenharmony_ci &lea ($out,&DWP(16*4,$out)); 1762e1051a39Sopenharmony_ci 1763e1051a39Sopenharmony_ci &movdqa ($tweak,$inout4); # last tweak 1764e1051a39Sopenharmony_ci &jmp (&label("xts_dec_done")); 1765e1051a39Sopenharmony_ci 1766e1051a39Sopenharmony_ci&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 
1767e1051a39Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1768e1051a39Sopenharmony_ci &and ($len,15); 1769e1051a39Sopenharmony_ci &jz (&label("xts_dec_ret")); 1770e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1771e1051a39Sopenharmony_ci &jmp (&label("xts_dec_only_one_more")); 1772e1051a39Sopenharmony_ci 1773e1051a39Sopenharmony_ci&set_label("xts_dec_done",16); 1774e1051a39Sopenharmony_ci &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1775e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1776e1051a39Sopenharmony_ci &and ($len,15); 1777e1051a39Sopenharmony_ci &jz (&label("xts_dec_ret")); 1778e1051a39Sopenharmony_ci 1779e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1780e1051a39Sopenharmony_ci &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1781e1051a39Sopenharmony_ci &pshufd ($twres,$twtmp,0x13); 1782e1051a39Sopenharmony_ci &pxor ($twtmp,$twtmp); 1783e1051a39Sopenharmony_ci &movdqa ($twmask,&QWP(16*6,"esp")); 1784e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1785e1051a39Sopenharmony_ci &pand ($twres,$twmask); # isolate carry and residue 1786e1051a39Sopenharmony_ci &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1787e1051a39Sopenharmony_ci &pxor ($tweak,$twres); 1788e1051a39Sopenharmony_ci 1789e1051a39Sopenharmony_ci&set_label("xts_dec_only_one_more"); 1790e1051a39Sopenharmony_ci &pshufd ($inout3,$twtmp,0x13); 1791e1051a39Sopenharmony_ci &movdqa ($inout4,$tweak); # put aside previous tweak 1792e1051a39Sopenharmony_ci &paddq ($tweak,$tweak); # &psllq($tweak,1); 1793e1051a39Sopenharmony_ci &pand ($inout3,$twmask); # isolate carry and residue 1794e1051a39Sopenharmony_ci &pxor ($inout3,$tweak); 1795e1051a39Sopenharmony_ci 1796e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 1797e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 1798e1051a39Sopenharmony_ci 1799e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); # load input 
1800e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # input^=tweak 1801e1051a39Sopenharmony_ci if ($inline) 1802e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 1803e1051a39Sopenharmony_ci else 1804e1051a39Sopenharmony_ci { &call ("_aesni_decrypt1"); } 1805e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # output^=tweak 1806e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); # write output 1807e1051a39Sopenharmony_ci 1808e1051a39Sopenharmony_ci&set_label("xts_dec_steal"); 1809e1051a39Sopenharmony_ci &movz ($rounds,&BP(16,$inp)); 1810e1051a39Sopenharmony_ci &movz ($key,&BP(0,$out)); 1811e1051a39Sopenharmony_ci &lea ($inp,&DWP(1,$inp)); 1812e1051a39Sopenharmony_ci &mov (&BP(0,$out),&LB($rounds)); 1813e1051a39Sopenharmony_ci &mov (&BP(16,$out),&LB($key)); 1814e1051a39Sopenharmony_ci &lea ($out,&DWP(1,$out)); 1815e1051a39Sopenharmony_ci &sub ($len,1); 1816e1051a39Sopenharmony_ci &jnz (&label("xts_dec_steal")); 1817e1051a39Sopenharmony_ci 1818e1051a39Sopenharmony_ci &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1819e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 1820e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 1821e1051a39Sopenharmony_ci 1822e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$out)); # load input 1823e1051a39Sopenharmony_ci &xorps ($inout0,$inout4); # input^=tweak 1824e1051a39Sopenharmony_ci if ($inline) 1825e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 1826e1051a39Sopenharmony_ci else 1827e1051a39Sopenharmony_ci { &call ("_aesni_decrypt1"); } 1828e1051a39Sopenharmony_ci &xorps ($inout0,$inout4); # output^=tweak 1829e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); # write output 1830e1051a39Sopenharmony_ci 1831e1051a39Sopenharmony_ci&set_label("xts_dec_ret"); 1832e1051a39Sopenharmony_ci &pxor ("xmm0","xmm0"); # clear register bank 1833e1051a39Sopenharmony_ci &pxor ("xmm1","xmm1"); 1834e1051a39Sopenharmony_ci &pxor ("xmm2","xmm2"); 1835e1051a39Sopenharmony_ci &movdqa 
(&QWP(16*0,"esp"),"xmm0"); # clear stack 1836e1051a39Sopenharmony_ci &pxor ("xmm3","xmm3"); 1837e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),"xmm0"); 1838e1051a39Sopenharmony_ci &pxor ("xmm4","xmm4"); 1839e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),"xmm0"); 1840e1051a39Sopenharmony_ci &pxor ("xmm5","xmm5"); 1841e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),"xmm0"); 1842e1051a39Sopenharmony_ci &pxor ("xmm6","xmm6"); 1843e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),"xmm0"); 1844e1051a39Sopenharmony_ci &pxor ("xmm7","xmm7"); 1845e1051a39Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),"xmm0"); 1846e1051a39Sopenharmony_ci &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1847e1051a39Sopenharmony_ci&function_end("aesni_xts_decrypt"); 1848e1051a39Sopenharmony_ci} 1849e1051a39Sopenharmony_ci 1850e1051a39Sopenharmony_ci###################################################################### 1851e1051a39Sopenharmony_ci# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 1852e1051a39Sopenharmony_ci# const AES_KEY *key, unsigned int start_block_num, 1853e1051a39Sopenharmony_ci# unsigned char offset_i[16], const unsigned char L_[][16], 1854e1051a39Sopenharmony_ci# unsigned char checksum[16]); 1855e1051a39Sopenharmony_ci# 1856e1051a39Sopenharmony_ci{ 1857e1051a39Sopenharmony_ci# offsets within stack frame 1858e1051a39Sopenharmony_cimy $checksum = 16*6; 1859e1051a39Sopenharmony_cimy ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); 1860e1051a39Sopenharmony_ci 1861e1051a39Sopenharmony_ci# reassigned registers 1862e1051a39Sopenharmony_cimy ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); 1863e1051a39Sopenharmony_ci# $l_, $block, $inp, $key are permanently allocated in registers; 1864e1051a39Sopenharmony_ci# remaining non-volatile ones are offloaded to the stack, where they even 1865e1051a39Sopenharmony_ci# stay invariant once written.

# aesni_ocb_encrypt -- OCB-mode bulk encryption of whole 16-byte blocks.
#
# Stack arguments, as read through &wparam() below:
#   0 inp, 1 out, 2 number of blocks, 3 key, 4 start_block_num,
#   5 &offset_i, 6 L_ table, 7 &checksum.
# The running offset and checksum are loaded from arguments 5 and 7 on
# entry and written back through the same pointers at "done".
#
# Register aliases (from the "reassigned registers" declaration that
# precedes this routine): $l_ is $rounds_, $block is $key_, $i1 is
# $rounds, $i3 is $len and $i5 is $out.  Any use of $i1/$i3/$i5
# therefore destroys $rounds/$len/$out, which is why those values are
# parked in the frame and reloaded right before they are needed.
#
# Frame layout relative to the aligned %esp:
#   16*0 .. 16*5   per-block offsets of the batch in flight
#   $checksum      running checksum while xmm registers are busy
#   $key_off       original key pointer
#   $rounds_off    "twisted" rounds value, 16-16*rounds
#   $out_off       out-inp, so stores address output as ($out,$inp)
#   $end_off       inp + 16*blocks - 16*6
#   $esp_off       caller's %esp
#
# Per block the code computes offset = previous offset ^ L_[bsf(i)<<4],
# folds the plaintext into the checksum, and emits
# E(plaintext ^ offset) ^ offset.
&function_begin("aesni_ocb_encrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	&mov	($rounds,"esp");		# remember caller's %esp
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	&sub	($out,$inp);			# keep out as delta from inp
	&shl	($len,4);			# blocks -> bytes
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
	&mov	(&DWP($out_off,"esp"),$out);
	&mov	(&DWP($end_off,"esp"),$len);
	&mov	(&DWP($esp_off,"esp"),$rounds);

	&mov	($rounds,&DWP(240,$key));	# rounds, from offset 240 of key

	&test	($block,1);
	&jnz	(&label("odd"));

	# start_block_num is even: process one block on its own so that
	# $block is odd from here on.  The multi-block paths below load
	# L_[0] for blocks +0/+2/+4 and run bsf only on +1/+3/+5.
	# NOTE(review): bsf leaves its destination undefined for a zero
	# source -- presumably start_block_num is never 0; confirm with
	# the caller.
	&bsf	($i3,$block);			# clobbers $len (same register)
	&add	($block,1);
	&shl	($i3,4);
	&movdqu	($inout5,&QWP(0,$l_,$i3));
	&mov	($i3,$key);			# put aside key

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16,$inp));

	&pxor	($inout5,$rndkey0);		# ^ last offset_i
	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout5);		# ^ offset_i

	&movdqa	($inout4,$rndkey1);		# park checksum across the call
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps	($inout0,$inout5);		# ^ offset_i
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout4);		# pass the checksum

	&movups	(&QWP(-16,$out,$inp),$inout0);	# store output

	&mov	($rounds,&DWP(240,$i3));
	&mov	($key,$i3);			# restore key
	&mov	($len,&DWP($end_off,"esp"));	# restore end pointer

&set_label("odd");
	&shl	($rounds,4);
	&mov	($out,16);
	&sub	($out,$rounds);			# twisted rounds
	&mov	(&DWP($key_off,"esp"),$key);
	&lea	($key,&DWP(32,$key,$rounds));	# end of key schedule
	&mov	(&DWP($rounds_off,"esp"),$out);

	&cmp	($inp,$len);
	&ja	(&label("short"));		# fewer than 6 blocks left
	&jmp	(&label("grandloop"));

# Six blocks per iteration.
&set_label("grandloop",32);
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&lea	($i5,&DWP(5,$block));
	&add	($block,6);
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&bsf	($i5,$i5);
	&shl	($i1,4);
	&shl	($i3,4);
	&shl	($i5,4);
	&movdqu	($inout0,&QWP(0,$l_));
	&movdqu	($inout1,&QWP(0,$l_,$i1));
	# $i1 shares a register with $rounds, so the reload below must
	# stay after the last use of $i1 above.
	&mov	($rounds,&DWP($rounds_off,"esp"));
	&movdqa	($inout2,$inout0);
	&movdqu	($inout3,&QWP(0,$l_,$i3));
	&movdqa	($inout4,$inout0);
	&movdqu	($inout5,&QWP(0,$l_,$i5));

	# chain the six offsets and spill them to the frame
	&pxor	($inout0,$rndkey0);		# ^ last offset_i
	&pxor	($inout1,$inout0);
	&movdqa	(&QWP(16*0,"esp"),$inout0);
	&pxor	($inout2,$inout1);
	&movdqa	(&QWP(16*1,"esp"),$inout1);
	&pxor	($inout3,$inout2);
	&movdqa	(&QWP(16*2,"esp"),$inout2);
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*3,"esp"),$inout3);
	&pxor	($inout5,$inout4);
	&movdqa	(&QWP(16*4,"esp"),$inout4);
	&movdqa	(&QWP(16*5,"esp"),$inout5);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&movdqu	($inout5,&QWP(16*5,$inp));
	&lea	($inp,&DWP(16*6,$inp));

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$rndkey0);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$rndkey0);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$rndkey0);
	&pxor	($rndkey1,$inout4);
	&pxor	($inout4,$rndkey0);
	&pxor	($rndkey1,$inout5);
	&pxor	($inout5,$rndkey0);
	&movdqa	(&QWP($checksum,"esp"),$rndkey1);	# $rndkey1 is reused below

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout5,&QWP(16*5,"esp"));

	# First aesenc is issued here and the following round key is
	# preloaded into $rndkey0 before entering the shared 6x body.
	# NOTE(review): presumably this matches the state expected at
	# _aesni_encrypt6_enter -- confirm against _aesni_encrypt6.
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($inout1,$rndkey1);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	&mov	($out,&DWP($out_off,"esp"));	# $i5 clobbered $out
	&mov	($len,&DWP($end_off,"esp"));	# $i3 clobbered $len
	&call	("_aesni_encrypt6_enter");

	&movdqa	($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout5,$rndkey0);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	&movdqu	(&QWP(-16*6,$out,$inp),$inout0);# store output
	&movdqu	(&QWP(-16*5,$out,$inp),$inout1);
	&movdqu	(&QWP(-16*4,$out,$inp),$inout2);
	&movdqu	(&QWP(-16*3,$out,$inp),$inout3);
	&movdqu	(&QWP(-16*2,$out,$inp),$inout4);
	&movdqu	(&QWP(-16*1,$out,$inp),$inout5);
	&cmp	($inp,$len);			# done yet?
	&jbe	(&label("grandloop"));

# Zero to five blocks remain; dispatch on the remaining byte count.
&set_label("short");
	&add	($len,16*6);
	&sub	($len,$inp);			# bytes left
	&jz	(&label("done"));

	&cmp	($len,16*2);
	&jb	(&label("one"));
	&je	(&label("two"));

	&cmp	($len,16*4);
	&jb	(&label("three"));
	&je	(&label("four"));

	# five blocks: reuse the 6x body with an all-zero sixth lane
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&shl	($i1,4);
	&shl	($i3,4);
	&movdqu	($inout0,&QWP(0,$l_));
	&movdqu	($inout1,&QWP(0,$l_,$i1));
	&mov	($rounds,&DWP($rounds_off,"esp"));	# after last use of $i1
	&movdqa	($inout2,$inout0);
	&movdqu	($inout3,&QWP(0,$l_,$i3));
	&movdqa	($inout4,$inout0);

	&pxor	($inout0,$rndkey0);		# ^ last offset_i
	&pxor	($inout1,$inout0);
	&movdqa	(&QWP(16*0,"esp"),$inout0);
	&pxor	($inout2,$inout1);
	&movdqa	(&QWP(16*1,"esp"),$inout1);
	&pxor	($inout3,$inout2);
	&movdqa	(&QWP(16*2,"esp"),$inout2);
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*3,"esp"),$inout3);
	&pxor	($inout5,$inout4);		# result unused, lane is zeroed below
	&movdqa	(&QWP(16*4,"esp"),$inout4);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout5,$inout5);

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$rndkey0);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$rndkey0);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$rndkey0);
	&pxor	($rndkey1,$inout4);
	&pxor	($inout4,$rndkey0);
	&movdqa	(&QWP($checksum,"esp"),$rndkey1);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($inout1,$rndkey1);
	&aesenc	($inout2,$rndkey1);
	&aesenc	($inout3,$rndkey1);
	&aesenc	($inout4,$rndkey1);
	&aesenc	($inout5,$rndkey1);

	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt6_enter");

	&movdqa	($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,$rndkey0);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	# $inp was not advanced on this path, hence positive displacements
	&movdqu	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movdqu	(&QWP(16*1,$out,$inp),$inout1);
	&movdqu	(&QWP(16*2,$out,$inp),$inout2);
	&movdqu	(&QWP(16*3,$out,$inp),$inout3);
	&movdqu	(&QWP(16*4,$out,$inp),$inout4);

	&jmp	(&label("done"));

&set_label("one",16);
	&movdqu	($inout5,&QWP(0,$l_));
	&mov	($key,&DWP($key_off,"esp"));	# restore key

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&mov	($rounds,&DWP(240,$key));

	&pxor	($inout5,$rndkey0);		# ^ last offset_i
	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout5);		# ^ offset_i

	&movdqa	($inout4,$rndkey1);		# park checksum across the call
	&mov	($out,&DWP($out_off,"esp"));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps	($inout0,$inout5);		# ^ offset_i
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout4);		# pass the checksum
	&movups	(&QWP(0,$out,$inp),$inout0);

	&jmp	(&label("done"));

&set_label("two",16);
	&lea	($i1,&DWP(1,$block));
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&bsf	($i1,$i1);
	&shl	($i1,4);
	&movdqu	($inout4,&QWP(0,$l_));
	&movdqu	($inout5,&QWP(0,$l_,$i1));

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&mov	($rounds,&DWP(240,$key));	# after last use of $i1

	&pxor	($inout4,$rndkey0);		# ^ last offset_i
	&pxor	($inout5,$inout4);

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout4);		# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$inout5);

	# Each emitter call is its own statement: without the ';' Perl
	# would read the next line's leading '&' as a bitwise-AND operator.
	&movdqa	($inout3,$rndkey1);		# park checksum across the call
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt2");

	&xorps	($inout0,$inout4);		# ^ offset_i
	&xorps	($inout1,$inout5);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,$inout3);		# pass the checksum
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups	(&QWP(16*1,$out,$inp),$inout1);

	&jmp	(&label("done"));

&set_label("three",16);
	&lea	($i1,&DWP(1,$block));
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&bsf	($i1,$i1);
	&shl	($i1,4);
	&movdqu	($inout3,&QWP(0,$l_));
	&movdqu	($inout4,&QWP(0,$l_,$i1));
	&movdqa	($inout5,$inout3);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&mov	($rounds,&DWP(240,$key));	# after last use of $i1

	&pxor	($inout3,$rndkey0);		# ^ last offset_i
	&pxor	($inout4,$inout3);
	&pxor	($inout5,$inout4);

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,$inout3);		# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,$inout4);
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$inout5);

	&movdqa	(&QWP($checksum,"esp"),$rndkey1);
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt3");

	&xorps	($inout0,$inout3);		# ^ offset_i
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups	(&QWP(16*1,$out,$inp),$inout1);
	&movups	(&QWP(16*2,$out,$inp),$inout2);

	&jmp	(&label("done"));

&set_label("four",16);
	&lea	($i1,&DWP(1,$block));
	&lea	($i3,&DWP(3,$block));
	&bsf	($i1,$i1);
	&bsf	($i3,$i3);
	&mov	($key,&DWP($key_off,"esp"));	# restore key
	&shl	($i1,4);
	&shl	($i3,4);
	&movdqu	($inout2,&QWP(0,$l_));
	&movdqu	($inout3,&QWP(0,$l_,$i1));
	&movdqa	($inout4,$inout2);
	&movdqu	($inout5,&QWP(0,$l_,$i3));

	# the first two offsets go to the frame because $inout2/$inout3
	# are needed for input; the last two stay in $inout4/$inout5
	&pxor	($inout2,$rndkey0);		# ^ last offset_i
	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&pxor	($inout3,$inout2);
	&movdqu	($inout1,&QWP(16*1,$inp));
	&pxor	($inout4,$inout3);
	&movdqa	(&QWP(16*0,"esp"),$inout2);
	&pxor	($inout5,$inout4);
	&movdqa	(&QWP(16*1,"esp"),$inout3);
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout3,&QWP(16*3,$inp));
	&mov	($rounds,&DWP(240,$key));	# after last use of $i1

	&pxor	($rndkey1,$inout0);		# checksum
	&pxor	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor	($rndkey1,$inout1);
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($rndkey1,$inout2);
	&pxor	($inout2,$inout4);
	&pxor	($rndkey1,$inout3);
	&pxor	($inout3,$inout5);

	# Terminated with ';' so that the following '&mov' is a separate
	# statement and not the right operand of a bitwise-AND.
	&movdqa	(&QWP($checksum,"esp"),$rndkey1);
	&mov	($out,&DWP($out_off,"esp"));
	&call	("_aesni_encrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout4);
	&movups	(&QWP(16*0,$out,$inp),$inout0);	# store output
	&xorps	($inout3,$inout5);
	&movups	(&QWP(16*1,$out,$inp),$inout1);
	&movdqa	($rndkey0,$inout5);		# pass last offset_i
	&movups	(&QWP(16*2,$out,$inp),$inout2);
	&movdqa	($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups	(&QWP(16*3,$out,$inp),$inout3);

# Common exit: wipe working registers and the frame, restore %esp,
# then hand the final offset and checksum back to the caller.
&set_label("done");
	&mov	($key,&DWP($esp_off,"esp"));	# caller's %esp
	&pxor	($inout0,$inout0);		# clear register bank
	&pxor	($inout1,$inout1);
	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
	&pxor	($inout2,$inout2);
	&movdqa	(&QWP(16*1,"esp"),$inout0);
	&pxor	($inout3,$inout3);
	&movdqa	(&QWP(16*2,"esp"),$inout0);
	&pxor	($inout4,$inout4);
	&movdqa	(&QWP(16*3,"esp"),$inout0);
	&pxor	($inout5,$inout5);
	&movdqa	(&QWP(16*4,"esp"),$inout0);
	&movdqa	(&QWP(16*5,"esp"),$inout0);
	&movdqa	(&QWP(16*6,"esp"),$inout0);

	&lea	("esp",&DWP(0,$key));
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum
	&movdqu	(&QWP(0,$rounds),$rndkey0);
	&pxor	($rndkey0,$rndkey0);
	&movdqu	(&QWP(0,$rounds_),$rndkey1);
	&pxor	($rndkey1,$rndkey1);
&function_end("aesni_ocb_encrypt");

# aesni_ocb_decrypt: entry sequence (the remainder of the routine follows).
&function_begin("aesni_ocb_decrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	&mov	($rounds,"esp");
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	&sub	($out,$inp);
	&shl	($len,4);
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input -
16*6 2314e1051a39Sopenharmony_ci &mov (&DWP($out_off,"esp"),$out); 2315e1051a39Sopenharmony_ci &mov (&DWP($end_off,"esp"),$len); 2316e1051a39Sopenharmony_ci &mov (&DWP($esp_off,"esp"),$rounds); 2317e1051a39Sopenharmony_ci 2318e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2319e1051a39Sopenharmony_ci 2320e1051a39Sopenharmony_ci &test ($block,1); 2321e1051a39Sopenharmony_ci &jnz (&label("odd")); 2322e1051a39Sopenharmony_ci 2323e1051a39Sopenharmony_ci &bsf ($i3,$block); 2324e1051a39Sopenharmony_ci &add ($block,1); 2325e1051a39Sopenharmony_ci &shl ($i3,4); 2326e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i3)); 2327e1051a39Sopenharmony_ci &mov ($i3,$key); # put aside key 2328e1051a39Sopenharmony_ci 2329e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2330e1051a39Sopenharmony_ci &lea ($inp,&DWP(16,$inp)); 2331e1051a39Sopenharmony_ci 2332e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); # ^ last offset_i 2333e1051a39Sopenharmony_ci &pxor ($inout0,$inout5); # ^ offset_i 2334e1051a39Sopenharmony_ci 2335e1051a39Sopenharmony_ci &movdqa ($inout4,$rndkey1); 2336e1051a39Sopenharmony_ci if ($inline) 2337e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 2338e1051a39Sopenharmony_ci else 2339e1051a39Sopenharmony_ci { &call ("_aesni_decrypt1"); } 2340e1051a39Sopenharmony_ci 2341e1051a39Sopenharmony_ci &xorps ($inout0,$inout5); # ^ offset_i 2342e1051a39Sopenharmony_ci &movaps ($rndkey1,$inout4); # pass the checksum 2343e1051a39Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 2344e1051a39Sopenharmony_ci &xorps ($rndkey1,$inout0); # checksum 2345e1051a39Sopenharmony_ci &movups (&QWP(-16,$out,$inp),$inout0); # store output 2346e1051a39Sopenharmony_ci 2347e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$i3)); 2348e1051a39Sopenharmony_ci &mov ($key,$i3); # restore key 2349e1051a39Sopenharmony_ci &mov ($len,&DWP($end_off,"esp")); 2350e1051a39Sopenharmony_ci 2351e1051a39Sopenharmony_ci&set_label("odd"); 
2352e1051a39Sopenharmony_ci &shl ($rounds,4); 2353e1051a39Sopenharmony_ci &mov ($out,16); 2354e1051a39Sopenharmony_ci &sub ($out,$rounds); # twisted rounds 2355e1051a39Sopenharmony_ci &mov (&DWP($key_off,"esp"),$key); 2356e1051a39Sopenharmony_ci &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 2357e1051a39Sopenharmony_ci &mov (&DWP($rounds_off,"esp"),$out); 2358e1051a39Sopenharmony_ci 2359e1051a39Sopenharmony_ci &cmp ($inp,$len); 2360e1051a39Sopenharmony_ci &ja (&label("short")); 2361e1051a39Sopenharmony_ci &jmp (&label("grandloop")); 2362e1051a39Sopenharmony_ci 2363e1051a39Sopenharmony_ci&set_label("grandloop",32); 2364e1051a39Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 2365e1051a39Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 2366e1051a39Sopenharmony_ci &lea ($i5,&DWP(5,$block)); 2367e1051a39Sopenharmony_ci &add ($block,6); 2368e1051a39Sopenharmony_ci &bsf ($i1,$i1); 2369e1051a39Sopenharmony_ci &bsf ($i3,$i3); 2370e1051a39Sopenharmony_ci &bsf ($i5,$i5); 2371e1051a39Sopenharmony_ci &shl ($i1,4); 2372e1051a39Sopenharmony_ci &shl ($i3,4); 2373e1051a39Sopenharmony_ci &shl ($i5,4); 2374e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(0,$l_)); 2375e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(0,$l_,$i1)); 2376e1051a39Sopenharmony_ci &mov ($rounds,&DWP($rounds_off,"esp")); 2377e1051a39Sopenharmony_ci &movdqa ($inout2,$inout0); 2378e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i3)); 2379e1051a39Sopenharmony_ci &movdqa ($inout4,$inout0); 2380e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i5)); 2381e1051a39Sopenharmony_ci 2382e1051a39Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ last offset_i 2383e1051a39Sopenharmony_ci &pxor ($inout1,$inout0); 2384e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); 2385e1051a39Sopenharmony_ci &pxor ($inout2,$inout1); 2386e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout1); 2387e1051a39Sopenharmony_ci &pxor ($inout3,$inout2); 2388e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout2); 
2389e1051a39Sopenharmony_ci &pxor ($inout4,$inout3); 2390e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout3); 2391e1051a39Sopenharmony_ci &pxor ($inout5,$inout4); 2392e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout4); 2393e1051a39Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),$inout5); 2394e1051a39Sopenharmony_ci 2395e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2396e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2397e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 2398e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 2399e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 2400e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 2401e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(16*5,$inp)); 2402e1051a39Sopenharmony_ci &lea ($inp,&DWP(16*6,$inp)); 2403e1051a39Sopenharmony_ci 2404e1051a39Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 2405e1051a39Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2406e1051a39Sopenharmony_ci &pxor ($inout1,$rndkey0); 2407e1051a39Sopenharmony_ci &pxor ($inout2,$rndkey0); 2408e1051a39Sopenharmony_ci &pxor ($inout3,$rndkey0); 2409e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); 2410e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); 2411e1051a39Sopenharmony_ci 2412e1051a39Sopenharmony_ci &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2413e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2414e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 2415e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 2416e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 2417e1051a39Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 2418e1051a39Sopenharmony_ci &pxor ($inout5,&QWP(16*5,"esp")); 2419e1051a39Sopenharmony_ci 2420e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2421e1051a39Sopenharmony_ci &aesdec ($inout0,$rndkey1); 2422e1051a39Sopenharmony_ci &aesdec ($inout1,$rndkey1); 
2423e1051a39Sopenharmony_ci &aesdec ($inout2,$rndkey1); 2424e1051a39Sopenharmony_ci &aesdec ($inout3,$rndkey1); 2425e1051a39Sopenharmony_ci &aesdec ($inout4,$rndkey1); 2426e1051a39Sopenharmony_ci &aesdec ($inout5,$rndkey1); 2427e1051a39Sopenharmony_ci 2428e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2429e1051a39Sopenharmony_ci &mov ($len,&DWP($end_off,"esp")); 2430e1051a39Sopenharmony_ci &call ("_aesni_decrypt6_enter"); 2431e1051a39Sopenharmony_ci 2432e1051a39Sopenharmony_ci &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 2433e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2434e1051a39Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp")); 2435e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 2436e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 2437e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 2438e1051a39Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 2439e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); 2440e1051a39Sopenharmony_ci 2441e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 2442e1051a39Sopenharmony_ci &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 2443e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout1); 2444e1051a39Sopenharmony_ci &movdqu (&QWP(-16*5,$out,$inp),$inout1); 2445e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout2); 2446e1051a39Sopenharmony_ci &movdqu (&QWP(-16*4,$out,$inp),$inout2); 2447e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout3); 2448e1051a39Sopenharmony_ci &movdqu (&QWP(-16*3,$out,$inp),$inout3); 2449e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout4); 2450e1051a39Sopenharmony_ci &movdqu (&QWP(-16*2,$out,$inp),$inout4); 2451e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout5); 2452e1051a39Sopenharmony_ci &movdqu (&QWP(-16*1,$out,$inp),$inout5); 2453e1051a39Sopenharmony_ci &cmp ($inp,$len); # done yet? 
2454e1051a39Sopenharmony_ci &jbe (&label("grandloop")); 2455e1051a39Sopenharmony_ci 2456e1051a39Sopenharmony_ci&set_label("short"); 2457e1051a39Sopenharmony_ci &add ($len,16*6); 2458e1051a39Sopenharmony_ci &sub ($len,$inp); 2459e1051a39Sopenharmony_ci &jz (&label("done")); 2460e1051a39Sopenharmony_ci 2461e1051a39Sopenharmony_ci &cmp ($len,16*2); 2462e1051a39Sopenharmony_ci &jb (&label("one")); 2463e1051a39Sopenharmony_ci &je (&label("two")); 2464e1051a39Sopenharmony_ci 2465e1051a39Sopenharmony_ci &cmp ($len,16*4); 2466e1051a39Sopenharmony_ci &jb (&label("three")); 2467e1051a39Sopenharmony_ci &je (&label("four")); 2468e1051a39Sopenharmony_ci 2469e1051a39Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 2470e1051a39Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 2471e1051a39Sopenharmony_ci &bsf ($i1,$i1); 2472e1051a39Sopenharmony_ci &bsf ($i3,$i3); 2473e1051a39Sopenharmony_ci &shl ($i1,4); 2474e1051a39Sopenharmony_ci &shl ($i3,4); 2475e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(0,$l_)); 2476e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(0,$l_,$i1)); 2477e1051a39Sopenharmony_ci &mov ($rounds,&DWP($rounds_off,"esp")); 2478e1051a39Sopenharmony_ci &movdqa ($inout2,$inout0); 2479e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i3)); 2480e1051a39Sopenharmony_ci &movdqa ($inout4,$inout0); 2481e1051a39Sopenharmony_ci 2482e1051a39Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ last offset_i 2483e1051a39Sopenharmony_ci &pxor ($inout1,$inout0); 2484e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); 2485e1051a39Sopenharmony_ci &pxor ($inout2,$inout1); 2486e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout1); 2487e1051a39Sopenharmony_ci &pxor ($inout3,$inout2); 2488e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout2); 2489e1051a39Sopenharmony_ci &pxor ($inout4,$inout3); 2490e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout3); 2491e1051a39Sopenharmony_ci &pxor ($inout5,$inout4); 2492e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout4); 
2493e1051a39Sopenharmony_ci 2494e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2495e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2496e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 2497e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 2498e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 2499e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(16*4,$inp)); 2500e1051a39Sopenharmony_ci &pxor ($inout5,$inout5); 2501e1051a39Sopenharmony_ci 2502e1051a39Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 2503e1051a39Sopenharmony_ci &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2504e1051a39Sopenharmony_ci &pxor ($inout1,$rndkey0); 2505e1051a39Sopenharmony_ci &pxor ($inout2,$rndkey0); 2506e1051a39Sopenharmony_ci &pxor ($inout3,$rndkey0); 2507e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); 2508e1051a39Sopenharmony_ci 2509e1051a39Sopenharmony_ci &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2510e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2511e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 2512e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 2513e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 2514e1051a39Sopenharmony_ci &pxor ($inout4,&QWP(16*4,"esp")); 2515e1051a39Sopenharmony_ci 2516e1051a39Sopenharmony_ci &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2517e1051a39Sopenharmony_ci &aesdec ($inout0,$rndkey1); 2518e1051a39Sopenharmony_ci &aesdec ($inout1,$rndkey1); 2519e1051a39Sopenharmony_ci &aesdec ($inout2,$rndkey1); 2520e1051a39Sopenharmony_ci &aesdec ($inout3,$rndkey1); 2521e1051a39Sopenharmony_ci &aesdec ($inout4,$rndkey1); 2522e1051a39Sopenharmony_ci &aesdec ($inout5,$rndkey1); 2523e1051a39Sopenharmony_ci 2524e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2525e1051a39Sopenharmony_ci &call ("_aesni_decrypt6_enter"); 2526e1051a39Sopenharmony_ci 2527e1051a39Sopenharmony_ci &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 
2528e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2529e1051a39Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp")); 2530e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 2531e1051a39Sopenharmony_ci &pxor ($inout2,&QWP(16*2,"esp")); 2532e1051a39Sopenharmony_ci &pxor ($inout3,&QWP(16*3,"esp")); 2533e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); 2534e1051a39Sopenharmony_ci 2535e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 2536e1051a39Sopenharmony_ci &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 2537e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout1); 2538e1051a39Sopenharmony_ci &movdqu (&QWP(16*1,$out,$inp),$inout1); 2539e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout2); 2540e1051a39Sopenharmony_ci &movdqu (&QWP(16*2,$out,$inp),$inout2); 2541e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout3); 2542e1051a39Sopenharmony_ci &movdqu (&QWP(16*3,$out,$inp),$inout3); 2543e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout4); 2544e1051a39Sopenharmony_ci &movdqu (&QWP(16*4,$out,$inp),$inout4); 2545e1051a39Sopenharmony_ci 2546e1051a39Sopenharmony_ci &jmp (&label("done")); 2547e1051a39Sopenharmony_ci 2548e1051a39Sopenharmony_ci&set_label("one",16); 2549e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_)); 2550e1051a39Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 2551e1051a39Sopenharmony_ci 2552e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2553e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2554e1051a39Sopenharmony_ci 2555e1051a39Sopenharmony_ci &pxor ($inout5,$rndkey0); # ^ last offset_i 2556e1051a39Sopenharmony_ci &pxor ($inout0,$inout5); # ^ offset_i 2557e1051a39Sopenharmony_ci 2558e1051a39Sopenharmony_ci &movdqa ($inout4,$rndkey1); 2559e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2560e1051a39Sopenharmony_ci if ($inline) 2561e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 2562e1051a39Sopenharmony_ci else 2563e1051a39Sopenharmony_ci { &call 
("_aesni_decrypt1"); } 2564e1051a39Sopenharmony_ci 2565e1051a39Sopenharmony_ci &xorps ($inout0,$inout5); # ^ offset_i 2566e1051a39Sopenharmony_ci &movaps ($rndkey1,$inout4); # pass the checksum 2567e1051a39Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 2568e1051a39Sopenharmony_ci &xorps ($rndkey1,$inout0); # checksum 2569e1051a39Sopenharmony_ci &movups (&QWP(0,$out,$inp),$inout0); 2570e1051a39Sopenharmony_ci 2571e1051a39Sopenharmony_ci &jmp (&label("done")); 2572e1051a39Sopenharmony_ci 2573e1051a39Sopenharmony_ci&set_label("two",16); 2574e1051a39Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 2575e1051a39Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 2576e1051a39Sopenharmony_ci &bsf ($i1,$i1); 2577e1051a39Sopenharmony_ci &shl ($i1,4); 2578e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(0,$l_)); 2579e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i1)); 2580e1051a39Sopenharmony_ci 2581e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2582e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 2583e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2584e1051a39Sopenharmony_ci 2585e1051a39Sopenharmony_ci &movdqa ($inout3,$rndkey1); 2586e1051a39Sopenharmony_ci &pxor ($inout4,$rndkey0); # ^ last offset_i 2587e1051a39Sopenharmony_ci &pxor ($inout5,$inout4); 2588e1051a39Sopenharmony_ci 2589e1051a39Sopenharmony_ci &pxor ($inout0,$inout4); # ^ offset_i 2590e1051a39Sopenharmony_ci &pxor ($inout1,$inout5); 2591e1051a39Sopenharmony_ci 2592e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2593e1051a39Sopenharmony_ci &call ("_aesni_decrypt2"); 2594e1051a39Sopenharmony_ci 2595e1051a39Sopenharmony_ci &xorps ($inout0,$inout4); # ^ offset_i 2596e1051a39Sopenharmony_ci &xorps ($inout1,$inout5); 2597e1051a39Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 2598e1051a39Sopenharmony_ci &xorps ($inout3,$inout0); # checksum 2599e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out,$inp),$inout0); # store output 
2600e1051a39Sopenharmony_ci &xorps ($inout3,$inout1); 2601e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 2602e1051a39Sopenharmony_ci &movaps ($rndkey1,$inout3); # pass the checksum 2603e1051a39Sopenharmony_ci 2604e1051a39Sopenharmony_ci &jmp (&label("done")); 2605e1051a39Sopenharmony_ci 2606e1051a39Sopenharmony_ci&set_label("three",16); 2607e1051a39Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 2608e1051a39Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 2609e1051a39Sopenharmony_ci &bsf ($i1,$i1); 2610e1051a39Sopenharmony_ci &shl ($i1,4); 2611e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_)); 2612e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(0,$l_,$i1)); 2613e1051a39Sopenharmony_ci &movdqa ($inout5,$inout3); 2614e1051a39Sopenharmony_ci 2615e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2616e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 2617e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 2618e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2619e1051a39Sopenharmony_ci 2620e1051a39Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 2621e1051a39Sopenharmony_ci &pxor ($inout3,$rndkey0); # ^ last offset_i 2622e1051a39Sopenharmony_ci &pxor ($inout4,$inout3); 2623e1051a39Sopenharmony_ci &pxor ($inout5,$inout4); 2624e1051a39Sopenharmony_ci 2625e1051a39Sopenharmony_ci &pxor ($inout0,$inout3); # ^ offset_i 2626e1051a39Sopenharmony_ci &pxor ($inout1,$inout4); 2627e1051a39Sopenharmony_ci &pxor ($inout2,$inout5); 2628e1051a39Sopenharmony_ci 2629e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2630e1051a39Sopenharmony_ci &call ("_aesni_decrypt3"); 2631e1051a39Sopenharmony_ci 2632e1051a39Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2633e1051a39Sopenharmony_ci &xorps ($inout0,$inout3); # ^ offset_i 2634e1051a39Sopenharmony_ci &xorps ($inout1,$inout4); 2635e1051a39Sopenharmony_ci &xorps ($inout2,$inout5); 2636e1051a39Sopenharmony_ci &movups 
(&QWP(16*0,$out,$inp),$inout0); # store output 2637e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 2638e1051a39Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 2639e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 2640e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout1); 2641e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out,$inp),$inout2); 2642e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout2); 2643e1051a39Sopenharmony_ci 2644e1051a39Sopenharmony_ci &jmp (&label("done")); 2645e1051a39Sopenharmony_ci 2646e1051a39Sopenharmony_ci&set_label("four",16); 2647e1051a39Sopenharmony_ci &lea ($i1,&DWP(1,$block)); 2648e1051a39Sopenharmony_ci &lea ($i3,&DWP(3,$block)); 2649e1051a39Sopenharmony_ci &bsf ($i1,$i1); 2650e1051a39Sopenharmony_ci &bsf ($i3,$i3); 2651e1051a39Sopenharmony_ci &mov ($key,&DWP($key_off,"esp")); # restore key 2652e1051a39Sopenharmony_ci &shl ($i1,4); 2653e1051a39Sopenharmony_ci &shl ($i3,4); 2654e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(0,$l_)); 2655e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(0,$l_,$i1)); 2656e1051a39Sopenharmony_ci &movdqa ($inout4,$inout2); 2657e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0,$l_,$i3)); 2658e1051a39Sopenharmony_ci 2659e1051a39Sopenharmony_ci &pxor ($inout2,$rndkey0); # ^ last offset_i 2660e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2661e1051a39Sopenharmony_ci &pxor ($inout3,$inout2); 2662e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(16*1,$inp)); 2663e1051a39Sopenharmony_ci &pxor ($inout4,$inout3); 2664e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout2); 2665e1051a39Sopenharmony_ci &pxor ($inout5,$inout4); 2666e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout3); 2667e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(16*2,$inp)); 2668e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(16*3,$inp)); 2669e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2670e1051a39Sopenharmony_ci 2671e1051a39Sopenharmony_ci &movdqa (&QWP($checksum,"esp"),$rndkey1); 
2672e1051a39Sopenharmony_ci &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2673e1051a39Sopenharmony_ci &pxor ($inout1,&QWP(16*1,"esp")); 2674e1051a39Sopenharmony_ci &pxor ($inout2,$inout4); 2675e1051a39Sopenharmony_ci &pxor ($inout3,$inout5); 2676e1051a39Sopenharmony_ci 2677e1051a39Sopenharmony_ci &mov ($out,&DWP($out_off,"esp")); 2678e1051a39Sopenharmony_ci &call ("_aesni_decrypt4"); 2679e1051a39Sopenharmony_ci 2680e1051a39Sopenharmony_ci &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2681e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2682e1051a39Sopenharmony_ci &xorps ($inout1,&QWP(16*1,"esp")); 2683e1051a39Sopenharmony_ci &xorps ($inout2,$inout4); 2684e1051a39Sopenharmony_ci &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2685e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout0); # checksum 2686e1051a39Sopenharmony_ci &xorps ($inout3,$inout5); 2687e1051a39Sopenharmony_ci &movups (&QWP(16*1,$out,$inp),$inout1); 2688e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout1); 2689e1051a39Sopenharmony_ci &movdqa ($rndkey0,$inout5); # pass last offset_i 2690e1051a39Sopenharmony_ci &movups (&QWP(16*2,$out,$inp),$inout2); 2691e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout2); 2692e1051a39Sopenharmony_ci &movups (&QWP(16*3,$out,$inp),$inout3); 2693e1051a39Sopenharmony_ci &pxor ($rndkey1,$inout3); 2694e1051a39Sopenharmony_ci 2695e1051a39Sopenharmony_ci&set_label("done"); 2696e1051a39Sopenharmony_ci &mov ($key,&DWP($esp_off,"esp")); 2697e1051a39Sopenharmony_ci &pxor ($inout0,$inout0); # clear register bank 2698e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2699e1051a39Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 2700e1051a39Sopenharmony_ci &pxor ($inout2,$inout2); 2701e1051a39Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$inout0); 2702e1051a39Sopenharmony_ci &pxor ($inout3,$inout3); 2703e1051a39Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$inout0); 2704e1051a39Sopenharmony_ci &pxor ($inout4,$inout4); 
2705e1051a39Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$inout0); 2706e1051a39Sopenharmony_ci &pxor ($inout5,$inout5); 2707e1051a39Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$inout0); 2708e1051a39Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),$inout0); 2709e1051a39Sopenharmony_ci &movdqa (&QWP(16*6,"esp"),$inout0); 2710e1051a39Sopenharmony_ci 2711e1051a39Sopenharmony_ci &lea ("esp",&DWP(0,$key)); 2712e1051a39Sopenharmony_ci &mov ($rounds,&wparam(5)); # &offset_i 2713e1051a39Sopenharmony_ci &mov ($rounds_,&wparam(7)); # &checksum 2714e1051a39Sopenharmony_ci &movdqu (&QWP(0,$rounds),$rndkey0); 2715e1051a39Sopenharmony_ci &pxor ($rndkey0,$rndkey0); 2716e1051a39Sopenharmony_ci &movdqu (&QWP(0,$rounds_),$rndkey1); 2717e1051a39Sopenharmony_ci &pxor ($rndkey1,$rndkey1); 2718e1051a39Sopenharmony_ci&function_end("aesni_ocb_decrypt"); 2719e1051a39Sopenharmony_ci} 2720e1051a39Sopenharmony_ci} 2721e1051a39Sopenharmony_ci 2722e1051a39Sopenharmony_ci###################################################################### 2723e1051a39Sopenharmony_ci# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2724e1051a39Sopenharmony_ci# size_t length, const AES_KEY *key, 2725e1051a39Sopenharmony_ci# unsigned char *ivp,const int enc); 2726e1051a39Sopenharmony_ci&function_begin("${PREFIX}_cbc_encrypt"); 2727e1051a39Sopenharmony_ci &mov ($inp,&wparam(0)); 2728e1051a39Sopenharmony_ci &mov ($rounds_,"esp"); 2729e1051a39Sopenharmony_ci &mov ($out,&wparam(1)); 2730e1051a39Sopenharmony_ci &sub ($rounds_,24); 2731e1051a39Sopenharmony_ci &mov ($len,&wparam(2)); 2732e1051a39Sopenharmony_ci &and ($rounds_,-16); 2733e1051a39Sopenharmony_ci &mov ($key,&wparam(3)); 2734e1051a39Sopenharmony_ci &mov ($key_,&wparam(4)); 2735e1051a39Sopenharmony_ci &test ($len,$len); 2736e1051a39Sopenharmony_ci &jz (&label("cbc_abort")); 2737e1051a39Sopenharmony_ci 2738e1051a39Sopenharmony_ci &cmp (&wparam(5),0); 2739e1051a39Sopenharmony_ci &xchg ($rounds_,"esp"); # alloca 2740e1051a39Sopenharmony_ci &movups 
($ivec,&QWP(0,$key_)); # load IV 2741e1051a39Sopenharmony_ci &mov ($rounds,&DWP(240,$key)); 2742e1051a39Sopenharmony_ci &mov ($key_,$key); # backup $key 2743e1051a39Sopenharmony_ci &mov (&DWP(16,"esp"),$rounds_); # save original %esp 2744e1051a39Sopenharmony_ci &mov ($rounds_,$rounds); # backup $rounds 2745e1051a39Sopenharmony_ci &je (&label("cbc_decrypt")); 2746e1051a39Sopenharmony_ci 2747e1051a39Sopenharmony_ci &movaps ($inout0,$ivec); 2748e1051a39Sopenharmony_ci &cmp ($len,16); 2749e1051a39Sopenharmony_ci &jb (&label("cbc_enc_tail")); 2750e1051a39Sopenharmony_ci &sub ($len,16); 2751e1051a39Sopenharmony_ci &jmp (&label("cbc_enc_loop")); 2752e1051a39Sopenharmony_ci 2753e1051a39Sopenharmony_ci&set_label("cbc_enc_loop",16); 2754e1051a39Sopenharmony_ci &movups ($ivec,&QWP(0,$inp)); # input actually 2755e1051a39Sopenharmony_ci &lea ($inp,&DWP(16,$inp)); 2756e1051a39Sopenharmony_ci if ($inline) 2757e1051a39Sopenharmony_ci { &aesni_inline_generate1("enc",$inout0,$ivec); } 2758e1051a39Sopenharmony_ci else 2759e1051a39Sopenharmony_ci { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 2760e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 2761e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 2762e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); # store output 2763e1051a39Sopenharmony_ci &lea ($out,&DWP(16,$out)); 2764e1051a39Sopenharmony_ci &sub ($len,16); 2765e1051a39Sopenharmony_ci &jnc (&label("cbc_enc_loop")); 2766e1051a39Sopenharmony_ci &add ($len,16); 2767e1051a39Sopenharmony_ci &jnz (&label("cbc_enc_tail")); 2768e1051a39Sopenharmony_ci &movaps ($ivec,$inout0); 2769e1051a39Sopenharmony_ci &pxor ($inout0,$inout0); 2770e1051a39Sopenharmony_ci &jmp (&label("cbc_ret")); 2771e1051a39Sopenharmony_ci 2772e1051a39Sopenharmony_ci&set_label("cbc_enc_tail"); 2773e1051a39Sopenharmony_ci &mov ("ecx",$len); # zaps $rounds 2774e1051a39Sopenharmony_ci &data_word(0xA4F3F689); # rep movsb 2775e1051a39Sopenharmony_ci &mov ("ecx",16); # zero tail 
2776e1051a39Sopenharmony_ci &sub ("ecx",$len); 2777e1051a39Sopenharmony_ci &xor ("eax","eax"); # zaps $len 2778e1051a39Sopenharmony_ci &data_word(0xAAF3F689); # rep stosb 2779e1051a39Sopenharmony_ci &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 2780e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 2781e1051a39Sopenharmony_ci &mov ($inp,$out); # $inp and $out are the same 2782e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 2783e1051a39Sopenharmony_ci &jmp (&label("cbc_enc_loop")); 2784e1051a39Sopenharmony_ci###################################################################### 2785e1051a39Sopenharmony_ci&set_label("cbc_decrypt",16); 2786e1051a39Sopenharmony_ci &cmp ($len,0x50); 2787e1051a39Sopenharmony_ci &jbe (&label("cbc_dec_tail")); 2788e1051a39Sopenharmony_ci &movaps (&QWP(0,"esp"),$ivec); # save IV 2789e1051a39Sopenharmony_ci &sub ($len,0x50); 2790e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_loop6_enter")); 2791e1051a39Sopenharmony_ci 2792e1051a39Sopenharmony_ci&set_label("cbc_dec_loop6",16); 2793e1051a39Sopenharmony_ci &movaps (&QWP(0,"esp"),$rndkey0); # save IV 2794e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout5); 2795e1051a39Sopenharmony_ci &lea ($out,&DWP(0x10,$out)); 2796e1051a39Sopenharmony_ci&set_label("cbc_dec_loop6_enter"); 2797e1051a39Sopenharmony_ci &movdqu ($inout0,&QWP(0,$inp)); 2798e1051a39Sopenharmony_ci &movdqu ($inout1,&QWP(0x10,$inp)); 2799e1051a39Sopenharmony_ci &movdqu ($inout2,&QWP(0x20,$inp)); 2800e1051a39Sopenharmony_ci &movdqu ($inout3,&QWP(0x30,$inp)); 2801e1051a39Sopenharmony_ci &movdqu ($inout4,&QWP(0x40,$inp)); 2802e1051a39Sopenharmony_ci &movdqu ($inout5,&QWP(0x50,$inp)); 2803e1051a39Sopenharmony_ci 2804e1051a39Sopenharmony_ci &call ("_aesni_decrypt6"); 2805e1051a39Sopenharmony_ci 2806e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0,$inp)); 2807e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x10,$inp)); 2808e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(0,"esp")); # ^=IV 
2809e1051a39Sopenharmony_ci &xorps ($inout1,$rndkey1); 2810e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0x20,$inp)); 2811e1051a39Sopenharmony_ci &xorps ($inout2,$rndkey0); 2812e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x30,$inp)); 2813e1051a39Sopenharmony_ci &xorps ($inout3,$rndkey1); 2814e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0x40,$inp)); 2815e1051a39Sopenharmony_ci &xorps ($inout4,$rndkey0); 2816e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x50,$inp)); # IV 2817e1051a39Sopenharmony_ci &xorps ($inout5,$rndkey1); 2818e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2819e1051a39Sopenharmony_ci &movups (&QWP(0x10,$out),$inout1); 2820e1051a39Sopenharmony_ci &lea ($inp,&DWP(0x60,$inp)); 2821e1051a39Sopenharmony_ci &movups (&QWP(0x20,$out),$inout2); 2822e1051a39Sopenharmony_ci &mov ($rounds,$rounds_); # restore $rounds 2823e1051a39Sopenharmony_ci &movups (&QWP(0x30,$out),$inout3); 2824e1051a39Sopenharmony_ci &mov ($key,$key_); # restore $key 2825e1051a39Sopenharmony_ci &movups (&QWP(0x40,$out),$inout4); 2826e1051a39Sopenharmony_ci &lea ($out,&DWP(0x50,$out)); 2827e1051a39Sopenharmony_ci &sub ($len,0x60); 2828e1051a39Sopenharmony_ci &ja (&label("cbc_dec_loop6")); 2829e1051a39Sopenharmony_ci 2830e1051a39Sopenharmony_ci &movaps ($inout0,$inout5); 2831e1051a39Sopenharmony_ci &movaps ($ivec,$rndkey0); 2832e1051a39Sopenharmony_ci &add ($len,0x50); 2833e1051a39Sopenharmony_ci &jle (&label("cbc_dec_clear_tail_collected")); 2834e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2835e1051a39Sopenharmony_ci &lea ($out,&DWP(0x10,$out)); 2836e1051a39Sopenharmony_ci&set_label("cbc_dec_tail"); 2837e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); 2838e1051a39Sopenharmony_ci &movaps ($in0,$inout0); 2839e1051a39Sopenharmony_ci &cmp ($len,0x10); 2840e1051a39Sopenharmony_ci &jbe (&label("cbc_dec_one")); 2841e1051a39Sopenharmony_ci 2842e1051a39Sopenharmony_ci &movups ($inout1,&QWP(0x10,$inp)); 2843e1051a39Sopenharmony_ci &movaps ($in1,$inout1); 
2844e1051a39Sopenharmony_ci &cmp ($len,0x20); 2845e1051a39Sopenharmony_ci &jbe (&label("cbc_dec_two")); 2846e1051a39Sopenharmony_ci 2847e1051a39Sopenharmony_ci &movups ($inout2,&QWP(0x20,$inp)); 2848e1051a39Sopenharmony_ci &cmp ($len,0x30); 2849e1051a39Sopenharmony_ci &jbe (&label("cbc_dec_three")); 2850e1051a39Sopenharmony_ci 2851e1051a39Sopenharmony_ci &movups ($inout3,&QWP(0x30,$inp)); 2852e1051a39Sopenharmony_ci &cmp ($len,0x40); 2853e1051a39Sopenharmony_ci &jbe (&label("cbc_dec_four")); 2854e1051a39Sopenharmony_ci 2855e1051a39Sopenharmony_ci &movups ($inout4,&QWP(0x40,$inp)); 2856e1051a39Sopenharmony_ci &movaps (&QWP(0,"esp"),$ivec); # save IV 2857e1051a39Sopenharmony_ci &movups ($inout0,&QWP(0,$inp)); 2858e1051a39Sopenharmony_ci &xorps ($inout5,$inout5); 2859e1051a39Sopenharmony_ci &call ("_aesni_decrypt6"); 2860e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0,$inp)); 2861e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x10,$inp)); 2862e1051a39Sopenharmony_ci &xorps ($inout0,&QWP(0,"esp")); # ^= IV 2863e1051a39Sopenharmony_ci &xorps ($inout1,$rndkey1); 2864e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0x20,$inp)); 2865e1051a39Sopenharmony_ci &xorps ($inout2,$rndkey0); 2866e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x30,$inp)); 2867e1051a39Sopenharmony_ci &xorps ($inout3,$rndkey1); 2868e1051a39Sopenharmony_ci &movups ($ivec,&QWP(0x40,$inp)); # IV 2869e1051a39Sopenharmony_ci &xorps ($inout4,$rndkey0); 2870e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2871e1051a39Sopenharmony_ci &movups (&QWP(0x10,$out),$inout1); 2872e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2873e1051a39Sopenharmony_ci &movups (&QWP(0x20,$out),$inout2); 2874e1051a39Sopenharmony_ci &pxor ($inout2,$inout2); 2875e1051a39Sopenharmony_ci &movups (&QWP(0x30,$out),$inout3); 2876e1051a39Sopenharmony_ci &pxor ($inout3,$inout3); 2877e1051a39Sopenharmony_ci &lea ($out,&DWP(0x40,$out)); 2878e1051a39Sopenharmony_ci &movaps ($inout0,$inout4); 2879e1051a39Sopenharmony_ci &pxor 
($inout4,$inout4); 2880e1051a39Sopenharmony_ci &sub ($len,0x50); 2881e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_tail_collected")); 2882e1051a39Sopenharmony_ci 2883e1051a39Sopenharmony_ci&set_label("cbc_dec_one",16); 2884e1051a39Sopenharmony_ci if ($inline) 2885e1051a39Sopenharmony_ci { &aesni_inline_generate1("dec"); } 2886e1051a39Sopenharmony_ci else 2887e1051a39Sopenharmony_ci { &call ("_aesni_decrypt1"); } 2888e1051a39Sopenharmony_ci &xorps ($inout0,$ivec); 2889e1051a39Sopenharmony_ci &movaps ($ivec,$in0); 2890e1051a39Sopenharmony_ci &sub ($len,0x10); 2891e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_tail_collected")); 2892e1051a39Sopenharmony_ci 2893e1051a39Sopenharmony_ci&set_label("cbc_dec_two",16); 2894e1051a39Sopenharmony_ci &call ("_aesni_decrypt2"); 2895e1051a39Sopenharmony_ci &xorps ($inout0,$ivec); 2896e1051a39Sopenharmony_ci &xorps ($inout1,$in0); 2897e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2898e1051a39Sopenharmony_ci &movaps ($inout0,$inout1); 2899e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2900e1051a39Sopenharmony_ci &lea ($out,&DWP(0x10,$out)); 2901e1051a39Sopenharmony_ci &movaps ($ivec,$in1); 2902e1051a39Sopenharmony_ci &sub ($len,0x20); 2903e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_tail_collected")); 2904e1051a39Sopenharmony_ci 2905e1051a39Sopenharmony_ci&set_label("cbc_dec_three",16); 2906e1051a39Sopenharmony_ci &call ("_aesni_decrypt3"); 2907e1051a39Sopenharmony_ci &xorps ($inout0,$ivec); 2908e1051a39Sopenharmony_ci &xorps ($inout1,$in0); 2909e1051a39Sopenharmony_ci &xorps ($inout2,$in1); 2910e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2911e1051a39Sopenharmony_ci &movaps ($inout0,$inout2); 2912e1051a39Sopenharmony_ci &pxor ($inout2,$inout2); 2913e1051a39Sopenharmony_ci &movups (&QWP(0x10,$out),$inout1); 2914e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2915e1051a39Sopenharmony_ci &lea ($out,&DWP(0x20,$out)); 2916e1051a39Sopenharmony_ci &movups ($ivec,&QWP(0x20,$inp)); 2917e1051a39Sopenharmony_ci &sub 
($len,0x30); 2918e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_tail_collected")); 2919e1051a39Sopenharmony_ci 2920e1051a39Sopenharmony_ci&set_label("cbc_dec_four",16); 2921e1051a39Sopenharmony_ci &call ("_aesni_decrypt4"); 2922e1051a39Sopenharmony_ci &movups ($rndkey1,&QWP(0x10,$inp)); 2923e1051a39Sopenharmony_ci &movups ($rndkey0,&QWP(0x20,$inp)); 2924e1051a39Sopenharmony_ci &xorps ($inout0,$ivec); 2925e1051a39Sopenharmony_ci &movups ($ivec,&QWP(0x30,$inp)); 2926e1051a39Sopenharmony_ci &xorps ($inout1,$in0); 2927e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2928e1051a39Sopenharmony_ci &xorps ($inout2,$rndkey1); 2929e1051a39Sopenharmony_ci &movups (&QWP(0x10,$out),$inout1); 2930e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2931e1051a39Sopenharmony_ci &xorps ($inout3,$rndkey0); 2932e1051a39Sopenharmony_ci &movups (&QWP(0x20,$out),$inout2); 2933e1051a39Sopenharmony_ci &pxor ($inout2,$inout2); 2934e1051a39Sopenharmony_ci &lea ($out,&DWP(0x30,$out)); 2935e1051a39Sopenharmony_ci &movaps ($inout0,$inout3); 2936e1051a39Sopenharmony_ci &pxor ($inout3,$inout3); 2937e1051a39Sopenharmony_ci &sub ($len,0x40); 2938e1051a39Sopenharmony_ci &jmp (&label("cbc_dec_tail_collected")); 2939e1051a39Sopenharmony_ci 2940e1051a39Sopenharmony_ci&set_label("cbc_dec_clear_tail_collected",16); 2941e1051a39Sopenharmony_ci &pxor ($inout1,$inout1); 2942e1051a39Sopenharmony_ci &pxor ($inout2,$inout2); 2943e1051a39Sopenharmony_ci &pxor ($inout3,$inout3); 2944e1051a39Sopenharmony_ci &pxor ($inout4,$inout4); 2945e1051a39Sopenharmony_ci&set_label("cbc_dec_tail_collected"); 2946e1051a39Sopenharmony_ci &and ($len,15); 2947e1051a39Sopenharmony_ci &jnz (&label("cbc_dec_tail_partial")); 2948e1051a39Sopenharmony_ci &movups (&QWP(0,$out),$inout0); 2949e1051a39Sopenharmony_ci &pxor ($rndkey0,$rndkey0); 2950e1051a39Sopenharmony_ci &jmp (&label("cbc_ret")); 2951e1051a39Sopenharmony_ci 2952e1051a39Sopenharmony_ci&set_label("cbc_dec_tail_partial",16); 2953e1051a39Sopenharmony_ci &movaps 
(&QWP(0,"esp"),$inout0); 2954e1051a39Sopenharmony_ci &pxor ($rndkey0,$rndkey0); 2955e1051a39Sopenharmony_ci &mov ("ecx",16); 2956e1051a39Sopenharmony_ci &mov ($inp,"esp"); 2957e1051a39Sopenharmony_ci &sub ("ecx",$len); 2958e1051a39Sopenharmony_ci &data_word(0xA4F3F689); # rep movsb 2959e1051a39Sopenharmony_ci &movdqa (&QWP(0,"esp"),$inout0); 2960e1051a39Sopenharmony_ci 2961e1051a39Sopenharmony_ci&set_label("cbc_ret"); 2962e1051a39Sopenharmony_ci &mov ("esp",&DWP(16,"esp")); # pull original %esp 2963e1051a39Sopenharmony_ci &mov ($key_,&wparam(4)); 2964e1051a39Sopenharmony_ci &pxor ($inout0,$inout0); 2965e1051a39Sopenharmony_ci &pxor ($rndkey1,$rndkey1); 2966e1051a39Sopenharmony_ci &movups (&QWP(0,$key_),$ivec); # output IV 2967e1051a39Sopenharmony_ci &pxor ($ivec,$ivec); 2968e1051a39Sopenharmony_ci&set_label("cbc_abort"); 2969e1051a39Sopenharmony_ci&function_end("${PREFIX}_cbc_encrypt"); 2970e1051a39Sopenharmony_ci 2971e1051a39Sopenharmony_ci###################################################################### 2972e1051a39Sopenharmony_ci# Mechanical port from aesni-x86_64.pl. 
#
# _aesni_set_encrypt_key is a private interface shared by the public
# ${PREFIX}_set_encrypt_key and ${PREFIX}_set_decrypt_key entry points.
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not 128, 192 or 256
#	$rounds	on success 9, 11 or 13 (i.e. rounds-1); every path below
#		also stores that value at byte offset 240 of the schedule
# $key is advanced while the schedule is written, so callers that still
# need the pointer have to reload it.

# Emits the 7-instruction sequence shared by all *_alt code paths.
# With $acc holding dwords (d0,d1,d2,d3) it leaves
# (d0, d0^d1, d0^d1^d2, d0^d1^d2^d3) in $acc; $tmp is clobbered.
my $emit_prefix_xor = sub {
    my ($acc,$tmp) = @_;

    &movdqa     ($tmp,$acc);
    &pslldq     ($acc,4);
    &pxor       ($tmp,$acc);
    &pslldq     ($acc,4);
    &pxor       ($tmp,$acc);
    &pslldq     ($acc,4);
    &pxor       ($acc,$tmp);
};

&function_begin_B("_aesni_set_encrypt_key");
    &push       ("ebp");
    &push       ("ebx");
    &test       ("eax","eax");              # userKey == NULL?
    &jz         (&label("bad_pointer"));
    &test       ($key,$key);                # key == NULL?
    &jz         (&label("bad_pointer"));

    # call/pop pair to obtain a position-independent pointer to key_const
    &call       (&label("pic"));
&set_label("pic");
    &blindpop   ("ebx");
    &lea        ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

    &picmeup    ("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
    &movups     ("xmm0",&QWP(0,"eax"));     # pull first 128 bits of *userKey
    &xorps      ("xmm4","xmm4");            # low dword of xmm4 is assumed 0
    &mov        ("ebp",&DWP(4,"ebp"));      # second dword of the capability vector
    &lea        ($key,&DWP(16,$key));
    &and        ("ebp",1<<28|1<<11);        # AVX and XOP bits
    &cmp        ($rounds,256);
    &je         (&label("14rounds"));
    &cmp        ($rounds,192);
    &je         (&label("12rounds"));
    &cmp        ($rounds,128);
    &jne        (&label("bad_keybits"));

# ---- 128-bit key, aeskeygenassist flavour ----
&set_label("10rounds",16);
    &cmp        ("ebp",1<<28);              # AVX set, XOP clear -> *_alt path
    &je         (&label("10rounds_alt"));

    &mov        ($rounds,9);
    &$movekey   (&QWP(-16,$key),"xmm0");        # round 0
    &aeskeygenassist("xmm1","xmm0",0x01);       # round 1
    &call       (&label("key_128_cold"));
    foreach my $rcon (0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36) {
        &aeskeygenassist("xmm1","xmm0",$rcon);  # rounds 2..10
        &call   (&label("key_128"));
    }
    &$movekey   (&QWP(0,$key),"xmm0");
    &mov        (&DWP(80,$key),$rounds);

    &jmp        (&label("good_key"));

# key_128 stores the previous round key and falls into key_128_cold,
# which derives the next one in xmm0 from xmm0 and xmm1.
&set_label("key_128",16);
    &$movekey   (&QWP(0,$key),"xmm0");
    &lea        ($key,&DWP(16,$key));
&set_label("key_128_cold");
    &shufps     ("xmm4","xmm0",0b00010000);
    &xorps      ("xmm0","xmm4");
    &shufps     ("xmm4","xmm0",0b10001100);
    &xorps      ("xmm0","xmm4");
    &shufps     ("xmm1","xmm1",0b11111111);     # critical path
    &xorps      ("xmm0","xmm1");
    &ret        ();

# ---- 128-bit key, pshufb/aesenclast flavour ----
&set_label("10rounds_alt",16);
    &movdqa     ("xmm5",&QWP(0x00,"ebx"));      # pshufb mask from key_const+0x00
    &mov        ($rounds,8);
    &movdqa     ("xmm4",&QWP(0x20,"ebx"));      # {1,1,1,1}, doubled by pslld each pass
    &movdqa     ("xmm2","xmm0");
    &movdqu     (&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
    &pshufb     ("xmm0","xmm5");
    &aesenclast ("xmm0","xmm4");
    &pslld      ("xmm4",1);
    &lea        ($key,&DWP(16,$key));

    $emit_prefix_xor->("xmm2","xmm3");

    &pxor       ("xmm0","xmm2");
    &movdqu     (&QWP(-16,$key),"xmm0");
    &movdqa     ("xmm2","xmm0");

    &dec        ($rounds);
    &jnz        (&label("loop_key128"));

    &movdqa     ("xmm4",&QWP(0x30,"ebx"));      # {0x1b,...}: constant for the 9th key

    &pshufb     ("xmm0","xmm5");
    &aesenclast ("xmm0","xmm4");
    &pslld      ("xmm4",1);                     # 0x1b -> 0x36 for the last key

    $emit_prefix_xor->("xmm2","xmm3");

    &pxor       ("xmm0","xmm2");
    &movdqu     (&QWP(0,$key),"xmm0");

    &movdqa     ("xmm2","xmm0");
    &pshufb     ("xmm0","xmm5");
    &aesenclast ("xmm0","xmm4");

    $emit_prefix_xor->("xmm2","xmm3");

    &pxor       ("xmm0","xmm2");
    &movdqu     (&QWP(16,$key),"xmm0");

    &mov        ($rounds,9);
    &mov        (&DWP(96,$key),$rounds);

    &jmp        (&label("good_key"));

# ---- 192-bit key, aeskeygenassist flavour ----
&set_label("12rounds",16);
    &movq       ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
    &cmp        ("ebp",1<<28);
    &je         (&label("12rounds_alt"));

    &mov        ($rounds,11);
    &$movekey   (&QWP(-16,$key),"xmm0");        # round 0
    foreach my $step (
        [0x01,"key_192a_cold"],                 # round 1,2
        [0x02,"key_192b"],                      # round 2,3
        [0x04,"key_192a"],                      # round 4,5
        [0x08,"key_192b"],                      # round 5,6
        [0x10,"key_192a"],                      # round 7,8
        [0x20,"key_192b"],                      # round 8,9
        [0x40,"key_192a"],                      # round 10,11
        [0x80,"key_192b"],                      # round 11,12
    ) {
        my ($rcon,$helper) = @$step;
        &aeskeygenassist("xmm1","xmm2",$rcon);
        &call   (&label($helper));
    }
    &$movekey   (&QWP(0,$key),"xmm0");
    &mov        (&DWP(48,$key),$rounds);

    &jmp        (&label("good_key"));

&set_label("key_192a",16);
    &$movekey   (&QWP(0,$key),"xmm0");
    &lea        ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
    &movaps     ("xmm5","xmm2");
&set_label("key_192b_warm");
    &shufps     ("xmm4","xmm0",0b00010000);
    &movdqa     ("xmm3","xmm2");
    &xorps      ("xmm0","xmm4");
    &shufps     ("xmm4","xmm0",0b10001100);
    &pslldq     ("xmm3",4);
    &xorps      ("xmm0","xmm4");
    &pshufd     ("xmm1","xmm1",0b01010101);     # critical path
    &pxor       ("xmm2","xmm3");
    &pxor       ("xmm0","xmm1");
    &pshufd     ("xmm3","xmm0",0b11111111);
    &pxor       ("xmm2","xmm3");
    &ret        ();

# key_192b writes two 16-byte slots assembled from xmm5, xmm0 and xmm2,
# then re-enters the shared derivation at key_192b_warm.
&set_label("key_192b",16);
    &movaps     ("xmm3","xmm0");
    &shufps     ("xmm5","xmm0",0b01000100);
    &$movekey   (&QWP(0,$key),"xmm5");
    &shufps     ("xmm3","xmm2",0b01001110);
    &$movekey   (&QWP(16,$key),"xmm3");
    &lea        ($key,&DWP(32,$key));
    &jmp        (&label("key_192b_warm"));

# ---- 192-bit key, pshufb/aesenclast flavour ----
&set_label("12rounds_alt",16);
    &movdqa     ("xmm5",&QWP(0x10,"ebx"));      # pshufb mask from key_const+0x10
    &movdqa     ("xmm4",&QWP(0x20,"ebx"));      # {1,1,1,1}
    &mov        ($rounds,8);
    &movdqu     (&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
    &movq       (&QWP(0,$key),"xmm2");
    &movdqa     ("xmm1","xmm2");
    &pshufb     ("xmm2","xmm5");
    &aesenclast ("xmm2","xmm4");
    &pslld      ("xmm4",1);
    &lea        ($key,&DWP(24,$key));           # 24 bytes of schedule per pass

    $emit_prefix_xor->("xmm0","xmm3");

    &pshufd     ("xmm3","xmm0",0xff);
    &pxor       ("xmm3","xmm1");
    &pslldq     ("xmm1",4);
    &pxor       ("xmm3","xmm1");

    &pxor       ("xmm0","xmm2");
    &pxor       ("xmm2","xmm3");
    &movdqu     (&QWP(-16,$key),"xmm0");

    &dec        ($rounds);
    &jnz        (&label("loop_key192"));

    &mov        ($rounds,11);
    &mov        (&DWP(32,$key),$rounds);

    &jmp        (&label("good_key"));

# ---- 256-bit key, aeskeygenassist flavour ----
&set_label("14rounds",16);
    &movups     ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
    &lea        ($key,&DWP(16,$key));
    &cmp        ("ebp",1<<28);
    &je         (&label("14rounds_alt"));

    &mov        ($rounds,13);
    &$movekey   (&QWP(-32,$key),"xmm0");        # round 0
    &$movekey   (&QWP(-16,$key),"xmm2");        # round 1
    &aeskeygenassist("xmm1","xmm2",0x01);       # round 2
    &call       (&label("key_256a_cold"));
    &aeskeygenassist("xmm1","xmm0",0x01);       # round 3
    &call       (&label("key_256b"));
    foreach my $rcon (0x02,0x04,0x08,0x10,0x20) {
        &aeskeygenassist("xmm1","xmm2",$rcon);  # rounds 4,6,8,10,12
        &call   (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",$rcon);  # rounds 5,7,9,11,13
        &call   (&label("key_256b"));
    }
    &aeskeygenassist("xmm1","xmm2",0x40);       # round 14
    &call       (&label("key_256a"));
    &$movekey   (&QWP(0,$key),"xmm0");
    &mov        (&DWP(16,$key),$rounds);
    &xor        ("eax","eax");                  # repeated at good_key; kept so that
                                                # the generated code stays unchanged

    &jmp        (&label("good_key"));

&set_label("key_256a",16);
    &$movekey   (&QWP(0,$key),"xmm2");
    &lea        ($key,&DWP(16,$key));
&set_label("key_256a_cold");
    &shufps     ("xmm4","xmm0",0b00010000);
    &xorps      ("xmm0","xmm4");
    &shufps     ("xmm4","xmm0",0b10001100);
    &xorps      ("xmm0","xmm4");
    &shufps     ("xmm1","xmm1",0b11111111);     # critical path
    &xorps      ("xmm0","xmm1");
    &ret        ();

&set_label("key_256b",16);
    &$movekey   (&QWP(0,$key),"xmm0");
    &lea        ($key,&DWP(16,$key));

    &shufps     ("xmm4","xmm2",0b00010000);
    &xorps      ("xmm2","xmm4");
    &shufps     ("xmm4","xmm2",0b10001100);
    &xorps      ("xmm2","xmm4");
    &shufps     ("xmm1","xmm1",0b10101010);     # critical path
    &xorps      ("xmm2","xmm1");
    &ret        ();

# ---- 256-bit key, pshufb/aesenclast flavour ----
&set_label("14rounds_alt",16);
    &movdqa     ("xmm5",&QWP(0x00,"ebx"));      # pshufb mask from key_const+0x00
    &movdqa     ("xmm4",&QWP(0x20,"ebx"));      # {1,1,1,1}
    &mov        ($rounds,7);
    &movdqu     (&QWP(-32,$key),"xmm0");
    &movdqa     ("xmm1","xmm2");
    &movdqu     (&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
    &pshufb     ("xmm2","xmm5");
    &aesenclast ("xmm2","xmm4");

    $emit_prefix_xor->("xmm0","xmm3");
    &pslld      ("xmm4",1);

    &pxor       ("xmm0","xmm2");
    &movdqu     (&QWP(0,$key),"xmm0");

    &dec        ($rounds);
    &jz         (&label("done_key256"));        # 7th pass writes the final key only

    &pshufd     ("xmm2","xmm0",0xff);
    &pxor       ("xmm3","xmm3");
    &aesenclast ("xmm2","xmm3");                # all-zero round key

    $emit_prefix_xor->("xmm1","xmm3");

    &pxor       ("xmm2","xmm1");
    &movdqu     (&QWP(16,$key),"xmm2");
    &lea        ($key,&DWP(32,$key));
    &movdqa     ("xmm1","xmm2");
    &jmp        (&label("loop_key256"));

&set_label("done_key256");
    &mov        ($rounds,13);
    &mov        (&DWP(16,$key),$rounds);

# Common success exit: clear xmm0-xmm5, which carried key material.
&set_label("good_key");
    &pxor       ("xmm$_","xmm$_") foreach (0..5);
    &xor        ("eax","eax");
    &pop        ("ebx");
    &pop        ("ebp");
    &ret        ();

&set_label("bad_pointer",4);
    &mov        ("eax",-1);
    &pop        ("ebx");
    &pop        ("ebp");
    &ret        ();
&set_label("bad_keybits",4);
    &pxor       ("xmm0","xmm0");                # xmm0 already holds 128 bits of userKey
    &mov        ("eax",-2);
    &pop        ("ebx");
    &pop        ("ebp");
    &ret        ();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
# Loads the three stack arguments into the registers the private worker
# expects and returns its result code unchanged.
&function_begin_B("${PREFIX}_set_encrypt_key");
    &mov        ("eax",&wparam(0));
    &mov        ($rounds,&wparam(1));
    &mov        ($key,&wparam(2));
    &call       ("_aesni_set_encrypt_key");
    &ret        ();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
# Builds the encryption schedule, then converts it in place: the order of
# the round keys is reversed and aesimc is applied to every key except
# the two outermost ones.
&function_begin_B("${PREFIX}_set_decrypt_key");
    &mov        ("eax",&wparam(0));
    &mov        ($rounds,&wparam(1));
    &mov        ($key,&wparam(2));
    &call       ("_aesni_set_encrypt_key");
    &mov        ($key,&wparam(2));              # worker advanced $key, reload it
    &shl        ($rounds,4);                    # rounds-1 after _aesni_set_encrypt_key
    &test       ("eax","eax");
    &jnz        (&label("dec_key_ret"));        # hand the error code back as is
    &lea        ("eax",&DWP(16,$key,$rounds));  # end of key schedule

    &$movekey   ("xmm0",&QWP(0,$key));          # just swap
    &$movekey   ("xmm1",&QWP(0,"eax"));
    &$movekey   (&QWP(0,"eax"),"xmm0");
    &$movekey   (&QWP(0,$key),"xmm1");
    &lea        ($key,&DWP(16,$key));
    &lea        ("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
    &$movekey   ("xmm0",&QWP(0,$key));          # swap and inverse
    &$movekey   ("xmm1",&QWP(0,"eax"));
    &aesimc     ("xmm0","xmm0");
    &aesimc     ("xmm1","xmm1");
    &lea        ($key,&DWP(16,$key));
    &lea        ("eax",&DWP(-16,"eax"));
    &$movekey   (&QWP(16,"eax"),"xmm0");
    &$movekey   (&QWP(-16,$key),"xmm1");
    &cmp        ("eax",$key);
    &ja         (&label("dec_key_inverse"));    # until the two cursors meet

    &$movekey   ("xmm0",&QWP(0,$key));          # inverse middle
    &aesimc     ("xmm0","xmm0");
    &$movekey   (&QWP(0,$key),"xmm0");

    &pxor       ("xmm0","xmm0");
    &pxor       ("xmm1","xmm1");
    &xor        ("eax","eax");                  # return success
&set_label("dec_key_ret");
    &ret        ();
&function_end_B("${PREFIX}_set_decrypt_key");

# Constants for the *_alt key-schedule paths, addressed relative to "ebx":
#   +0x00  pshufb mask used by the 128- and 256-bit paths
#   +0x10  pshufb mask used by the 192-bit path
#   +0x20  initial round constant, one copy per dword
#   +0x30  round constant 0x1b, loaded by the 128-bit path after its loop
&set_label("key_const",64);
&data_word((0x0c0f0e0d) x 4);
&data_word((0x04070605) x 4);
&data_word((1) x 4);
&data_word((0x1b) x 4);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";