#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                           AMD K8    Core2     PIII      P4
# -evp camellia-128-ecb     21.5      22.8      27.0      28.9
# + over gcc 3.4.6          +90/11%   +70/10%   +53/4%    +160/64%
# + over icc 8.0            +48/19%   +21/15%   +21/17%   +55/37%
#
# camellia-128-cbc          17.3      21.1      23.9      25.9
#
# 128-bit key setup         196       280       256       240   cycles/key
# + over gcc 3.4.6          +30/0%    +17/11%   +11/0%    +63/40%
# + over icc 8.0            +18/3%    +10/0%    +10/3%    +21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the directory this script lives in so that the shared perlasm
# helper (x86asm.pl) can be found next to it or under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$OPENSSL=1;

# The last command-line argument, if any, names the output file.
# Fail loudly if it cannot be opened instead of silently emitting the
# generated assembly to the original STDOUT.
if ($output = pop) {
	open STDOUT,">$output" or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

@T=("eax","ebx","ecx","edx");
$idx="esi";
$key="edi";
$Tbl="ebp";

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|crypt] routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# Byte offsets of the four (pairwise interleaved) S-box tables relative
# to the Camellia_SBOX label; entries are addressed with a scale of 8.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i [,$seed [,$frame]])
#
# Emits the instruction sequence for one Feistel round.
#   $i     - round index; its parity selects which pair of @T registers
#            is consumed as t0/t1 and which pair is used as t2/t3, and
#            it scales the key-schedule displacement;
#   $seed  - byte displacement of the first round key relative to $key
#            (default 0); a negative seed makes the key schedule be
#            walked backwards (8-byte step becomes -8);
#   $frame - byte bias of the s[0-3] slots relative to "esp" (default 0).
# On entry $idx is expected to hold the first key word of the round; on
# exit it holds the prefetched first key word of the next round.
sub Camellia_Feistel {
my ($i,$seed,$frame)=@_;
$seed=0		unless defined($seed);
$frame=0	unless defined($frame);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# all four temporaries are lexical; register roles rotate by $j
my ($t0,$t1,$t2,$t3)=@T[map { ($j+$_)%4 } (0..3)];

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core: input block in @T[0-3], key schedule start in
# $key, S-box base in $Tbl, caller-allocated frame per the $__* layout.
# The result is returned in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	&mov	(@T[3],$__s3);
	&rotl	($idx,1);
	&mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	&or	(@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	&xor	(@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	&mov	($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	&and	(@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	&rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	&xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
	&mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decryption core: same register/frame contract as
# _x86_Camellia_encrypt, but $key enters pointing at the end of the key
# schedule and is stepped backwards until it reaches the saved start.
&function_begin_B("_x86_Camellia_decrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

	&sub	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	&and	($idx,@T[0]);
	&mov	(@T[3],$__s3);
	&rotl	($idx,1);
	&mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	&or	(@T[2],&DWP(4,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	&xor	(@T[2],$__s2);

	&mov	($idx,&DWP(12,$key));
	&mov	($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	&and	(@T[2],&DWP(0,$key));
	&xor	(@T[0],$idx);
	&rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	&xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
	&mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
	&xor	(@T[3],&DWP(12,$key));
	&xor	(@T[0],&DWP(0,$key));
	&xor	(@T[1],&DWP(4,$key));
	&ret	();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family.
# Even on AMD it limits instruction decode rate [because it's
# VectorPath] and consequently performance. PIII, PM and Core[2] seem
# to be the only ones which execute this code ~7% faster...
#
# __rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@store)
#
# shld-based variant. Emits code that rotates the 128-bit value held in
# registers $i0..$i3 ($i0 most significant) left by $rot bits, using
# $idx as scratch; nothing is rotated when $rot is 0. Afterwards each of
# $i0..$i3 that matches the current head of the @store list is written
# to consecutive dwords of the key schedule starting at byte offset
# -128+8*$rnd relative to $key.
sub __rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;	# round number -> dword index
    if ($rot) {
	&mov	($idx,$i0);	# keep original top word for the wrap-around
	&shld	($i0,$i1,$rot);
	&shld	($i1,$i2,$rot);
	&shld	($i2,$i3,$rot);
	&shld	($i3,$idx,$rot);
    }
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
# _rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@store)
#
# shl/shr/or-based 128-bit left rotate of $i0..$i3 ($i0 most
# significant) by $rot bits, using $idx and $Tbl as scratch (so $Tbl is
# clobbered whenever $rot is non-zero). Each of $i0..$i3 that matches
# the current head of the @store list is written to consecutive dwords
# of the key schedule starting at byte offset -128+8*$rnd relative to
# $key; the stores are interleaved with the shifts.
sub _rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;	# round number -> dword index
    if ($rot) {
	&mov	($Tbl,$i0);	# keep original top word for the wrap-around
	&shl	($i0,$rot);
	&mov	($idx,$i1);
	&shr	($idx,32-$rot);
	&shl	($i1,$rot);
	&or	($i0,$idx);
	&mov	($idx,$i2);
	&shl	($i2,$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&shr	($idx,32-$rot);
	&or	($i1,$idx);
	&shr	($Tbl,32-$rot);
	&mov	($idx,$i3);
	&shr	($idx,32-$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&shl	($i3,$rot);
	&or	($i2,$idx);
	&or	($i3,$Tbl);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    } else {
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    }
}

# _saveround($rnd,$key,[$bias,]@regs)
#
# Stores up to four registers to consecutive dwords at byte offset
# $bias+8*$rnd relative to $key. If the first element after $key is a
# non-zero number it is taken as $bias, otherwise $bias is 0.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
}

# _loadround($rnd,$key,[$bias,]@regs)
#
# Counterpart of _saveround: loads up to four registers from consecutive
# dwords at byte offset $bias+8*$rnd relative to $key.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
}

# void Camellia_Ekeygen(
#		const int keyBitLength,
#		const Byte *rawKey,
#		KEY_TABLE_TYPE keyTable)
&function_begin("Camellia_Ekeygen");
{ my $step=0;

	&stack_push(4);				# place for s[0-3]

	&mov	($Tbl,&wparam(0));		# load arguments
	&mov	($idx,&wparam(1));
	&mov	($key,&wparam(2));

	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&mov	(@T[3],&DWP(12,$idx));

	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(0,$key,@T);		# KL<<<0

	&cmp	($Tbl,128);
	&je	(&label("1st128"));

	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
	&mov	(@T[1],&DWP(20,$idx));
	&cmp	($Tbl,192);
	&je	(&label("1st192"));
	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
	&mov	(@T[3],&DWP(28,$idx));
	&jmp	(&label("1st256"));
&set_label("1st192",4);
	# 192-bit key: the low half of KR is the complement of its high half
	&mov	(@T[2],@T[0]);
	&mov	(@T[3],@T[1]);
	&not	(@T[2]);
	&not	(@T[3]);
&set_label("1st256",4);
	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(4,$key,@T);		# temporary storage for KR!

	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
	&xor	(@T[1],&DWP(0*8+4,$key));
	&xor	(@T[2],&DWP(1*8+0,$key));
	&xor	(@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(2));
	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
	&xor	(@T[1],&DWP(0*8+4,$idx));
	&xor	(@T[2],&DWP(1*8+0,$idx));
	&xor	(@T[3],&DWP(1*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
635e1051a39Sopenharmony_ci &mov (@T[2],&swtmp(2)); 636e1051a39Sopenharmony_ci &mov (@T[3],&swtmp(3)); 637e1051a39Sopenharmony_ci 638e1051a39Sopenharmony_ci &mov ($idx,&wparam(0)); 639e1051a39Sopenharmony_ci &cmp ($idx,128); 640e1051a39Sopenharmony_ci &jne (&label("2nd256")); 641e1051a39Sopenharmony_ci 642e1051a39Sopenharmony_ci &mov ($key,&wparam(2)); 643e1051a39Sopenharmony_ci &lea ($key,&DWP(128,$key)); # size optimization 644e1051a39Sopenharmony_ci 645e1051a39Sopenharmony_ci ####### process KA 646e1051a39Sopenharmony_ci &_saveround (2,$key,-128,@T); # KA<<<0 647e1051a39Sopenharmony_ci &_rotl128 (@T,15,6,@T); # KA<<<15 648e1051a39Sopenharmony_ci &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30) 649e1051a39Sopenharmony_ci &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45) 650e1051a39Sopenharmony_ci &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60) 651e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 652e1051a39Sopenharmony_ci &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94) 653e1051a39Sopenharmony_ci &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111) 654e1051a39Sopenharmony_ci 655e1051a39Sopenharmony_ci ####### process KL 656e1051a39Sopenharmony_ci &_loadround (0,$key,-128,@T); # load KL 657e1051a39Sopenharmony_ci &_rotl128 (@T,15,4,@T); # KL<<<15 658e1051a39Sopenharmony_ci &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45) 659e1051a39Sopenharmony_ci &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60) 660e1051a39Sopenharmony_ci &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77) 661e1051a39Sopenharmony_ci &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94) 662e1051a39Sopenharmony_ci &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111) 663e1051a39Sopenharmony_ci 664e1051a39Sopenharmony_ci while (@T[0] ne "eax") # restore order 665e1051a39Sopenharmony_ci { unshift (@T,pop(@T)); } 666e1051a39Sopenharmony_ci 667e1051a39Sopenharmony_ci &mov ("eax",3); # 3 grandRounds 668e1051a39Sopenharmony_ci &jmp (&label("done")); 669e1051a39Sopenharmony_ci 670e1051a39Sopenharmony_ci&set_label("2nd256",16); 
671e1051a39Sopenharmony_ci &mov ($idx,&wparam(2)); 672e1051a39Sopenharmony_ci &_saveround (6,$idx,@T); # temporary storage for KA! 673e1051a39Sopenharmony_ci 674e1051a39Sopenharmony_ci &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR 675e1051a39Sopenharmony_ci &xor (@T[1],&DWP(4*8+4,$idx)); 676e1051a39Sopenharmony_ci &xor (@T[2],&DWP(5*8+0,$idx)); 677e1051a39Sopenharmony_ci &xor (@T[3],&DWP(5*8+4,$idx)); 678e1051a39Sopenharmony_ci 679e1051a39Sopenharmony_ci &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8] 680e1051a39Sopenharmony_ci &mov (&swtmp(0),@T[0]); # save s[0-3] 681e1051a39Sopenharmony_ci &mov (&swtmp(1),@T[1]); 682e1051a39Sopenharmony_ci &mov (&swtmp(2),@T[2]); 683e1051a39Sopenharmony_ci &mov (&swtmp(3),@T[3]); 684e1051a39Sopenharmony_ci &Camellia_Feistel($step++); 685e1051a39Sopenharmony_ci &Camellia_Feistel($step++); 686e1051a39Sopenharmony_ci &mov (@T[2],&swtmp(2)); 687e1051a39Sopenharmony_ci &mov (@T[3],&swtmp(3)); 688e1051a39Sopenharmony_ci 689e1051a39Sopenharmony_ci &mov ($key,&wparam(2)); 690e1051a39Sopenharmony_ci &lea ($key,&DWP(128,$key)); # size optimization 691e1051a39Sopenharmony_ci 692e1051a39Sopenharmony_ci ####### process KB 693e1051a39Sopenharmony_ci &_saveround (2,$key,-128,@T); # KB<<<0 694e1051a39Sopenharmony_ci &_rotl128 (@T,30,10,@T); # KB<<<30 695e1051a39Sopenharmony_ci &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60) 696e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 697e1051a39Sopenharmony_ci &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111) 698e1051a39Sopenharmony_ci 699e1051a39Sopenharmony_ci ####### process KR 700e1051a39Sopenharmony_ci &_loadround (4,$key,-128,@T); # load KR 701e1051a39Sopenharmony_ci &_rotl128 (@T,15,4,@T); # KR<<<15 702e1051a39Sopenharmony_ci &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30) 703e1051a39Sopenharmony_ci &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60) 704e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 705e1051a39Sopenharmony_ci &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94) 
706e1051a39Sopenharmony_ci 707e1051a39Sopenharmony_ci ####### process KA 708e1051a39Sopenharmony_ci &_loadround (6,$key,-128,@T); # load KA 709e1051a39Sopenharmony_ci &_rotl128 (@T,15,6,@T); # KA<<<15 710e1051a39Sopenharmony_ci &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45) 711e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 712e1051a39Sopenharmony_ci &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77) 713e1051a39Sopenharmony_ci &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94) 714e1051a39Sopenharmony_ci 715e1051a39Sopenharmony_ci ####### process KL 716e1051a39Sopenharmony_ci &_loadround (0,$key,-128,@T); # load KL 717e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 718e1051a39Sopenharmony_ci &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45) 719e1051a39Sopenharmony_ci &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60) 720e1051a39Sopenharmony_ci &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77) 721e1051a39Sopenharmony_ci push (@T,shift(@T)); # rotl128(@T,32); 722e1051a39Sopenharmony_ci &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111) 723e1051a39Sopenharmony_ci 724e1051a39Sopenharmony_ci while (@T[0] ne "eax") # restore order 725e1051a39Sopenharmony_ci { unshift (@T,pop(@T)); } 726e1051a39Sopenharmony_ci 727e1051a39Sopenharmony_ci &mov ("eax",4); # 4 grandRounds 728e1051a39Sopenharmony_ci&set_label("done"); 729e1051a39Sopenharmony_ci &lea ("edx",&DWP(272-128,$key)); # end of key schedule 730e1051a39Sopenharmony_ci &stack_pop(4); 731e1051a39Sopenharmony_ci} 732e1051a39Sopenharmony_ci&function_end("Camellia_Ekeygen"); 733e1051a39Sopenharmony_ci 734e1051a39Sopenharmony_ciif ($OPENSSL) { 735e1051a39Sopenharmony_ci# int Camellia_set_key ( 736e1051a39Sopenharmony_ci# const unsigned char *userKey, 737e1051a39Sopenharmony_ci# int bits, 738e1051a39Sopenharmony_ci# CAMELLIA_KEY *key) 739e1051a39Sopenharmony_ci&function_begin_B("Camellia_set_key"); 740e1051a39Sopenharmony_ci &push ("ebx"); 741e1051a39Sopenharmony_ci &mov ("ecx",&wparam(0)); # pull arguments 742e1051a39Sopenharmony_ci &mov 
("ebx",&wparam(1)); 743e1051a39Sopenharmony_ci &mov ("edx",&wparam(2)); 744e1051a39Sopenharmony_ci 745e1051a39Sopenharmony_ci &mov ("eax",-1); 746e1051a39Sopenharmony_ci &test ("ecx","ecx"); 747e1051a39Sopenharmony_ci &jz (&label("done")); # userKey==NULL? 748e1051a39Sopenharmony_ci &test ("edx","edx"); 749e1051a39Sopenharmony_ci &jz (&label("done")); # key==NULL? 750e1051a39Sopenharmony_ci 751e1051a39Sopenharmony_ci &mov ("eax",-2); 752e1051a39Sopenharmony_ci &cmp ("ebx",256); 753e1051a39Sopenharmony_ci &je (&label("arg_ok")); # bits==256? 754e1051a39Sopenharmony_ci &cmp ("ebx",192); 755e1051a39Sopenharmony_ci &je (&label("arg_ok")); # bits==192? 756e1051a39Sopenharmony_ci &cmp ("ebx",128); 757e1051a39Sopenharmony_ci &jne (&label("done")); # bits!=128? 758e1051a39Sopenharmony_ci&set_label("arg_ok",4); 759e1051a39Sopenharmony_ci 760e1051a39Sopenharmony_ci &push ("edx"); # push arguments 761e1051a39Sopenharmony_ci &push ("ecx"); 762e1051a39Sopenharmony_ci &push ("ebx"); 763e1051a39Sopenharmony_ci &call ("Camellia_Ekeygen"); 764e1051a39Sopenharmony_ci &stack_pop(3); 765e1051a39Sopenharmony_ci 766e1051a39Sopenharmony_ci # eax holds grandRounds and edx points at where to put it 767e1051a39Sopenharmony_ci &mov (&DWP(0,"edx"),"eax"); 768e1051a39Sopenharmony_ci &xor ("eax","eax"); 769e1051a39Sopenharmony_ci&set_label("done",4); 770e1051a39Sopenharmony_ci &pop ("ebx"); 771e1051a39Sopenharmony_ci &ret (); 772e1051a39Sopenharmony_ci&function_end_B("Camellia_set_key"); 773e1051a39Sopenharmony_ci} 774e1051a39Sopenharmony_ci 775e1051a39Sopenharmony_ci@SBOX=( 776e1051a39Sopenharmony_ci112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65, 777e1051a39Sopenharmony_ci 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189, 778e1051a39Sopenharmony_ci134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26, 779e1051a39Sopenharmony_ci166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77, 780e1051a39Sopenharmony_ci139, 13,154,102,251,204,176, 45,116, 
18, 43, 32,240,177,132,153, 781e1051a39Sopenharmony_ci223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215, 782e1051a39Sopenharmony_ci 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34, 783e1051a39Sopenharmony_ci254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80, 784e1051a39Sopenharmony_ci170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210, 785e1051a39Sopenharmony_ci 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148, 786e1051a39Sopenharmony_ci135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226, 787e1051a39Sopenharmony_ci 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46, 788e1051a39Sopenharmony_ci233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89, 789e1051a39Sopenharmony_ci120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250, 790e1051a39Sopenharmony_ci114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164, 791e1051a39Sopenharmony_ci 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158); 792e1051a39Sopenharmony_ci 793e1051a39Sopenharmony_cisub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; } 794e1051a39Sopenharmony_cisub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; } 795e1051a39Sopenharmony_cisub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; } 796e1051a39Sopenharmony_cisub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; } 797e1051a39Sopenharmony_ci 798e1051a39Sopenharmony_ci&set_label("Camellia_SIGMA",64); 799e1051a39Sopenharmony_ci&data_word( 800e1051a39Sopenharmony_ci 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2, 801e1051a39Sopenharmony_ci 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c, 802e1051a39Sopenharmony_ci 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd, 803e1051a39Sopenharmony_ci 0, 0, 0, 0); 804e1051a39Sopenharmony_ci&set_label("Camellia_SBOX",64); 805e1051a39Sopenharmony_ci# tables are interleaved, remember? 
# emit the lookup tables: 256 (S1110,S4404) pairs followed by 256
# (S0222,S3033) pairs, 4096 bytes in total
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
#
# Notes for readers of the generator code below:
# - Operands written as ($reg eq "ecx" ? $reg : "") yield an empty
#   operand unless the variable names the register that the following
#   string instruction uses implicitly (ecx/esi/edi) - presumably so
#   that a changed register assignment fails visibly; TODO confirm.
# - data_word(0xA4F3F689) / data_word(0xAAF3F689) hand-encode the string
#   instructions named in the adjacent comments; stored little-endian
#   the bytes are 89 F6 (mov %esi,%esi) followed by F3 A4 (rep movsb)
#   or F3 AA (rep stosb).
{
# stack frame layout
#             -4(%esp)		# return address	 0(%esp)
#              0(%esp)		# s0			 4(%esp)
#              4(%esp)		# s1			 8(%esp)
#              8(%esp)		# s2			12(%esp)
#             12(%esp)		# s3			16(%esp)
#             16(%esp)		# end of key schedule	20(%esp)
#             20(%esp)		# %esp backup
my $_inp=&DWP(24,"esp");	#copy of wparam(0)
my $_out=&DWP(28,"esp");	#copy of wparam(1)
my $_len=&DWP(32,"esp");	#copy of wparam(2)
my $_key=&DWP(36,"esp");	#copy of wparam(3)
my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
my $ivec=&DWP(44,"esp");	#ivec[16]
my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
my ($s0,$s1,$s2,$s3) = @T;

&function_begin("Camellia_cbc_encrypt");
	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
	&cmp	($s2,0);
	&je	(&label("enc_out"));

	&pushf	();
	&cld	();

	&mov	($s0,&wparam(0));			# load inp
	&mov	($s1,&wparam(1));			# load out
	#&mov	($s2,&wparam(2));			# load len
	&mov	($s3,&wparam(3));			# load key
	&mov	($Tbl,&wparam(4));			# load ivp

	# allocate aligned stack frame...
	&lea	($idx,&DWP(-64,"esp"));
	&and	($idx,-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	($key,&DWP(-64-63,$s3));
	&sub	($key,$idx);
	&neg	($key);
	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	($idx,$key);

	&mov	($key,&wparam(5));			# load enc

	&exch	("esp",$idx);
	&add	("esp",4);		# reserve for return address!
	&mov	($_esp,$idx);		# save %esp

	&mov	($_inp,$s0);		# save copy of inp
	&mov	($_out,$s1);		# save copy of out
	&mov	($_len,$s2);		# save copy of len
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$Tbl);		# save copy of ivp

	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	# touch the whole 4096-byte table: 32 iterations x 128 bytes,
	# one load per 32 bytes
	&mov	($idx,32);
	&set_label("prefetch_sbox",4);
		&mov	($s0,&DWP(0,$Tbl));
		&mov	($s1,&DWP(32,$Tbl));
		&mov	($s2,&DWP(64,$Tbl));
		&mov	($s3,&DWP(96,$Tbl));
		&lea	($Tbl,&DWP(128,$Tbl));
		&dec	($idx);
	&jnz	(&label("prefetch_sbox"));
	&mov	($s0,$_key);
	&sub	($Tbl,4096);		# rewind to start of the table
	&mov	($idx,$_inp);
	&mov	($s3,&DWP(272,$s0));		# load grandRounds

	&cmp	($key,0);
	&je	(&label("DECRYPT"));

	&mov	($s2,$_len);
	&mov	($key,$_ivp);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s3);

	&test	($s2,0xFFFFFFF0);
	&jz	(&label("enc_tail"));		# short input...

	&mov	($s0,&DWP(0,$key));		# load iv
	&mov	($s1,&DWP(4,$key));

	&set_label("enc_loop",4);
		&mov	($s2,&DWP(8,$key));
		&mov	($s3,&DWP(12,$key));

		&xor	($s0,&DWP(0,$idx));	# xor input data
		&xor	($s1,&DWP(4,$idx));
		&xor	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&xor	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_encrypt");

		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&mov	(&DWP(0,$key),$s0);	# save output data
		&bswap	($s3);
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($s2,$_len);		# load len

		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($s3,&DWP(16,$key));
		&mov	($_out,$s3);		# save out

		&sub	($s2,16);
		&test	($s2,0xFFFFFFF0);
		&mov	($_len,$s2);		# save len
	&jnz	(&label("enc_loop"));
	&test	($s2,15);
	&jnz	(&label("enc_tail"));
	&mov	($idx,$_ivp);		# load ivp
	&mov	($s2,&DWP(8,$key));	# restore last dwords
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# save ivec
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);

	&mov	("esp",$_esp);
	&popf	();
    &set_label("enc_out");
	&function_end_A();
	&pushf	();			# kludge, never executed

    &set_label("enc_tail",4);
	&mov	($s0,$key eq "edi" ? $key : "");
	&mov	($key,$_out);			# load out
	&push	($s0);				# push ivp
	&mov	($s1,16);
	&sub	($s1,$s2);
	&cmp	($key,$idx);			# compare with inp
	&je	(&label("enc_in_place"));
	&align	(4);
	&data_word(0xA4F3F689);	# rep movsb	# copy input
	&jmp	(&label("enc_skip_in_place"));
    &set_label("enc_in_place");
	&lea	($key,&DWP(0,$key,$s2));
    &set_label("enc_skip_in_place");
	&mov	($s2,$s1);
	&xor	($s0,$s0);
	&align	(4);
	&data_word(0xAAF3F689);	# rep stosb	# zero tail
	&pop	($key);				# pop ivp

	&mov	($idx,$_out);			# output as input
	&mov	($s0,&DWP(0,$key));
	&mov	($s1,&DWP(4,$key));
	&mov	($_len,16);			# len=16
	&jmp	(&label("enc_loop"));		# one more spin...

#----------------------------- DECRYPT -----------------------------#
&set_label("DECRYPT",16);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s0);
	&mov	($_key,$s3);

	&cmp	($idx,$_out);
	&je	(&label("dec_in_place"));	# in-place processing...

	&mov	($key,$_ivp);			# load ivp
	&mov	($_tmp,$key);

	&set_label("dec_loop",4);
		&mov	($s0,&DWP(0,$idx));	# read input
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&mov	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_decrypt");

		&mov	($key,$_tmp);		# load ivp
		&mov	($idx,$_len);		# load len

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&xor	($s0,&DWP(0,$key));	# xor iv
		&bswap	($s3);
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&sub	($idx,16);
		&jc	(&label("dec_partial"));
		&mov	($_len,$idx);		# save len
		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&mov	(&DWP(0,$key),$s0);	# write output
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($_tmp,$idx);		# save ivp
		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($key,&DWP(16,$key));
		&mov	($_out,$key);		# save out

	# flags are still those of the "sub" above: mov and lea leave
	# them untouched
	&jnz	(&label("dec_loop"));
	&mov	($key,$_tmp);		# load temp ivp
    &set_label("dec_end");
	&mov	($idx,$_ivp);		# load user ivp
	&mov	($s0,&DWP(0,$key));	# load iv
	&mov	($s1,&DWP(4,$key));
	&mov	($s2,&DWP(8,$key));
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# copy back to user
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);
	&jmp	(&label("dec_out"));

    &set_label("dec_partial",4);
	&lea	($key,$ivec);
	&mov	(&DWP(0,$key),$s0);	# dump output to stack
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&mov	(&DWP(12,$key),$s3);
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
	&mov	($idx eq "esi" ? $idx : "",$key);
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
	&data_word(0xA4F3F689);	# rep movsb		# copy output
	&mov	($key,$_inp);				# use inp as temp ivp
	&jmp	(&label("dec_end"));

    &set_label("dec_in_place",4);
	&set_label("dec_in_place_loop");
	&lea	($key,$ivec);
	&mov	($s0,&DWP(0,$idx));	# read input
	&mov	($s1,&DWP(4,$idx));
	&mov	($s2,&DWP(8,$idx));
	&mov	($s3,&DWP(12,$idx));

	&mov	(&DWP(0,$key),$s0);	# copy to temp
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&bswap	($s0);
	&mov	(&DWP(12,$key),$s3);
	&bswap	($s1);
	&mov	($key,$_key);		# load key
	&bswap	($s2);
	&bswap	($s3);

	&call	("_x86_Camellia_decrypt");

	&mov	($key,$_ivp);		# load ivp
	&mov	($idx,$_out);		# load out

	&bswap	($s0);
	&bswap	($s1);
	&bswap	($s2);
	&xor	($s0,&DWP(0,$key));	# xor iv
	&bswap	($s3);
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	(&DWP(0,$idx),$s0);	# write output
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);

	&lea	($idx,&DWP(16,$idx));
	&mov	($_out,$idx);		# save out

	&lea	($idx,$ivec);
	&mov	($s0,&DWP(0,$idx));	# read temp
	&mov	($s1,&DWP(4,$idx));
	&mov	($s2,&DWP(8,$idx));
	&mov	($s3,&DWP(12,$idx));

	&mov	(&DWP(0,$key),$s0);	# copy iv
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&mov	(&DWP(12,$key),$s3);

	&mov	($idx,$_inp);		# load inp

	&lea	($idx,&DWP(16,$idx));
	&mov	($_inp,$idx);		# save inp

	&mov	($s2,$_len);		# load len
	&sub	($s2,16);
	&jc	(&label("dec_in_place_partial"));
	&mov	($_len,$s2);		# save len
	&jnz	(&label("dec_in_place_loop"));
	&jmp	(&label("dec_out"));

    &set_label("dec_in_place_partial",4);
	# one can argue if this is actually required...
	&mov	($key eq "edi" ? $key : "",$_out);
	&lea	($idx eq "esi" ? $idx : "",$ivec);
	&lea	($key,&DWP(0,$key,$s2));
	&lea	($idx,&DWP(16,$idx,$s2));
	&neg	($s2 eq "ecx" ? $s2 : "");
	&data_word(0xA4F3F689);	# rep movsb	# restore tail

    &set_label("dec_out",4);
    &mov	("esp",$_esp);
    &popf	();
&function_end("Camellia_cbc_encrypt");
}

&asciz("Camellia for x86 by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";