#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#                       1xIALU/gcc      4xSSSE3
# Pentium               17.5/+80%
# PIII                  14.2/+60%
# P4                    18.6/+84%
# Core2                 9.56/+89%       4.83
# Westmere              9.50/+45%       3.35
# Sandy Bridge          10.5/+47%       3.20
# Haswell               8.15/+50%       2.83
# Skylake               7.53/+22%       2.75
# Silvermont            17.4/+36%       8.35
# Goldmont              13.4/+40%       4.36
# Sledgehammer          10.2/+54%
# Bulldozer             13.4/+50%       4.38(*)
#
# (*)   Bulldozer actually executes 4xXOP code path that delivers 3.55;

# Locate the directory this script lives in so that the perlasm helper
# module can be found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command line argument, if any, names the output file.  It is
# popped *before* asm_init() looks at $ARGV[$#ARGV] below, so the order of
# these statements matters.  Use 3-arg open and fail loudly: a silently
# failed open would leave the generated assembly going to the inherited
# stdout instead of the requested file.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $xmm - emit the SSSE3 code path (requested with -DOPENSSL_IA32_SSE2).
# $ymm - assembler is recent enough for AVX-era instructions; in the part
#        of the file visible here it only gates the XOP shortcut test.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
        `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
        ($gasver=$1)>=2.19);    # first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
        $1>=2.03);      # first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
        `ml 2>&1` =~ /Version ([0-9]+)\./ &&
        $1>=10);        # first version supporting AVX

$ymm=1 if ($xmm && !$ymm &&
        `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
        $2>=3.0);       # first version supporting AVX

# Register aliases for the integer code path.  Each of b, c and d has a
# "shadow" register (the trailing-underscore name); QUARTERROUND swaps the
# roles of a register and its shadow at the end of every call.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND(ai, bi, ci, di, i)
#
# Emit one ChaCha quarter round on state words ai/bi/ci/di (indexes 0..15
# into the working state kept at 4*k(%esp)).  i (0..7) is the position of
# this quarter round inside the double round generated by the caller; it
# decides which stores of the previous quarter round's results and which
# loads for the next one get interleaved (lines indented by one extra
# space) with the arithmetic of the current one.
#
# The first "add a,b" of each quarter round is not emitted here but by
# whoever ran before (caller for i==0, previous call otherwise) - see the
# lines marked "elsewhere".  For i==7 the slot that would pre-load the
# next "b" re-loads the round loop counter from 128(%esp) instead.
#
# Side effect: swaps the global register aliases ($b,$b_), ($c,$c_),
# ($d,$d_).  Returns nothing of interest.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14

    # At the seams between even and odd rounds the plain +1/-1 rotation
    # above does not give the neighbouring row of the table, so the
    # previous/next indexes are corrected per column.
    if ($i==0) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==3) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    } elsif ($i==4) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==7) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    }

    #&add   ($a,$b);                        # see elsewhere
    &xor    ($d,$a);
     &mov   (&DWP(4*$cp,"esp"),$c_)         if ($ai>0 && $ai<3);
    &rol    ($d,16);
     &mov   (&DWP(4*$bp,"esp"),$b_)         if ($i!=0);
    &add    ($c,$d);
     &mov   ($c_,&DWP(4*$cn,"esp"))         if ($ai>0 && $ai<3);
    &xor    ($b,$c);
     &mov   ($d_,&DWP(4*$dn,"esp"))         if ($di!=$dn);
    &rol    ($b,12);
     &mov   ($b_,&DWP(4*$bn,"esp"))         if ($i<7);
     &mov   ($b_,&DWP(128,"esp"))           if ($i==7);     # loop counter
    &add    ($a,$b);
    &xor    ($d,$a);
     &mov   (&DWP(4*$ai,"esp"),$a);
    &rol    ($d,8);
     &mov   ($a,&DWP(4*$an,"esp"));
    &add    ($c,$d);
     &mov   (&DWP(4*$di,"esp"),$d)          if ($di!=$dn);
     &mov   ($d_,$d)                        if ($di==$dn);
    &xor    ($b,$c);
     &add   ($a,$b_)                        if ($i<7);      # elsewhere
    &rol    ($b,7);

    # What was pre-loaded into the shadow registers becomes the live set
    # for the next quarter round.
    ($b,$b_)=($b_,$b);
    ($c,$c_)=($c_,$c);
    ($d,$d_)=($d_,$d);
}

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

# ChaCha20_ctr32 - integer-only code path and CPU capability dispatcher.
#
# Arguments as read through wparam(): 0 = output, 1 = input, 2 = length,
# 3 = key (eight 32-bit words are copied), 4 = counter and nonce (four
# 32-bit words are copied).
#
# Stack frame after stack_push(33), as used below:
#   4*k(%esp),    k=0..15  working state words not currently in registers
#   64+4*k(%esp), k=4..15  copy of key, counter and nonce
#   128(%esp)              round loop counter
&function_begin("ChaCha20_ctr32");
    &xor    ("eax","eax");
    &cmp    ("eax",&wparam(2));             # len==0?
    &je     (&label("no_data"));
if ($xmm) {
    # Leaves the address of pic_point in eax and the address of
    # OPENSSL_ia32cap_P in ebp; both are relied upon at ssse3_shortcut.
    &call   (&label("pic_point"));
&set_label("pic_point");
    &blindpop("eax");
    &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
    &test   (&DWP(0,"ebp"),1<<24);          # test FXSR bit
    &jz     (&label("x86"));
    &test   (&DWP(4,"ebp"),1<<9);           # test SSSE3 bit
    &jz     (&label("x86"));
    &jmp    (&label("ssse3_shortcut"));
&set_label("x86");
}
    &mov    ("esi",&wparam(3));             # key
    &mov    ("edi",&wparam(4));             # counter and nonce

    &stack_push(33);

    &mov    ("eax",&DWP(4*0,"esi"));        # copy key
    &mov    ("ebx",&DWP(4*1,"esi"));
    &mov    ("ecx",&DWP(4*2,"esi"));
    &mov    ("edx",&DWP(4*3,"esi"));
    &mov    (&DWP(64+4*4,"esp"),"eax");
    &mov    (&DWP(64+4*5,"esp"),"ebx");
    &mov    (&DWP(64+4*6,"esp"),"ecx");
    &mov    (&DWP(64+4*7,"esp"),"edx");
    &mov    ("eax",&DWP(4*4,"esi"));
    &mov    ("ebx",&DWP(4*5,"esi"));
    &mov    ("ecx",&DWP(4*6,"esi"));
    &mov    ("edx",&DWP(4*7,"esi"));
    &mov    (&DWP(64+4*8,"esp"),"eax");
    &mov    (&DWP(64+4*9,"esp"),"ebx");
    &mov    (&DWP(64+4*10,"esp"),"ecx");
    &mov    (&DWP(64+4*11,"esp"),"edx");
    &mov    ("eax",&DWP(4*0,"edi"));        # copy counter and nonce
    &mov    ("ebx",&DWP(4*1,"edi"));
    &mov    ("ecx",&DWP(4*2,"edi"));
    &mov    ("edx",&DWP(4*3,"edi"));
    &sub    ("eax",1);                      # pre-decrement: "entry" adds 1 back
    &mov    (&DWP(64+4*12,"esp"),"eax");
    &mov    (&DWP(64+4*13,"esp"),"ebx");
    &mov    (&DWP(64+4*14,"esp"),"ecx");
    &mov    (&DWP(64+4*15,"esp"),"edx");
    &jmp    (&label("entry"));

&set_label("outer_loop",16);
    &mov    (&wparam(1),$b);                # save input
    &mov    (&wparam(0),$a);                # save output
    &mov    (&wparam(2),$c);                # save len
&set_label("entry");
    # The four constants are "expa", "nd 3", "2-by", "te k" when read as
    # little-endian ASCII.
    &mov    ($a,0x61707865);
    &mov    (&DWP(4*1,"esp"),0x3320646e);
    &mov    (&DWP(4*2,"esp"),0x79622d32);
    &mov    (&DWP(4*3,"esp"),0x6b206574);

    &mov    ($b, &DWP(64+4*5,"esp"));       # copy key material
    &mov    ($b_,&DWP(64+4*6,"esp"));
    &mov    ($c, &DWP(64+4*10,"esp"));
    &mov    ($c_,&DWP(64+4*11,"esp"));
    &mov    ($d, &DWP(64+4*13,"esp"));
    &mov    ($d_,&DWP(64+4*14,"esp"));
    &mov    (&DWP(4*5,"esp"),$b);
    &mov    (&DWP(4*6,"esp"),$b_);
    &mov    (&DWP(4*10,"esp"),$c);
    &mov    (&DWP(4*11,"esp"),$c_);
    &mov    (&DWP(4*13,"esp"),$d);
    &mov    (&DWP(4*14,"esp"),$d_);

    &mov    ($b, &DWP(64+4*7,"esp"));
    &mov    ($d_,&DWP(64+4*15,"esp"));
    &mov    ($d, &DWP(64+4*12,"esp"));
    &mov    ($b_,&DWP(64+4*4,"esp"));
    &mov    ($c, &DWP(64+4*8,"esp"));
    &mov    ($c_,&DWP(64+4*9,"esp"));
    &add    ($d,1);                         # counter value
    &mov    (&DWP(4*7,"esp"),$b);
    &mov    (&DWP(4*15,"esp"),$d_);
    &mov    (&DWP(64+4*12,"esp"),$d);       # save counter value

    &mov    ($b,10);                        # loop counter
    &jmp    (&label("loop"));

&set_label("loop",16);
    &add    ($a,$b_);                       # elsewhere
    &mov    (&DWP(128,"esp"),$b);           # save loop counter
    &mov    ($b,$b_);
    &QUARTERROUND(0, 4, 8, 12, 0);
    &QUARTERROUND(1, 5, 9, 13, 1);
    &QUARTERROUND(2, 6,10, 14, 2);
    &QUARTERROUND(3, 7,11, 15, 3);
    &QUARTERROUND(0, 5,10, 15, 4);
    &QUARTERROUND(1, 6,11, 12, 5);
    &QUARTERROUND(2, 7, 8, 13, 6);
    &QUARTERROUND(3, 4, 9, 14, 7);
    &dec    ($b);
    &jnz    (&label("loop"));

    &mov    ($b,&wparam(2));                # load len

    &add    ($a,0x61707865);                # accumulate key material
    &add    ($b_,&DWP(64+4*4,"esp"));
    &add    ($c, &DWP(64+4*8,"esp"));
    &add    ($c_,&DWP(64+4*9,"esp"));

    &cmp    ($b,64);
    &jb     (&label("tail"));

    &mov    ($b,&wparam(1));                # load input pointer
    &add    ($d, &DWP(64+4*12,"esp"));
    &add    ($d_,&DWP(64+4*14,"esp"));

    &xor    ($a, &DWP(4*0,$b));             # xor with input
    &xor    ($b_,&DWP(4*4,$b));
    &mov    (&DWP(4*0,"esp"),$a);
    &mov    ($a,&wparam(0));                # load output pointer
    &xor    ($c, &DWP(4*8,$b));
    &xor    ($c_,&DWP(4*9,$b));
    &xor    ($d, &DWP(4*12,$b));
    &xor    ($d_,&DWP(4*14,$b));
    &mov    (&DWP(4*4,$a),$b_);             # write output
    &mov    (&DWP(4*8,$a),$c);
    &mov    (&DWP(4*9,$a),$c_);
    &mov    (&DWP(4*12,$a),$d);
    &mov    (&DWP(4*14,$a),$d_);

    &mov    ($b_,&DWP(4*1,"esp"));
    &mov    ($c, &DWP(4*2,"esp"));
    &mov    ($c_,&DWP(4*3,"esp"));
    &mov    ($d, &DWP(4*5,"esp"));
    &mov    ($d_,&DWP(4*6,"esp"));
    &add    ($b_,0x3320646e);               # accumulate key material
    &add    ($c, 0x79622d32);
    &add    ($c_,0x6b206574);
    &add    ($d, &DWP(64+4*5,"esp"));
    &add    ($d_,&DWP(64+4*6,"esp"));
    &xor    ($b_,&DWP(4*1,$b));
    &xor    ($c, &DWP(4*2,$b));
    &xor    ($c_,&DWP(4*3,$b));
    &xor    ($d, &DWP(4*5,$b));
    &xor    ($d_,&DWP(4*6,$b));
    &mov    (&DWP(4*1,$a),$b_);
    &mov    (&DWP(4*2,$a),$c);
    &mov    (&DWP(4*3,$a),$c_);
    &mov    (&DWP(4*5,$a),$d);
    &mov    (&DWP(4*6,$a),$d_);

    &mov    ($b_,&DWP(4*7,"esp"));
    &mov    ($c, &DWP(4*10,"esp"));
    &mov    ($c_,&DWP(4*11,"esp"));
    &mov    ($d, &DWP(4*13,"esp"));
    &mov    ($d_,&DWP(4*15,"esp"));
    &add    ($b_,&DWP(64+4*7,"esp"));
    &add    ($c, &DWP(64+4*10,"esp"));
    &add    ($c_,&DWP(64+4*11,"esp"));
    &add    ($d, &DWP(64+4*13,"esp"));
    &add    ($d_,&DWP(64+4*15,"esp"));
    &xor    ($b_,&DWP(4*7,$b));
    &xor    ($c, &DWP(4*10,$b));
    &xor    ($c_,&DWP(4*11,$b));
    &xor    ($d, &DWP(4*13,$b));
    &xor    ($d_,&DWP(4*15,$b));
    &lea    ($b,&DWP(4*16,$b));
    &mov    (&DWP(4*7,$a),$b_);
    &mov    ($b_,&DWP(4*0,"esp"));
    &mov    (&DWP(4*10,$a),$c);
    &mov    ($c,&wparam(2));                # len
    &mov    (&DWP(4*11,$a),$c_);
    &mov    (&DWP(4*13,$a),$d);
    &mov    (&DWP(4*15,$a),$d_);
    &mov    (&DWP(4*0,$a),$b_);
    &lea    ($a,&DWP(4*16,$a));
    &sub    ($c,64);
    &jnz    (&label("outer_loop"));

    &jmp    (&label("done"));

    # Fewer than 64 bytes left: finish the key stream block in the stack
    # frame, then xor it into the output one byte at a time.
&set_label("tail");
    &add    ($d, &DWP(64+4*12,"esp"));
    &add    ($d_,&DWP(64+4*14,"esp"));
    &mov    (&DWP(4*0,"esp"),$a);
    &mov    (&DWP(4*4,"esp"),$b_);
    &mov    (&DWP(4*8,"esp"),$c);
    &mov    (&DWP(4*9,"esp"),$c_);
    &mov    (&DWP(4*12,"esp"),$d);
    &mov    (&DWP(4*14,"esp"),$d_);

    &mov    ($b_,&DWP(4*1,"esp"));
    &mov    ($c, &DWP(4*2,"esp"));
    &mov    ($c_,&DWP(4*3,"esp"));
    &mov    ($d, &DWP(4*5,"esp"));
    &mov    ($d_,&DWP(4*6,"esp"));
    &add    ($b_,0x3320646e);               # accumulate key material
    &add    ($c, 0x79622d32);
    &add    ($c_,0x6b206574);
    &add    ($d, &DWP(64+4*5,"esp"));
    &add    ($d_,&DWP(64+4*6,"esp"));
    &mov    (&DWP(4*1,"esp"),$b_);
    &mov    (&DWP(4*2,"esp"),$c);
    &mov    (&DWP(4*3,"esp"),$c_);
    &mov    (&DWP(4*5,"esp"),$d);
    &mov    (&DWP(4*6,"esp"),$d_);

    &mov    ($b_,&DWP(4*7,"esp"));
    &mov    ($c, &DWP(4*10,"esp"));
    &mov    ($c_,&DWP(4*11,"esp"));
    &mov    ($d, &DWP(4*13,"esp"));
    &mov    ($d_,&DWP(4*15,"esp"));
    &add    ($b_,&DWP(64+4*7,"esp"));
    &add    ($c, &DWP(64+4*10,"esp"));
    &add    ($c_,&DWP(64+4*11,"esp"));
    &add    ($d, &DWP(64+4*13,"esp"));
    &add    ($d_,&DWP(64+4*15,"esp"));
    &mov    (&DWP(4*7,"esp"),$b_);
    &mov    ($b_,&wparam(1));               # load input
    &mov    (&DWP(4*10,"esp"),$c);
    &mov    ($c,&wparam(0));                # load output
    &mov    (&DWP(4*11,"esp"),$c_);
    &xor    ($c_,$c_);
    &mov    (&DWP(4*13,"esp"),$d);
    &mov    (&DWP(4*15,"esp"),$d_);

    &xor    ("eax","eax");
    &xor    ("edx","edx");
&set_label("tail_loop");
    &movb   ("al",&BP(0,$c_,$b_));
    &movb   ("dl",&BP(0,"esp",$c_));
    &lea    ($c_,&DWP(1,$c_));
    &xor    ("al","dl");
    &mov    (&BP(-1,$c,$c_),"al");
    &dec    ($b);
    &jnz    (&label("tail_loop"));

&set_label("done");
    &stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

if ($xmm) {
# Register aliases for the SSSE3 code path.  As in the integer path, every
# row register has a shadow (trailing underscore) and QUARTERROUND_SSSE3
# swaps the pairs at the end of each call.
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_SSSE3(ai, bi, ci, di, i)
#
# SIMD counterpart of QUARTERROUND: same index arguments and the same
# previous/next bookkeeping, but the state lives in 16-byte slots at
# 16*k-128(%ebx) and the 16- and 8-bit rotations are done with pshufb
# using the masks at 0(%eax) and 16(%eax).  The 12- and 7-bit rotations
# are built from pslld/psrld/por and borrow $xa_ resp. $xa as scratch.
#
# The leading "paddd a,b" and "pxor d,a" of each quarter round are emitted
# by whoever ran before (caller for i==0, previous call otherwise) - see
# the lines marked "elsewhere".
#
# Side effect: swaps all four register/shadow alias pairs.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14

    if ($i==0) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==3) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    } elsif ($i==4) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==7) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    }

    #&paddd     ($xa,$xb);                          # see elsewhere
    #&pxor      ($xd,$xa);                          # see elsewhere
     &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)           if ($ai>0 && $ai<3);
    &pshufb     ($xd,&QWP(0,"eax"));                # rot16
     &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)           if ($i!=0);
    &paddd      ($xc,$xd);
     &movdqa($xc_,&QWP(16*$cn-128,"ebx"))           if ($ai>0 && $ai<3);
    &pxor       ($xb,$xc);
     &movdqa($xb_,&QWP(16*$bn-128,"ebx"))           if ($i<7);
    &movdqa     ($xa_,$xb);                         # borrow as temporary
    &pslld      ($xb,12);
    &psrld      ($xa_,20);
    &por        ($xb,$xa_);
     &movdqa($xa_,&QWP(16*$an-128,"ebx"));
    &paddd      ($xa,$xb);
     &movdqa($xd_,&QWP(16*$dn-128,"ebx"))           if ($di!=$dn);
    &pxor       ($xd,$xa);
     &movdqa    (&QWP(16*$ai-128,"ebx"),$xa);
    &pshufb     ($xd,&QWP(16,"eax"));               # rot8
    &paddd      ($xc,$xd);
     &movdqa    (&QWP(16*$di-128,"ebx"),$xd)        if ($di!=$dn);
     &movdqa    ($xd_,$xd)                          if ($di==$dn);
    &pxor       ($xb,$xc);
     &paddd     ($xa_,$xb_)                         if ($i<7);      # elsewhere
    &movdqa     ($xa,$xb);                          # borrow as temporary
    &pslld      ($xb,7);
    &psrld      ($xa,25);
     &pxor      ($xd_,$xa_)                         if ($i<7);      # elsewhere
    &por        ($xb,$xa);

    ($xa,$xa_)=($xa_,$xa);
    ($xb,$xb_)=($xb_,$xb);
    ($xc,$xc_)=($xc_,$xc);
    ($xd,$xd_)=($xd_,$xd);
}

&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
if ($ymm) {
    &test       (&DWP(4,"ebp"),1<<11);              # test XOP bit
    &jnz        (&label("xop_shortcut"));
}

    &mov        ($out,&wparam(0));
    &mov        ($inp,&wparam(1));
    &mov        ($len,&wparam(2));
    &mov        ("edx",&wparam(3));                 # key
    &mov        ("ebx",&wparam(4));                 # counter and nonce

    &mov        ("ebp","esp");
    &stack_push (131);
    &and        ("esp",-64);
    &mov        (&DWP(512,"esp"),"ebp");

    &lea        ("eax",&DWP(&label("ssse3_data")."-".
                            &label("pic_point"),"eax"));
    &movdqu     ("xmm3",&QWP(0,"ebx"));             # counter and nonce

if (defined($gasver) && $gasver>=2.17) {            # even though we encode
                                                    # pshufb manually, we
                                                    # handle only register
                                                    # operands, while this
                                                    # segment uses memory
                                                    # operand...
474e1051a39Sopenharmony_ci &cmp ($len,64*4); 475e1051a39Sopenharmony_ci &jb (&label("1x")); 476e1051a39Sopenharmony_ci 477e1051a39Sopenharmony_ci &mov (&DWP(512+4,"esp"),"edx"); # offload pointers 478e1051a39Sopenharmony_ci &mov (&DWP(512+8,"esp"),"ebx"); 479e1051a39Sopenharmony_ci &sub ($len,64*4); # bias len 480e1051a39Sopenharmony_ci &lea ("ebp",&DWP(256+128,"esp")); # size optimization 481e1051a39Sopenharmony_ci 482e1051a39Sopenharmony_ci &movdqu ("xmm7",&QWP(0,"edx")); # key 483e1051a39Sopenharmony_ci &pshufd ("xmm0","xmm3",0x00); 484e1051a39Sopenharmony_ci &pshufd ("xmm1","xmm3",0x55); 485e1051a39Sopenharmony_ci &pshufd ("xmm2","xmm3",0xaa); 486e1051a39Sopenharmony_ci &pshufd ("xmm3","xmm3",0xff); 487e1051a39Sopenharmony_ci &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters 488e1051a39Sopenharmony_ci &pshufd ("xmm4","xmm7",0x00); 489e1051a39Sopenharmony_ci &pshufd ("xmm5","xmm7",0x55); 490e1051a39Sopenharmony_ci &psubd ("xmm0",&QWP(16*4,"eax")); 491e1051a39Sopenharmony_ci &pshufd ("xmm6","xmm7",0xaa); 492e1051a39Sopenharmony_ci &pshufd ("xmm7","xmm7",0xff); 493e1051a39Sopenharmony_ci &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); 494e1051a39Sopenharmony_ci &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); 495e1051a39Sopenharmony_ci &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); 496e1051a39Sopenharmony_ci &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); 497e1051a39Sopenharmony_ci &movdqu ("xmm3",&QWP(16,"edx")); # key 498e1051a39Sopenharmony_ci &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); 499e1051a39Sopenharmony_ci &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); 500e1051a39Sopenharmony_ci &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); 501e1051a39Sopenharmony_ci &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); 502e1051a39Sopenharmony_ci &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma 503e1051a39Sopenharmony_ci &lea ("ebx",&DWP(128,"esp")); # size optimization 504e1051a39Sopenharmony_ci 505e1051a39Sopenharmony_ci &pshufd ("xmm0","xmm3",0x00); 506e1051a39Sopenharmony_ci &pshufd ("xmm1","xmm3",0x55); 507e1051a39Sopenharmony_ci &pshufd 
("xmm2","xmm3",0xaa); 508e1051a39Sopenharmony_ci &pshufd ("xmm3","xmm3",0xff); 509e1051a39Sopenharmony_ci &pshufd ("xmm4","xmm7",0x00); 510e1051a39Sopenharmony_ci &pshufd ("xmm5","xmm7",0x55); 511e1051a39Sopenharmony_ci &pshufd ("xmm6","xmm7",0xaa); 512e1051a39Sopenharmony_ci &pshufd ("xmm7","xmm7",0xff); 513e1051a39Sopenharmony_ci &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); 514e1051a39Sopenharmony_ci &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); 515e1051a39Sopenharmony_ci &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); 516e1051a39Sopenharmony_ci &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); 517e1051a39Sopenharmony_ci &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); 518e1051a39Sopenharmony_ci &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); 519e1051a39Sopenharmony_ci &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); 520e1051a39Sopenharmony_ci &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); 521e1051a39Sopenharmony_ci 522e1051a39Sopenharmony_ci &lea ($inp,&DWP(128,$inp)); # size optimization 523e1051a39Sopenharmony_ci &lea ($out,&DWP(128,$out)); # size optimization 524e1051a39Sopenharmony_ci &jmp (&label("outer_loop")); 525e1051a39Sopenharmony_ci 526e1051a39Sopenharmony_ci&set_label("outer_loop",16); 527e1051a39Sopenharmony_ci #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material 528e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(16*1-128,"ebp")); 529e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(16*2-128,"ebp")); 530e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(16*3-128,"ebp")); 531e1051a39Sopenharmony_ci #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); 532e1051a39Sopenharmony_ci &movdqa ("xmm5",&QWP(16*5-128,"ebp")); 533e1051a39Sopenharmony_ci &movdqa ("xmm6",&QWP(16*6-128,"ebp")); 534e1051a39Sopenharmony_ci &movdqa ("xmm7",&QWP(16*7-128,"ebp")); 535e1051a39Sopenharmony_ci #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); 536e1051a39Sopenharmony_ci &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); 537e1051a39Sopenharmony_ci &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); 538e1051a39Sopenharmony_ci &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); 539e1051a39Sopenharmony_ci 
#&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); 540e1051a39Sopenharmony_ci &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); 541e1051a39Sopenharmony_ci &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); 542e1051a39Sopenharmony_ci &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); 543e1051a39Sopenharmony_ci #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); 544e1051a39Sopenharmony_ci #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); 545e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(16*10-128,"ebp")); 546e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(16*11-128,"ebp")); 547e1051a39Sopenharmony_ci &movdqa ("xmm4",&QWP(16*12-128,"ebp")); 548e1051a39Sopenharmony_ci &movdqa ("xmm5",&QWP(16*13-128,"ebp")); 549e1051a39Sopenharmony_ci &movdqa ("xmm6",&QWP(16*14-128,"ebp")); 550e1051a39Sopenharmony_ci &movdqa ("xmm7",&QWP(16*15-128,"ebp")); 551e1051a39Sopenharmony_ci &paddd ("xmm4",&QWP(16*4,"eax")); # counter value 552e1051a39Sopenharmony_ci #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); 553e1051a39Sopenharmony_ci #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); 554e1051a39Sopenharmony_ci &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); 555e1051a39Sopenharmony_ci &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); 556e1051a39Sopenharmony_ci &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); 557e1051a39Sopenharmony_ci &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); 558e1051a39Sopenharmony_ci &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); 559e1051a39Sopenharmony_ci &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); 560e1051a39Sopenharmony_ci &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value 561e1051a39Sopenharmony_ci 562e1051a39Sopenharmony_ci &movdqa ($xa, &QWP(16*0-128,"ebp")); 563e1051a39Sopenharmony_ci &movdqa ($xd, "xmm4"); 564e1051a39Sopenharmony_ci &movdqa ($xb_,&QWP(16*4-128,"ebp")); 565e1051a39Sopenharmony_ci &movdqa ($xc, &QWP(16*8-128,"ebp")); 566e1051a39Sopenharmony_ci &movdqa ($xc_,&QWP(16*9-128,"ebp")); 567e1051a39Sopenharmony_ci 568e1051a39Sopenharmony_ci &mov ("edx",10); # loop counter 569e1051a39Sopenharmony_ci &nop (); 570e1051a39Sopenharmony_ci 
# SSSE3 4x path: round loop.  "edx" counts ten iterations, each issuing
# eight quarter-rounds (four with the column index pattern, four with the
# diagonal one).  The first paddd/pxor of the first quarter-round is
# emitted here rather than inside QUARTERROUND_SSSE3 ("elsewhere").
&set_label("loop",16);
    &paddd      ($xa,$xb_);                         # elsewhere
    &movdqa     ($xb,$xb_);
    &pxor       ($xd,$xa);                          # elsewhere
    &QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
    &QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
    &QUARTERROUND_SSSE3(2, 6,10, 14, 2);
    &QUARTERROUND_SSSE3(3, 7,11, 15, 3);
    &QUARTERROUND_SSSE3(0, 5,10, 15, 4);
    &QUARTERROUND_SSSE3(1, 6,11, 12, 5);
    &QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
    &QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
    &dec        ("edx");
    &jnz        (&label("loop"));

    # Spill the rows that are still held in registers to the working
    # block at "ebx".
    &movdqa     (&QWP(16*4-128,"ebx"),$xb_);
    &movdqa     (&QWP(16*8-128,"ebx"),$xc);
    &movdqa     (&QWP(16*9-128,"ebx"),$xc_);
    &movdqa     (&QWP(16*12-128,"ebx"),$xd);
    &movdqa     (&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

    #&movdqa    ($xa0,&QWP(16*0-128,"ebx"));        # it's there
    &movdqa     ($xa1,&QWP(16*1-128,"ebx"));
    &movdqa     ($xa2,&QWP(16*2-128,"ebx"));
    &movdqa     ($xa3,&QWP(16*3-128,"ebx"));

    # Generator-time loop, unrolled four times (one pass per group of
    # four rows): add the saved input state, transpose, xor with the
    # input and store.  For the first three passes the next four rows
    # are loaded while the current ones are being xor-ed, and $inp/$out
    # advance by 16; the last pass advances them by 64*4-16*3.
    for($i=0;$i<256;$i+=64) {
        &paddd      ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
        &paddd      ($xa1,&QWP($i+16*1-128,"ebp"));
        &paddd      ($xa2,&QWP($i+16*2-128,"ebp"));
        &paddd      ($xa3,&QWP($i+16*3-128,"ebp"));

        &movdqa     ($xt2,$xa0);                    # "de-interlace" data
        &punpckldq  ($xa0,$xa1);
        &movdqa     ($xt3,$xa2);
        &punpckldq  ($xa2,$xa3);
        &punpckhdq  ($xt2,$xa1);
        &punpckhdq  ($xt3,$xa3);
        &movdqa     ($xa1,$xa0);
        &punpcklqdq ($xa0,$xa2);                    # "a0"
        &movdqa     ($xa3,$xt2);
        &punpcklqdq ($xt2,$xt3);                    # "a2"
        &punpckhqdq ($xa1,$xa2);                    # "a1"
        &punpckhqdq ($xa3,$xt3);                    # "a3"

        #($xa2,$xt2)=($xt2,$xa2);

        &movdqu     ($xt0,&QWP(64*0-128,$inp));     # load input
        &movdqu     ($xt1,&QWP(64*1-128,$inp));
        &movdqu     ($xa2,&QWP(64*2-128,$inp));
        &movdqu     ($xt3,&QWP(64*3-128,$inp));
        &lea        ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
        &pxor       ($xt0,$xa0);
        &movdqa     ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
        &pxor       ($xt1,$xa1);
        &movdqa     ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
        &pxor       ($xt2,$xa2);
        &movdqa     ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
        &pxor       ($xt3,$xa3);
        &movdqa     ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
        &movdqu     (&QWP(64*0-128,$out),$xt0);     # store output
        &movdqu     (&QWP(64*1-128,$out),$xt1);
        &movdqu     (&QWP(64*2-128,$out),$xt2);
        &movdqu     (&QWP(64*3-128,$out),$xt3);
        &lea        ($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
    # $len is kept biased by 64*4: keep looping while the subtraction
    # does not borrow, then undo the bias to get the leftover byte count.
    &sub        ($len,64*4);
    &jnc        (&label("outer_loop"));

    &add        ($len,64*4);
    &jz         (&label("done"));

    &mov        ("ebx",&DWP(512+8,"esp"));          # restore pointers
    &lea        ($inp,&DWP(-128,$inp));
    &mov        ("edx",&DWP(512+4,"esp"));
    &lea        ($out,&DWP(-128,$out));

    # Rebuild the counter row for the 1x path: low dword of the saved
    # counter row plus the constant at 16*6("eax"), merged into the row
    # loaded from "ebx" after masking it with the constant at 16*7("eax").
    &movd       ("xmm2",&DWP(16*12-128,"ebp"));     # counter value
    &movdqu     ("xmm3",&QWP(0,"ebx"));
    &paddd      ("xmm2",&QWP(16*6,"eax"));          # +four
    &pand       ("xmm3",&QWP(16*7,"eax"));
    &por        ("xmm3","xmm2");                    # counter value
}
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# SSSE3ROUND - emit one ChaCha round on the four rows held in $a..$d.
# Takes no arguments; operates on the register names declared above.
# The rotations by 16 and by 8 are done with pshufb through the $rot16
# and $rot24 shuffle masks, the rotations by 12 and by 7 with a
# psrld/pslld/por triple that uses $t as scratch.
sub SSSE3ROUND {    # critical path is 20 "SIMD ticks" per round
    &paddd      ($a,$b);
    &pxor       ($d,$a);
    &pshufb     ($d,$rot16);

    &paddd      ($c,$d);
    &pxor       ($b,$c);
    &movdqa     ($t,$b);
    &psrld      ($b,20);
    &pslld      ($t,12);
    &por        ($b,$t);

    &paddd      ($a,$b);
    &pxor       ($d,$a);
    &pshufb     ($d,$rot24);

    &paddd      ($c,$d);
    &pxor       ($b,$c);
    &movdqa     ($t,$b);
    &psrld      ($b,25);
    &pslld      ($t,7);
    &por        ($b,$t);
}

# SSSE3 1x path: one 64-byte block per outer iteration.  The input state
# (rows a..d) is kept at 16*0..16*3("esp") so that it can be added back
# after the rounds and reloaded for the next block.
&set_label("1x");
    &movdqa     ($a,&QWP(16*2,"eax"));              # sigma
    &movdqu     ($b,&QWP(0,"edx"));
    &movdqu     ($c,&QWP(16,"edx"));
    #&movdqu    ($d,&QWP(0,"ebx"));                 # already loaded
    &movdqa     ($rot16,&QWP(0,"eax"));
    &movdqa     ($rot24,&QWP(16,"eax"));
    # NOTE(review): the slot written here is overwritten by the 16-byte
    # store of $d a few lines below.
    &mov        (&DWP(16*3,"esp"),"ebp");

    &movdqa     (&QWP(16*0,"esp"),$a);
    &movdqa     (&QWP(16*1,"esp"),$b);
    &movdqa     (&QWP(16*2,"esp"),$c);
    &movdqa     (&QWP(16*3,"esp"),$d);
    &mov        ("edx",10);
    &jmp        (&label("loop1x"));

# Next block: reload the saved state and bump the counter row by the
# constant at 16*5("eax"), saving the new counter row back to the stack.
&set_label("outer1x",16);
    &movdqa     ($d,&QWP(16*5,"eax"));              # one
    &movdqa     ($a,&QWP(16*0,"esp"));
    &movdqa     ($b,&QWP(16*1,"esp"));
    &movdqa     ($c,&QWP(16*2,"esp"));
    &paddd      ($d,&QWP(16*3,"esp"));
    &mov        ("edx",10);
    &movdqa     (&QWP(16*3,"esp"),$d);
    &jmp        (&label("loop1x"));

# Ten iterations of two rounds.  The pshufd triples rotate the lanes of
# rows b, c and d between the rounds and rotate them back afterwards.
&set_label("loop1x",16);
    &SSSE3ROUND();
    &pshufd     ($c,$c,0b01001110);
    &pshufd     ($b,$b,0b00111001);
    &pshufd     ($d,$d,0b10010011);
    &nop        ();

    &SSSE3ROUND();
    &pshufd     ($c,$c,0b01001110);
    &pshufd     ($b,$b,0b10010011);
    &pshufd     ($d,$d,0b00111001);

    &dec        ("edx");
    &jnz        (&label("loop1x"));

    # Add the saved input state back in to form the keystream block.
    &paddd      ($a,&QWP(16*0,"esp"));
    &paddd      ($b,&QWP(16*1,"esp"));
    &paddd      ($c,&QWP(16*2,"esp"));
    &paddd      ($d,&QWP(16*3,"esp"));

    &cmp        ($len,64);
    &jb         (&label("tail"));

    &movdqu     ($t,&QWP(16*0,$inp));
    &movdqu     ($t1,&QWP(16*1,$inp));
    &pxor       ($a,$t);                            # xor with input
    &movdqu     ($t,&QWP(16*2,$inp));
    &pxor       ($b,$t1);
    &movdqu     ($t1,&QWP(16*3,$inp));
    &pxor       ($c,$t);
    &pxor       ($d,$t1);
    &lea        ($inp,&DWP(16*4,$inp));             # inp+=64

    &movdqu     (&QWP(16*0,$out),$a);               # write output
    &movdqu     (&QWP(16*1,$out),$b);
    &movdqu     (&QWP(16*2,$out),$c);
    &movdqu     (&QWP(16*3,$out),$d);
    &lea        ($out,&DWP(16*4,$out));             # out+=64

    &sub        ($len,64);
    &jnz        (&label("outer1x"));

    &jmp        (&label("done"));

# Fewer than 64 bytes left: park the keystream block on the stack and
# clear the registers used by the byte-wise loop that follows.
&set_label("tail");
    &movdqa     (&QWP(16*0,"esp"),$a);
    &movdqa     (&QWP(16*1,"esp"),$b);
    &movdqa     (&QWP(16*2,"esp"),$c);
    &movdqa     (&QWP(16*3,"esp"),$d);

    &xor        ("eax","eax");
    &xor        ("edx","edx");
    &xor        ("ebp","ebp");

# Byte-wise tail of the SSSE3 1x path: "ebp" indexes both the keystream
# block saved at "esp" and the input; each keystream byte is xor-ed with
# an input byte and stored to the output until $len reaches zero.
&set_label("tail_loop");
    &movb       ("al",&BP(0,"esp","ebp"));
    &movb       ("dl",&BP(0,$inp,"ebp"));
    &lea        ("ebp",&DWP(1,"ebp"));
    &xor        ("al","dl");
    &movb       (&BP(-1,$out,"ebp"),"al");
    &dec        ($len);
    &jnz        (&label("tail_loop"));
}
&set_label("done");
    &mov        ("esp",&DWP(512,"esp"));            # reload the stack pointer saved at 512(esp)
&function_end("ChaCha20_ssse3");

# Constant table shared by the SSSE3 and XOP code paths, addressed as
# 16*N off the label:
#   16*0  pshufb mask used as $rot16
#   16*1  pshufb mask used as $rot24
#   16*2  sigma ("expand 32-byte k")
#   16*3  0,1,2,3     added to the broadcast counter ("fix counters")
#   16*4  4,4,4,4     counter step of the 4x paths
#   16*5  1,0,0,0     counter step of the 1x paths ("one")
#   16*6  4,0,0,0     "+four"
#   16*7  0,-1,-1,-1  mask that clears the low dword of the counter row
&align  (64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align  (64);
}
&asciz  ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

# XOP code path; only generated when $ymm is set.
if ($ymm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_XOP - emit one quarter-round of the XOP 4x path.
#
#   $ai,$bi,$ci,$di  indices of the state rows acting as a, b, c and d
#   $i               ordinal of the quarter-round inside the double
#                    round (0..7)
#
# Rotations are done with vprotd (16, 12, 8, 7).  Around them the sub
# interleaves stores of finished rows to, and loads of the next
# quarter-round's rows from, the working block at "ebx"; $an..$dn and
# $ap..$dp are the row indices of the next and the previous quarter-round.
# On exit the Perl-level register names in each ($x,$x_) pair are
# swapped, so the following call works on the registers preloaded here.
sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14

    # The first and last quarter-round of each round need their
    # "previous"/"next" indices adjusted for the change of index pattern.
    if ($i==0) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==3) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    } elsif ($i==4) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==7) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    }

    #&vpaddd    ($xa,$xa,$xb);                      # see elsewhere
    #&vpxor     ($xd,$xd,$xa);                      # see elsewhere
    &vmovdqa    (&QWP(16*$cp-128,"ebx"),$xc_)       if ($ai>0 && $ai<3);
    &vprotd     ($xd,$xd,16);
    &vmovdqa    (&QWP(16*$bp-128,"ebx"),$xb_)       if ($i!=0);
    &vpaddd     ($xc,$xc,$xd);
    &vmovdqa    ($xc_,&QWP(16*$cn-128,"ebx"))       if ($ai>0 && $ai<3);
    &vpxor      ($xb,$i!=0?$xb:$xb_,$xc);
    &vmovdqa    ($xa_,&QWP(16*$an-128,"ebx"));
    &vprotd     ($xb,$xb,12);
    &vmovdqa    ($xb_,&QWP(16*$bn-128,"ebx"))       if ($i<7);
    &vpaddd     ($xa,$xa,$xb);
    &vmovdqa    ($xd_,&QWP(16*$dn-128,"ebx"))       if ($di!=$dn);
    &vpxor      ($xd,$xd,$xa);
    &vpaddd     ($xa_,$xa_,$xb_)                    if ($i<7);  # elsewhere
    &vprotd     ($xd,$xd,8);
    &vmovdqa    (&QWP(16*$ai-128,"ebx"),$xa);
    &vpaddd     ($xc,$xc,$xd);
    &vmovdqa    (&QWP(16*$di-128,"ebx"),$xd)        if ($di!=$dn);
    &vpxor      ($xb,$xb,$xc);
    &vpxor      ($xd_,$di==$dn?$xd:$xd_,$xa_)       if ($i<7);  # elsewhere
    &vprotd     ($xb,$xb,7);

    ($xa,$xa_)=($xa_,$xa);
    ($xb,$xb_)=($xb_,$xb);
    ($xc,$xc_)=($xc_,$xc);
    ($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_xop(out, inp, len, key, counter) - the five arguments are read
# with wparam(0..4) below.
&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
    &mov        ($out,&wparam(0));
    &mov        ($inp,&wparam(1));
    &mov        ($len,&wparam(2));
    &mov        ("edx",&wparam(3));                 # key
    &mov        ("ebx",&wparam(4));                 # counter and nonce
    &vzeroupper ();

    # Carve out a 64-byte aligned frame; the caller's "esp" is kept at
    # 512("esp").
    &mov        ("ebp","esp");
    &stack_push (131);
    &and        ("esp",-64);
    &mov        (&DWP(512,"esp"),"ebp");

    # NOTE(review): "eax" is used as the base here, so it presumably
    # already holds the address of pic_point on entry - confirm against
    # the code that reaches xop_shortcut.
    &lea        ("eax",&DWP(&label("ssse3_data")."-".
                            &label("pic_point"),"eax"));
    &vmovdqu    ("xmm3",&QWP(0,"ebx"));             # counter and nonce

    &cmp        ($len,64*4);
    &jb         (&label("1x"));

    &mov        (&DWP(512+4,"esp"),"edx");          # offload pointers
    &mov        (&DWP(512+8,"esp"),"ebx");
    &sub        ($len,64*4);                        # bias len
    &lea        ("ebp",&DWP(256+128,"esp"));        # size optimization

    # Broadcast every dword of the counter/nonce row, both key rows and
    # sigma into a full vector and store the sixteen vectors at
    # 16*N-128("ebp").
    &vmovdqu    ("xmm7",&QWP(0,"edx"));             # key
    &vpshufd    ("xmm0","xmm3",0x00);
    &vpshufd    ("xmm1","xmm3",0x55);
    &vpshufd    ("xmm2","xmm3",0xaa);
    &vpshufd    ("xmm3","xmm3",0xff);
    &vpaddd     ("xmm0","xmm0",&QWP(16*3,"eax"));   # fix counters
    &vpshufd    ("xmm4","xmm7",0x00);
    &vpshufd    ("xmm5","xmm7",0x55);
    &vpsubd     ("xmm0","xmm0",&QWP(16*4,"eax"));
    &vpshufd    ("xmm6","xmm7",0xaa);
    &vpshufd    ("xmm7","xmm7",0xff);
    &vmovdqa    (&QWP(16*12-128,"ebp"),"xmm0");
    &vmovdqa    (&QWP(16*13-128,"ebp"),"xmm1");
    &vmovdqa    (&QWP(16*14-128,"ebp"),"xmm2");
    &vmovdqa    (&QWP(16*15-128,"ebp"),"xmm3");
    &vmovdqu    ("xmm3",&QWP(16,"edx"));            # key
    &vmovdqa    (&QWP(16*4-128,"ebp"),"xmm4");
    &vmovdqa    (&QWP(16*5-128,"ebp"),"xmm5");
    &vmovdqa    (&QWP(16*6-128,"ebp"),"xmm6");
    &vmovdqa    (&QWP(16*7-128,"ebp"),"xmm7");
    &vmovdqa    ("xmm7",&QWP(16*2,"eax"));          # sigma
    &lea        ("ebx",&DWP(128,"esp"));            # size optimization

    &vpshufd    ("xmm0","xmm3",0x00);
    &vpshufd    ("xmm1","xmm3",0x55);
    &vpshufd    ("xmm2","xmm3",0xaa);
    &vpshufd    ("xmm3","xmm3",0xff);
    &vpshufd    ("xmm4","xmm7",0x00);
    &vpshufd    ("xmm5","xmm7",0x55);
    &vpshufd    ("xmm6","xmm7",0xaa);
    &vpshufd    ("xmm7","xmm7",0xff);
    &vmovdqa    (&QWP(16*8-128,"ebp"),"xmm0");
    &vmovdqa    (&QWP(16*9-128,"ebp"),"xmm1");
    &vmovdqa    (&QWP(16*10-128,"ebp"),"xmm2");
    &vmovdqa    (&QWP(16*11-128,"ebp"),"xmm3");
    &vmovdqa    (&QWP(16*0-128,"ebp"),"xmm4");
    &vmovdqa    (&QWP(16*1-128,"ebp"),"xmm5");
    &vmovdqa    (&QWP(16*2-128,"ebp"),"xmm6");
    &vmovdqa    (&QWP(16*3-128,"ebp"),"xmm7");

    &lea        ($inp,&DWP(128,$inp));              # size optimization
    &lea        ($out,&DWP(128,$out));              # size optimization
    &jmp        (&label("outer_loop"));

# Per-iteration copy of the saved state ("ebp") to the working block
# ("ebx").  Rows 0, 4, 8 and 9 are skipped here and loaded straight into
# $xa, $xb_, $xc and $xc_ instead; the counter row is advanced by the
# constant at 16*4("eax") and saved back.
&set_label("outer_loop",32);
    #&vmovdqa   ("xmm0",&QWP(16*0-128,"ebp"));      # copy key material
    &vmovdqa    ("xmm1",&QWP(16*1-128,"ebp"));
    &vmovdqa    ("xmm2",&QWP(16*2-128,"ebp"));
    &vmovdqa    ("xmm3",&QWP(16*3-128,"ebp"));
    #&vmovdqa   ("xmm4",&QWP(16*4-128,"ebp"));
    &vmovdqa    ("xmm5",&QWP(16*5-128,"ebp"));
    &vmovdqa    ("xmm6",&QWP(16*6-128,"ebp"));
    &vmovdqa    ("xmm7",&QWP(16*7-128,"ebp"));
    #&vmovdqa   (&QWP(16*0-128,"ebx"),"xmm0");
    &vmovdqa    (&QWP(16*1-128,"ebx"),"xmm1");
    &vmovdqa    (&QWP(16*2-128,"ebx"),"xmm2");
    &vmovdqa    (&QWP(16*3-128,"ebx"),"xmm3");
    #&vmovdqa   (&QWP(16*4-128,"ebx"),"xmm4");
    &vmovdqa    (&QWP(16*5-128,"ebx"),"xmm5");
    &vmovdqa    (&QWP(16*6-128,"ebx"),"xmm6");
    &vmovdqa    (&QWP(16*7-128,"ebx"),"xmm7");
    #&vmovdqa   ("xmm0",&QWP(16*8-128,"ebp"));
    #&vmovdqa   ("xmm1",&QWP(16*9-128,"ebp"));
    &vmovdqa    ("xmm2",&QWP(16*10-128,"ebp"));
    &vmovdqa    ("xmm3",&QWP(16*11-128,"ebp"));
    &vmovdqa    ("xmm4",&QWP(16*12-128,"ebp"));
    &vmovdqa    ("xmm5",&QWP(16*13-128,"ebp"));
    &vmovdqa    ("xmm6",&QWP(16*14-128,"ebp"));
    &vmovdqa    ("xmm7",&QWP(16*15-128,"ebp"));
    &vpaddd     ("xmm4","xmm4",&QWP(16*4,"eax"));   # counter value
    #&vmovdqa   (&QWP(16*8-128,"ebx"),"xmm0");
    #&vmovdqa   (&QWP(16*9-128,"ebx"),"xmm1");
    &vmovdqa    (&QWP(16*10-128,"ebx"),"xmm2");
    &vmovdqa    (&QWP(16*11-128,"ebx"),"xmm3");
    &vmovdqa    (&QWP(16*12-128,"ebx"),"xmm4");
    &vmovdqa    (&QWP(16*13-128,"ebx"),"xmm5");
    &vmovdqa    (&QWP(16*14-128,"ebx"),"xmm6");
    &vmovdqa    (&QWP(16*15-128,"ebx"),"xmm7");
    &vmovdqa    (&QWP(16*12-128,"ebp"),"xmm4");     # save counter value

    &vmovdqa    ($xa, &QWP(16*0-128,"ebp"));
    &vmovdqa    ($xd, "xmm4");
    &vmovdqa    ($xb_,&QWP(16*4-128,"ebp"));
# XOP 4x path: finish preloading the first quarter-round's registers,
# run the round loop and write out four 64-byte blocks.
    &vmovdqa    ($xc, &QWP(16*8-128,"ebp"));
    &vmovdqa    ($xc_,&QWP(16*9-128,"ebp"));

    &mov        ("edx",10);                         # loop counter
    &nop        ();

# Ten iterations of eight quarter-rounds.  The first vpaddd/vpxor of the
# first quarter-round is emitted here rather than inside
# QUARTERROUND_XOP ("elsewhere").
&set_label("loop",32);
    &vpaddd     ($xa,$xa,$xb_);                     # elsewhere
    &vpxor      ($xd,$xd,$xa);                      # elsewhere
    &QUARTERROUND_XOP(0, 4, 8, 12, 0);
    &QUARTERROUND_XOP(1, 5, 9, 13, 1);
    &QUARTERROUND_XOP(2, 6,10, 14, 2);
    &QUARTERROUND_XOP(3, 7,11, 15, 3);
    &QUARTERROUND_XOP(0, 5,10, 15, 4);
    &QUARTERROUND_XOP(1, 6,11, 12, 5);
    &QUARTERROUND_XOP(2, 7, 8, 13, 6);
    &QUARTERROUND_XOP(3, 4, 9, 14, 7);
    &dec        ("edx");
    &jnz        (&label("loop"));

    # Spill the rows that are still held in registers to the working
    # block at "ebx".
    &vmovdqa    (&QWP(16*4-128,"ebx"),$xb_);
    &vmovdqa    (&QWP(16*8-128,"ebx"),$xc);
    &vmovdqa    (&QWP(16*9-128,"ebx"),$xc_);
    &vmovdqa    (&QWP(16*12-128,"ebx"),$xd);
    &vmovdqa    (&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

    #&vmovdqa   ($xa0,&QWP(16*0-128,"ebx"));        # it's there
    &vmovdqa    ($xa1,&QWP(16*1-128,"ebx"));
    &vmovdqa    ($xa2,&QWP(16*2-128,"ebx"));
    &vmovdqa    ($xa3,&QWP(16*3-128,"ebx"));

    # Generator-time loop, unrolled four times (one pass per group of
    # four rows): add the saved input state, transpose, xor with the
    # input and store.  For the first three passes the next four rows
    # are loaded before the stores, and $inp/$out advance by 16; the
    # last pass advances them by 64*4-16*3.
    for($i=0;$i<256;$i+=64) {
        &vpaddd     ($xa0,$xa0,&QWP($i+16*0-128,"ebp"));    # accumulate key material
        &vpaddd     ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
        &vpaddd     ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
        &vpaddd     ($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

        &vpunpckldq ($xt2,$xa0,$xa1);               # "de-interlace" data
        &vpunpckldq ($xt3,$xa2,$xa3);
        &vpunpckhdq ($xa0,$xa0,$xa1);
        &vpunpckhdq ($xa2,$xa2,$xa3);
        &vpunpcklqdq ($xa1,$xt2,$xt3);              # "a0"
        &vpunpckhqdq ($xt2,$xt2,$xt3);              # "a1"
        &vpunpcklqdq ($xt3,$xa0,$xa2);              # "a2"
        &vpunpckhqdq ($xa3,$xa0,$xa2);              # "a3"

        &vpxor      ($xt0,$xa1,&QWP(64*0-128,$inp));
        &vpxor      ($xt1,$xt2,&QWP(64*1-128,$inp));
        &vpxor      ($xt2,$xt3,&QWP(64*2-128,$inp));
        &vpxor      ($xt3,$xa3,&QWP(64*3-128,$inp));
        &lea        ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
        &vmovdqa    ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
        &vmovdqa    ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
        &vmovdqa    ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
        &vmovdqa    ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
        &vmovdqu    (&QWP(64*0-128,$out),$xt0);     # store output
        &vmovdqu    (&QWP(64*1-128,$out),$xt1);
        &vmovdqu    (&QWP(64*2-128,$out),$xt2);
        &vmovdqu    (&QWP(64*3-128,$out),$xt3);
        &lea        ($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
    # $len is kept biased by 64*4: keep looping while the subtraction
    # does not borrow, then undo the bias to get the leftover byte count.
    &sub        ($len,64*4);
    &jnc        (&label("outer_loop"));

    &add        ($len,64*4);
# XOP path: nothing left after the 4x loop -> done; otherwise restore the
# key/counter pointers and un-bias $inp/$out for the 1x path.
    &jz         (&label("done"));

    &mov        ("ebx",&DWP(512+8,"esp"));          # restore pointers
    &lea        ($inp,&DWP(-128,$inp));
    &mov        ("edx",&DWP(512+4,"esp"));
    &lea        ($out,&DWP(-128,$out));

    # Rebuild the counter row for the 1x path: low dword of the saved
    # counter row plus the constant at 16*6("eax"), merged into the row
    # loaded from "ebx" after masking it with the constant at 16*7("eax").
    &vmovd      ("xmm2",&DWP(16*12-128,"ebp"));     # counter value
    &vmovdqu    ("xmm3",&QWP(0,"ebx"));
    &vpaddd     ("xmm2","xmm2",&QWP(16*6,"eax"));   # +four
    &vpand      ("xmm3","xmm3",&QWP(16*7,"eax"));
    &vpor       ("xmm3","xmm3","xmm2");             # counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# XOPROUND - emit one ChaCha round on the four rows held in $a..$d.
# Takes no arguments; every rotation (16, 12, 8, 7) is a single vprotd.
sub XOPROUND {
    &vpaddd     ($a,$a,$b);
    &vpxor      ($d,$d,$a);
    &vprotd     ($d,$d,16);

    &vpaddd     ($c,$c,$d);
    &vpxor      ($b,$b,$c);
    &vprotd     ($b,$b,12);

    &vpaddd     ($a,$a,$b);
    &vpxor      ($d,$d,$a);
    &vprotd     ($d,$d,8);

    &vpaddd     ($c,$c,$d);
    &vpxor      ($b,$b,$c);
    &vprotd     ($b,$b,7);
}

# XOP 1x path: one 64-byte block per outer iteration.  The input state
# (rows a..d) is kept at 16*0..16*3("esp") so that it can be added back
# after the rounds and reloaded for the next block.
&set_label("1x");
    &vmovdqa    ($a,&QWP(16*2,"eax"));              # sigma
    &vmovdqu    ($b,&QWP(0,"edx"));
    &vmovdqu    ($c,&QWP(16,"edx"));
    #&vmovdqu   ($d,&QWP(0,"ebx"));                 # already loaded
    # NOTE(review): XOPROUND does not reference $rot16/$rot24, and the
    # dword stored to 16*3("esp") is overwritten by the store of $d
    # below; all three statements are kept as they are.
    &vmovdqa    ($rot16,&QWP(0,"eax"));
    &vmovdqa    ($rot24,&QWP(16,"eax"));
    &mov        (&DWP(16*3,"esp"),"ebp");

    &vmovdqa    (&QWP(16*0,"esp"),$a);
    &vmovdqa    (&QWP(16*1,"esp"),$b);
    &vmovdqa    (&QWP(16*2,"esp"),$c);
    &vmovdqa    (&QWP(16*3,"esp"),$d);
    &mov        ("edx",10);
    &jmp        (&label("loop1x"));

# Next block: reload the saved state and bump the counter row by the
# constant at 16*5("eax"), saving the new counter row back to the stack.
&set_label("outer1x",16);
    &vmovdqa    ($d,&QWP(16*5,"eax"));              # one
    &vmovdqa    ($a,&QWP(16*0,"esp"));
    &vmovdqa    ($b,&QWP(16*1,"esp"));
    &vmovdqa    ($c,&QWP(16*2,"esp"));
    &vpaddd     ($d,$d,&QWP(16*3,"esp"));
    &mov        ("edx",10);
    &vmovdqa    (&QWP(16*3,"esp"),$d);
    &jmp        (&label("loop1x"));

# Ten iterations of two rounds.  The vpshufd triples rotate the lanes of
# rows b, c and d between the rounds and rotate them back afterwards.
&set_label("loop1x",16);
    &XOPROUND();
    &vpshufd    ($c,$c,0b01001110);
    &vpshufd    ($b,$b,0b00111001);
    &vpshufd    ($d,$d,0b10010011);

    &XOPROUND();
    &vpshufd    ($c,$c,0b01001110);
    &vpshufd    ($b,$b,0b10010011);
    &vpshufd    ($d,$d,0b00111001);

    &dec        ("edx");
    &jnz        (&label("loop1x"));

    # Add the saved input state back in to form the keystream block.
    &vpaddd     ($a,$a,&QWP(16*0,"esp"));
    &vpaddd     ($b,$b,&QWP(16*1,"esp"));
    &vpaddd     ($c,$c,&QWP(16*2,"esp"));
    &vpaddd     ($d,$d,&QWP(16*3,"esp"));

    &cmp        ($len,64);
    &jb         (&label("tail"));

    &vpxor      ($a,$a,&QWP(16*0,$inp));            # xor with input
    &vpxor      ($b,$b,&QWP(16*1,$inp));
    &vpxor      ($c,$c,&QWP(16*2,$inp));
    &vpxor      ($d,$d,&QWP(16*3,$inp));
    &lea        ($inp,&DWP(16*4,$inp));             # inp+=64

    &vmovdqu    (&QWP(16*0,$out),$a);               # write output
    &vmovdqu    (&QWP(16*1,$out),$b);
    &vmovdqu    (&QWP(16*2,$out),$c);
    &vmovdqu    (&QWP(16*3,$out),$d);
    &lea        ($out,&DWP(16*4,$out));             # out+=64

    &sub        ($len,64);
    &jnz        (&label("outer1x"));

    &jmp        (&label("done"));

# Fewer than 64 bytes left: park the keystream block on the stack and
# xor it into the output one byte at a time, with "ebp" as the index.
&set_label("tail");
    &vmovdqa    (&QWP(16*0,"esp"),$a);
    &vmovdqa    (&QWP(16*1,"esp"),$b);
    &vmovdqa    (&QWP(16*2,"esp"),$c);
    &vmovdqa    (&QWP(16*3,"esp"),$d);

    &xor        ("eax","eax");
    &xor        ("edx","edx");
    &xor        ("ebp","ebp");

&set_label("tail_loop");
    &movb       ("al",&BP(0,"esp","ebp"));
    &movb       ("dl",&BP(0,$inp,"ebp"));
    &lea        ("ebp",&DWP(1,"ebp"));
    &xor        ("al","dl");
    &movb       (&BP(-1,$out,"ebp"),"al");
    &dec        ($len);
    &jnz        (&label("tail_loop"));
}
# Common exit of ChaCha20_xop: reload the stack pointer that the prologue
# saved at 512("esp") and close the function, then close the enclosing
# $ymm block.
&set_label("done");
    &vzeroupper ();
    &mov        ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_xop");
}

&asm_finish();

# A failed close on STDOUT is treated as fatal.
close STDOUT or die "error closing STDOUT: $!";