#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86 MMX.
#
# June 2017.
#
# Below code is KECCAK_2X implementation (see sha/keccak1600.c) with
# C[5] held in register bank and D[5] offloaded to memory. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of round. Since number of
# rounds is even, last round writes to A[][] and everything works out.
# It's argued that MMX is the only code path meaningful to implement
# for x86. This is because non-MMX-capable processors are an extinct
# breed, and they as well can lurk executing compiler-generated code.
# For reference gcc-5.x-generated KECCAK_2X code takes 89 cycles per
# processed byte on Pentium. Which is fair result. But older compilers
# produce worse code. On the other hand one can wonder why not 128-bit
# SSE2? Well, SSE2 won't provide double improvement, rather far from
# that, if any at all on some processors, because it will take extra
# permutations and inter-bank data transfers. Besides, contemporary
# CPUs are better off executing 64-bit code, and it makes lesser sense
# to invest into fancy 32-bit code. And the decision doesn't seem to
# be inadequate, if one compares below results to "64-bit platforms in
# 32-bit mode" SIMD data points available at
# http://keccak.noekeon.org/sw_performance.html.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#                           r=1088(i)
#
# PIII                      30/+150%
# Pentium M                 27/+150%
# P4                        40/+85%
# Core 2                    19/+170%
# Sandy Bridge(ii)          18/+140%
# Atom                      33/+180%
# Silvermont(ii)            30/+180%
# VIA Nano(ii)              43/+60%
# Sledgehammer(ii)(iii)     24/+130%
#
# (i)   Corresponds to SHA3-256. Numbers after slash are improvement
#       coefficients over KECCAK_2X [with bit interleave and lane
#       complementing] position-independent *scalar* code generated
#       by gcc-5.x. It's not exactly fair comparison, but it's a
#       datapoint...
# (ii)  64-bit processor executing 32-bit code.
# (iii) Result is considered to be representative even for older AMD
#       processors.

# Locate the perlasm helpers relative to this script.  When the script is
# invoked without a directory component the match fails and $dir is left
# empty, so the relative "../../perlasm" path is used.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, if any, names the output file.  Use the
# three-argument open so the name is taken literally, and fail right away
# instead of discovering the problem only when STDOUT is closed.
$output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Operand tables used by the generator below:
#   @C       - mm0..mm4, the five working lanes.
#   @T       - mm5..mm7, temporaries.
#   @A       - byte offsets of the 5x5 64-bit lanes, biased by -100 to
#              match the +100 bias applied to esi/edi by the callers
#              ("size optimization").
#   @D       - esp-relative offsets of D[0..4] as seen inside
#              _KeccakF1600; the +4 presumably skips the return address
#              pushed by the call.
#   @rhotates- Rho rotation amounts indexed [row][column].
my @C = map("mm$_",(0..4));
my @T = map("mm$_",(5..7));
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @D = map(8*$_+4, (0..4));
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

&static_label("iotas");

# _KeccakF1600: internal 24-round permutation.
# On entry, as set up by the callers in this file:
#   esi = state pointer + 100, edi = scratch T[][] pointer + 100,
#   ebx = address of the iotas table.
# esi and edi are swapped at the end of every round; ebx is advanced by
# 8 per round and rewound by 192 before returning.
&function_begin_B("_KeccakF1600");
    &movq   (@C[0],&QWP($A[4][0],"esi"));
    &movq   (@C[1],&QWP($A[4][1],"esi"));
    &movq   (@C[2],&QWP($A[4][2],"esi"));
    &movq   (@C[3],&QWP($A[4][3],"esi"));
    &movq   (@C[4],&QWP($A[4][4],"esi"));

    &mov    ("ecx",24);                     # loop counter
    &jmp    (&label("loop"));

    &set_label("loop",16);
    ######################################### Theta
    &pxor   (@C[0],&QWP($A[0][0],"esi"));
    &pxor   (@C[1],&QWP($A[0][1],"esi"));
    &pxor   (@C[2],&QWP($A[0][2],"esi"));
    &pxor   (@C[3],&QWP($A[0][3],"esi"));
    &pxor   (@C[4],&QWP($A[0][4],"esi"));

    &pxor   (@C[0],&QWP($A[1][0],"esi"));
    &pxor   (@C[1],&QWP($A[1][1],"esi"));
    &pxor   (@C[2],&QWP($A[1][2],"esi"));
    &pxor   (@C[3],&QWP($A[1][3],"esi"));
    &pxor   (@C[4],&QWP($A[1][4],"esi"));

    &pxor   (@C[0],&QWP($A[2][0],"esi"));
    &pxor   (@C[1],&QWP($A[2][1],"esi"));
    &pxor   (@C[2],&QWP($A[2][2],"esi"));
    &pxor   (@C[3],&QWP($A[2][3],"esi"));
    &pxor   (@C[4],&QWP($A[2][4],"esi"));

    &pxor   (@C[2],&QWP($A[3][2],"esi"));
    &pxor   (@C[0],&QWP($A[3][0],"esi"));
    &pxor   (@C[1],&QWP($A[3][1],"esi"));
    &pxor   (@C[3],&QWP($A[3][3],"esi"));
    &movq   (@T[0],@C[2]);
    &pxor   (@C[4],&QWP($A[3][4],"esi"));

    &movq   (@T[2],@C[2]);
    &psrlq  (@T[0],63);
    &movq   (@T[1],@C[0]);
    &psllq  (@T[2],1);
    &pxor   (@T[0],@C[0]);
    &psrlq  (@C[0],63);
    &pxor   (@T[0],@T[2]);
    &psllq  (@T[1],1);
    &movq   (@T[2],@C[1]);
    &movq   (&QWP(@D[1],"esp"),@T[0]);      # D[1] = E[0] = ROL64(C[2], 1) ^ C[0];

    &pxor   (@T[1],@C[0]);
    &psrlq  (@T[2],63);
    &pxor   (@T[1],@C[3]);
    &movq   (@C[0],@C[1]);
    &movq   (&QWP(@D[4],"esp"),@T[1]);      # D[4] = E[1] = ROL64(C[0], 1) ^ C[3];

    &psllq  (@C[0],1);
    &pxor   (@T[2],@C[4]);
    &pxor   (@C[0],@T[2]);

    &movq   (@T[2],@C[3]);
    &psrlq  (@C[3],63);
    &movq   (&QWP(@D[0],"esp"),@C[0]);      # D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
    &psllq  (@T[2],1);
    &movq   (@T[0],@C[4]);
    &psrlq  (@C[4],63);
    &pxor   (@C[1],@C[3]);
    &psllq  (@T[0],1);
    &pxor   (@C[1],@T[2]);
    &pxor   (@C[2],@C[4]);
    &movq   (&QWP(@D[2],"esp"),@C[1]);      # D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
    &pxor   (@C[2],@T[0]);

    ######################################### first Rho(0) is special
    &movq   (@C[3],&QWP($A[3][3],"esi"));
    &movq   (&QWP(@D[3],"esp"),@C[2]);      # D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
    &pxor   (@C[3],@C[2]);
    &movq   (@C[4],&QWP($A[4][4],"esi"));
    &movq   (@T[2],@C[3]);
    &psrlq  (@C[3],64-$rhotates[3][3]);
    &pxor   (@C[4],@T[1]);
    &psllq  (@T[2],$rhotates[3][3]);
    &movq   (@T[1],@C[4]);
    &psrlq  (@C[4],64-$rhotates[4][4]);
    &por    (@C[3],@T[2]);                  # C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
    &psllq  (@T[1],$rhotates[4][4]);

    &movq   (@C[2],&QWP($A[2][2],"esi"));
    &por    (@C[4],@T[1]);                  # C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
    &pxor   (@C[2],@C[1]);
    &movq   (@C[1],&QWP($A[1][1],"esi"));
    &movq   (@T[1],@C[2]);
    &psrlq  (@C[2],64-$rhotates[2][2]);
    &pxor   (@C[1],&QWP(@D[1],"esp"));
    &psllq  (@T[1],$rhotates[2][2]);

    &movq   (@T[2],@C[1]);
    &psrlq  (@C[1],64-$rhotates[1][1]);
    &por    (@C[2],@T[1]);                  # C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
    &psllq  (@T[2],$rhotates[1][1]);
    &pxor   (@C[0],&QWP($A[0][0],"esi"));   # /* rotate by 0 */  /* D[0] */
    &por    (@C[1],@T[2]);                  # C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);

# Chi($y, $xrho): emit the regular Chi step for output row $y, storing the
# five results to the edi-based state.  For $y == 0 the current iota
# constant at (ebx) is xored into lane 0 and ebx is advanced by 8.  When
# $xrho is defined, T[2] is preloaded with A[0][$xrho] ^ D[$xrho] for the
# Rho($xrho) call that follows.  Inline comments show the $y == 0 case.
sub Chi {               ######### regular Chi step
    my ($y,$xrho) = @_;

    &movq   (@T[0],@C[1]);
    &movq   (@T[1],@C[2]);
    &pandn  (@T[0],@C[2]);
    &pandn  (@C[2],@C[3]);
    &pxor   (@T[0],@C[0]);
    &pxor   (@C[2],@C[1]);
    &pxor   (@T[0],&QWP(0,"ebx"))           if ($y == 0);
    &lea    ("ebx",&DWP(8,"ebx"))           if ($y == 0);

    &movq   (@T[2],@C[3]);
    &movq   (&QWP($A[$y][0],"edi"),@T[0]);  # R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    &movq   (@T[0],@C[4]);
    &pandn  (@C[3],@C[4]);
    &pandn  (@C[4],@C[0]);
    &pxor   (@C[3],@T[1]);
    &movq   (&QWP($A[$y][1],"edi"),@C[2]);  # R[0][1] = C[1] ^ (~C[2] & C[3]);
    &pxor   (@C[4],@T[2]);
    &movq   (@T[2],&QWP($A[0][$xrho],"esi"))    if (defined($xrho));

    &movq   (&QWP($A[$y][2],"edi"),@C[3]);  # R[0][2] = C[2] ^ (~C[3] & C[4]);
    &pandn  (@C[0],@C[1]);
    &movq   (&QWP($A[$y][3],"edi"),@C[4]);  # R[0][3] = C[3] ^ (~C[4] & C[0]);
    &pxor   (@C[0],@T[0]);
    &pxor   (@T[2],&QWP(@D[$xrho],"esp"))   if (defined($xrho));
    &movq   (&QWP($A[$y][4],"edi"),@C[0]);  # R[0][4] = C[4] ^ (~C[0] & C[1]);
}
    &Chi    (0, 3);

# Rho($x): emit the regular Rho step.  Expects T[2] to already hold
# A[0][$x] ^ D[$x] (preloaded by the preceding Chi) and produces
# C[i] = ROL64(A[i][($x+i)%5] ^ D[($x+i)%5], rhotates[i][($x+i)%5])
# for i = 0..4.  Inline comments show the $x == 3 case.
sub Rho {               ######### regular Rho step
    my $x = shift;

    #&movq  (@T[2],&QWP($A[0][$x],"esi"));  # moved to Chi
    #&pxor  (@T[2],&QWP(@D[$x],"esp"));     # moved to Chi
    &movq   (@C[0],@T[2]);
    &psrlq  (@T[2],64-$rhotates[0][$x]);
    &movq   (@C[1],&QWP($A[1][($x+1)%5],"esi"));
    &psllq  (@C[0],$rhotates[0][$x]);
    &pxor   (@C[1],&QWP(@D[($x+1)%5],"esp"));
    &por    (@C[0],@T[2]);                  # C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);

    &movq   (@T[1],@C[1]);
    &psrlq  (@C[1],64-$rhotates[1][($x+1)%5]);
    &movq   (@C[2],&QWP($A[2][($x+2)%5],"esi"));
    &psllq  (@T[1],$rhotates[1][($x+1)%5]);
    &pxor   (@C[2],&QWP(@D[($x+2)%5],"esp"));
    &por    (@C[1],@T[1]);                  # C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);

    &movq   (@T[2],@C[2]);
    &psrlq  (@C[2],64-$rhotates[2][($x+2)%5]);
    &movq   (@C[3],&QWP($A[3][($x+3)%5],"esi"));
    &psllq  (@T[2],$rhotates[2][($x+2)%5]);
    &pxor   (@C[3],&QWP(@D[($x+3)%5],"esp"));
    &por    (@C[2],@T[2]);                  # C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);

    &movq   (@T[0],@C[3]);
    &psrlq  (@C[3],64-$rhotates[3][($x+3)%5]);
    &movq   (@C[4],&QWP($A[4][($x+4)%5],"esi"));
    &psllq  (@T[0],$rhotates[3][($x+3)%5]);
    &pxor   (@C[4],&QWP(@D[($x+4)%5],"esp"));
    &por    (@C[3],@T[0]);                  # C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);

    &movq   (@T[1],@C[4]);
    &psrlq  (@C[4],64-$rhotates[4][($x+4)%5]);
    &psllq  (@T[1],$rhotates[4][($x+4)%5]);
    &por    (@C[4],@T[1]);                  # C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
}
    &Rho    (3);    &Chi    (1, 1);
    &Rho    (1);    &Chi    (2, 4);
    &Rho    (4);    &Chi    (3, 2);
    &Rho    (2);    ###&Chi (4);

    &movq   (@T[0],@C[0]);                  ######### last Chi(4) is special
    &xor    ("edi","esi");                  # &xchg ("esi","edi");
    &movq   (&QWP(@D[1],"esp"),@C[1]);
    &xor    ("esi","edi");
    &xor    ("edi","esi");

    &movq   (@T[1],@C[1]);
    &movq   (@T[2],@C[2]);
    &pandn  (@T[1],@C[2]);
    &pandn  (@T[2],@C[3]);
    &pxor   (@C[0],@T[1]);
    &pxor   (@C[1],@T[2]);

    &movq   (@T[1],@C[3]);
    &movq   (&QWP($A[4][0],"esi"),@C[0]);   # R[4][0] = C[0] ^= (~C[1] & C[2]);
    &pandn  (@T[1],@C[4]);
    &movq   (&QWP($A[4][1],"esi"),@C[1]);   # R[4][1] = C[1] ^= (~C[2] & C[3]);
    &pxor   (@C[2],@T[1]);
    &movq   (@T[2],@C[4]);
    &movq   (&QWP($A[4][2],"esi"),@C[2]);   # R[4][2] = C[2] ^= (~C[3] & C[4]);

    &pandn  (@T[2],@T[0]);
    &pandn  (@T[0],&QWP(@D[1],"esp"));
    &pxor   (@C[3],@T[2]);
    &pxor   (@C[4],@T[0]);
    &movq   (&QWP($A[4][3],"esi"),@C[3]);   # R[4][3] = C[3] ^= (~C[4] & D[0]);
    &sub    ("ecx",1);
    &movq   (&QWP($A[4][4],"esi"),@C[4]);   # R[4][4] = C[4] ^= (~D[0] & D[1]);
    &jnz    (&label("loop"));

    &lea    ("ebx",&DWP(-192,"ebx"));       # rewind iotas
    &ret    ();
&function_end_B("_KeccakF1600");

# KeccakF1600(A): public wrapper.  Reserves 240 bytes of 8-byte-aligned
# stack (5*8 for D[] followed by the scratch T[][] state), loads the
# iotas address position-independently and runs the permutation.
&function_begin("KeccakF1600");
    &mov    ("esi",&wparam(0));
    &mov    ("ebp","esp");
    &sub    ("esp",240);
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop("ebx");
    &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
    &and    ("esp",-8);
    &lea    ("esi",&DWP(100,"esi"));        # size optimization
    &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization

    &call   ("_KeccakF1600");

    &mov    ("esp","ebp");
    &emms   ();
&function_end("KeccakF1600");

# SHA3_absorb(A, inp, len, bsz): while at least bsz bytes remain, xor one
# bsz-byte block of input into the state 8 bytes at a time and permute.
# Returns the number of unprocessed bytes in eax.
&function_begin("SHA3_absorb");
    &mov    ("esi",&wparam(0));             # A[][]
    &mov    ("eax",&wparam(1));             # inp
    &mov    ("ecx",&wparam(2));             # len
    &mov    ("edx",&wparam(3));             # bsz
    &mov    ("ebp","esp");
    &sub    ("esp",240+8);
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop("ebx");
    &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
    &and    ("esp",-8);

    &mov    ("edi","esi");
    &lea    ("esi",&DWP(100,"esi"));        # size optimization
    &mov    (&DWP(-4,"ebp"),"edx");         # save bsz
    &jmp    (&label("loop"));

&set_label("loop",16);
    &cmp    ("ecx","edx");                  # len < bsz?
    &jc     (&label("absorbed"));

    &shr    ("edx",3);                      # bsz /= 8
&set_label("block");
    &movq   ("mm0",&QWP(0,"eax"));
    &lea    ("eax",&DWP(8,"eax"));
    &pxor   ("mm0",&QWP(0,"edi"));
    &lea    ("edi",&DWP(8,"edi"));
    &sub    ("ecx",8);                      # len -= 8
    &movq   (&QWP(-8,"edi"),"mm0");
    &dec    ("edx");                        # bsz--
    &jnz    (&label("block"));

    &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization
    &mov    (&DWP(-8,"ebp"),"ecx");         # save len
    &call   ("_KeccakF1600");
    &mov    ("ecx",&DWP(-8,"ebp"));         # pull len
    &mov    ("edx",&DWP(-4,"ebp"));         # pull bsz
    &lea    ("edi",&DWP(-100,"esi"));
    &jmp    (&label("loop"));

&set_label("absorbed",16);
    &mov    ("eax","ecx");                  # return value
    &mov    ("esp","ebp");
    &emms   ();
&function_end("SHA3_absorb");

# SHA3_squeeze(A, out, len, bsz): copy state lanes to out 8 bytes at a
# time, permuting after every bsz/8 lanes; a final partial lane (len < 8)
# is copied byte-wise with rep movsb.
&function_begin("SHA3_squeeze");
    &mov    ("esi",&wparam(0));             # A[][]
    &mov    ("eax",&wparam(1));             # out
    &mov    ("ecx",&wparam(2));             # len
    &mov    ("edx",&wparam(3));             # bsz
    &mov    ("ebp","esp");
    &sub    ("esp",240+8);
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop("ebx");
    &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
    &and    ("esp",-8);

    &shr    ("edx",3);                      # bsz /= 8
    &mov    ("edi","esi");
    &lea    ("esi",&DWP(100,"esi"));        # size optimization
    &mov    (&DWP(-4,"ebp"),"edx");         # save bsz
    &jmp    (&label("loop"));

&set_label("loop",16);
    &cmp    ("ecx",8);                      # len < 8?
    &jc     (&label("tail"));

    &movq   ("mm0",&QWP(0,"edi"));
    &lea    ("edi",&DWP(8,"edi"));
    &movq   (&QWP(0,"eax"),"mm0");
    &lea    ("eax",&DWP(8,"eax"));
    &sub    ("ecx",8);                      # len -= 8
    &jz     (&label("done"));

    &dec    ("edx");                        # bsz--
    &jnz    (&label("loop"));

    &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization
    &mov    (&DWP(-8,"ebp"),"ecx");         # save len
    &call   ("_KeccakF1600");
    &mov    ("ecx",&DWP(-8,"ebp"));         # pull len
    &mov    ("edx",&DWP(-4,"ebp"));         # pull bsz
    &lea    ("edi",&DWP(-100,"esi"));
    &jmp    (&label("loop"));

&set_label("tail",16);
    &mov    ("esi","edi");
    &mov    ("edi","eax");
    &data_word("0xA4F39066");               # rep movsb

&set_label("done");
    &mov    ("esp","ebp");
    &emms   ();
&function_end("SHA3_squeeze");

# Round constants, one 64-bit value per round stored as (low, high)
# 32-bit words; 24 entries * 8 bytes = the 192 bytes rewound above.
&set_label("iotas",32);
    &data_word(0x00000001,0x00000000);
    &data_word(0x00008082,0x00000000);
    &data_word(0x0000808a,0x80000000);
    &data_word(0x80008000,0x80000000);
    &data_word(0x0000808b,0x00000000);
    &data_word(0x80000001,0x00000000);
    &data_word(0x80008081,0x80000000);
    &data_word(0x00008009,0x80000000);
    &data_word(0x0000008a,0x00000000);
    &data_word(0x00000088,0x00000000);
    &data_word(0x80008009,0x00000000);
    &data_word(0x8000000a,0x00000000);
    &data_word(0x8000808b,0x00000000);
    &data_word(0x0000008b,0x80000000);
    &data_word(0x00008089,0x80000000);
    &data_word(0x00008003,0x80000000);
    &data_word(0x00008002,0x80000000);
    &data_word(0x00000080,0x80000000);
    &data_word(0x0000800a,0x00000000);
    &data_word(0x8000000a,0x80000000);
    &data_word(0x80008081,0x80000000);
    &data_word(0x00008080,0x80000000);
    &data_word(0x80000001,0x00000000);
    &data_word(0x80008008,0x80000000);
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";