#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl              vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3          21.9/25.2(***)
# Nehalem       27.9/40.4/18.1          10.2/11.9
# Atom          70.7/92.1/60.1          61.1/75.4(***)
# Silvermont    45.4/62.9/24.1          49.2/61.1(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is commonplace. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +28%/64% improvement on Core 2
#       and +15% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#                                               <appro@openssl.org>

# Locate the directory this script lives in so that the perlasm helper
# module can be found both next to it and under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file. It is always
# popped off @ARGV (the "386" check below looks at the new last element);
# STDOUT is redirected only when a non-empty name was given. The
# three-argument open keeps the file name from being interpreted as a
# mode/pipe specification, and a failed open now aborts instead of
# silently leaving the generated assembly on the original STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}

&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

# Constant table. The $k_* values are byte offsets that advance by 0x10
# per &data_word row; they are laid out so that offset 0 ($k_ipt) sits
# 0x30 bytes past the _vpaes_consts label, i.e. $const is expected to
# point at label+0x30 (the code that establishes $const is partly in the
# callers, which are not all visible here).
&set_label("_vpaes_consts",64);
$k_inv=-0x30;           # inv, inva
    &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
    &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;           # s0F
    &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;            # input transform (lo, hi)
    &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
    &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;            # sb1u, sb1t
    &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
    &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;            # sb2u, sb2t
    &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
    &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;            # sbou, sbot
    &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
    &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;     # mc_forward
    &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;    # mc_backward
    &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;            # sr
    &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;          # rcon
    &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;           # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;           # output transform
    &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;        # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;          # decryption key schedule: invskew x*D
    &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;          # decryption key schedule: invskew x*B
    &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;          # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;          # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;          # decryption input transform
    &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
# Decryption round-function constants, continued; offsets keep advancing
# by 0x10 per &data_word row.
$k_dsb9=0x240;          # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;          # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;          # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;          # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;          # decryption sbox final output
    &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz  ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align  (64);

##
##  _vpaes_preheat
##
##  Finishes setting up the constant pointer and loads the two
##  constants every round function needs:
##     %xmm7 = k_inv
##     %xmm6 = k_s0F
##
##  The dword at (%esp) on entry, i.e. the return address pushed by the
##  caller's call, is added to $const. Presumably the caller has
##  pre-loaded $const with the distance from that return address to the
##  table base; the callers are outside this view, so confirm there.
##
&function_begin_B("_vpaes_preheat");
    &add    ($const,&DWP(0,"esp"));
    &movdqa ("xmm7",&QWP($k_inv,$const));
    &movdqa ("xmm6",&QWP($k_s0F,$const));
    &ret    ();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The round count is read from offset 240 of the key schedule.
##  The loop's `jnz` consumes the zero flag left by the integer
##  `sub $round,1` (or by `add $key,16` on the first entry through
##  enc_entry); the SSE instructions in between do not touch EFLAGS.
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic,16);
    &mov    ($round,&DWP(240,$key));
    # Each instruction is its own statement. Without the terminating
    # ';' Perl parses two adjacent '&insn(...)' lines as one bitwise-AND
    # expression, which only emits the right code by accident.
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_ipt,$const));
    &pandn  ("xmm1","xmm0");
    &pand   ("xmm0","xmm6");
    &movdqu ("xmm5",&QWP(0,$key));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_ipt+16,$const));
    &pxor   ("xmm2","xmm5");
    &psrld  ("xmm1",4);
    &add    ($key,16);
    &pshufb ("xmm0","xmm1");
    &lea    ($base,&DWP($k_mc_backward,$const));
    &pxor   ("xmm0","xmm2");
    &jmp    (&label("enc_entry"));


&set_label("enc_loop",16);
    # middle of middle round
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm4","xmm5");                    # 4 = sb1u + k
    &movdqa ("xmm5",&QWP($k_sb2,$const));       # 5 : sb2u
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &movdqa ("xmm1",&QWP(-0x40,$base,$magic));  # .Lk_mc_forward[]
    &pshufb ("xmm5","xmm2");                    # 5 = sb2u
    &movdqa ("xmm2",&QWP($k_sb2+16,$const));    # 2 : sb2t
    &movdqa ("xmm4",&QWP(0,$base,$magic));      # .Lk_mc_backward[]
    &pshufb ("xmm2","xmm3");                    # 2 = sb2t
    &movdqa ("xmm3","xmm0");                    # 3 = A
    &pxor   ("xmm2","xmm5");                    # 2 = 2A
    &pshufb ("xmm0","xmm1");                    # 0 = B
    &add    ($key,16);                          # next key
    &pxor   ("xmm0","xmm2");                    # 0 = 2A+B
    &pshufb ("xmm3","xmm4");                    # 3 = D
    &add    ($magic,16);                        # next mc
    &pxor   ("xmm3","xmm0");                    # 3 = 2A+B+D
    &pshufb ("xmm0","xmm1");                    # 0 = 2B+C
    &and    ($magic,0x30);                      # ... mod 4
    &sub    ($round,1);                         # nr--
    &pxor   ("xmm0","xmm3");                    # 0 = 2A+3B+C+D

&set_label("enc_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm5",&QWP($k_inv+16,$const));    # 5 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm6");                    # 0 = k
    &pshufb ("xmm5","xmm0");                    # 5 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm5");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pxor   ("xmm4","xmm5");                    # 4 = jak = 1/j + a/k
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm5",&QWP(0,$key));              # 5 = next round key
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("enc_loop"));               # flags from sub/add above

    # middle of last round
    &movdqa ("xmm4",&QWP($k_sbo,$const));       # 4 : sbou      .Lk_sbo
    &movdqa ("xmm0",&QWP($k_sbo+16,$const));    # 0 : sbot      .Lk_sbo+16
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm5");                    # 4 = sbou + k
    &pshufb ("xmm0","xmm3");                    # 0 = sbot
    &movdqa ("xmm1",&QWP(0x40,$base,$magic));   # .Lk_sr[]
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
##  $base is pointed at k_dsbd, so the loop reaches the sb9/sbd/sbb/sbe
##  and sbo tables with small displacements (-0x20 .. 0x70). $magic is
##  turned into the address of a k_sr row selected from the round count.
##  As in the encryption core, `jnz` consumes the zero flag of the
##  integer `sub $round,1` (or `add $key,16` on first entry).
##
&function_begin_B("_vpaes_decrypt_core");
    &lea    ($base,&DWP($k_dsbd,$const));
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
    &pandn  ("xmm1","xmm0");
    &mov    ($magic,$round);
    # Terminated explicitly: without the ';' this line and the next one
    # are parsed by Perl as a single bitwise-AND expression.
    &psrld  ("xmm1",4);
    &movdqu ("xmm5",&QWP(0,$key));
    &shl    ($magic,4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
    &xor    ($magic,0x30);
    &pshufb ("xmm0","xmm1");
    &and    ($magic,0x30);
    &pxor   ("xmm2","xmm5");
    &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    &lea    ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4",&QWP(-0x20,$base));         # 4 : sb9u
    &movdqa ("xmm1",&QWP(-0x10,$base));         # 0 : sb9t
    &pshufb ("xmm4","xmm2");                    # 4 = sb9u
    &pshufb ("xmm1","xmm3");                    # 0 = sb9t
    &pxor   ("xmm0","xmm4");
    &movdqa ("xmm4",&QWP(0,$base));             # 4 : sbdu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x10,$base));          # 0 : sbdt

    &pshufb ("xmm4","xmm2");                    # 4 = sbdu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbdt
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &movdqa ("xmm4",&QWP(0x20,$base));          # 4 : sbbu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x30,$base));          # 0 : sbbt

    &pshufb ("xmm4","xmm2");                    # 4 = sbbu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbbt
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &movdqa ("xmm4",&QWP(0x40,$base));          # 4 : sbeu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x50,$base));          # 0 : sbet

    &pshufb ("xmm4","xmm2");                    # 4 = sbeu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbet
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &add    ($key,16);                          # next round key
    &palignr("xmm5","xmm5",12);
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &sub    ($round,1);                         # nr--

&set_label("dec_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &pand   ("xmm0","xmm6");                    # 0 = k
    &psrld  ("xmm1",4);                         # 1 = i
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm0",&QWP(0,$key));
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("dec_loop"));               # flags from sub/add above

    # middle of last round
    &movdqa ("xmm4",&QWP(0x60,$base));          # 3 : sbou
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm0");                    # 4 = sb1u + k
    &movdqa ("xmm0",&QWP(0x70,$base));          # 0 : sbot
    &movdqa ("xmm2",&QWP(0,$magic));
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  Register usage, as read from the code below:
##     $inp   (%esi) = user key, loaded unaligned
##     $key   (%edx) = output key schedule
##     $round (%eax) = key length in bits on entry (compared with 192),
##                     then reused as the loop counter
##     $out   (%edi) = zero when encrypting, non-zero when decrypting
##     $magic (%ecx) = byte offset of the current row in k_sr
##
##  The rcon vector (%xmm8 in the original comments) is kept in a
##  stack slot at 4(%esp); the 256-bit path additionally spills %xmm7
##  to 20(%esp). Both are accessed with movdqa, so the caller is
##  assumed to provide a suitably aligned frame (not visible here).
##  _vpaes_schedule_mangle is defined outside this view.
##
&function_begin_B("_vpaes_schedule_core");
    &add    ($const,&DWP(0,"esp"));
    &movdqu ("xmm0",&QWP(0,$inp));              # load key (unaligned)
    &movdqa ("xmm2",&QWP($k_rcon,$const));      # load rcon

    # input transform
    &movdqa ("xmm3","xmm0");
    &lea    ($base,&DWP($k_ipt,$const));
    &movdqa (&QWP(4,"esp"),"xmm2");             # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7","xmm0");

    &test   ($out,$out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting, output zeroth round key after transform
    &movdqu (&QWP(0,$key),"xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting, output zeroth round key after shiftrows
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &movdqu (&QWP(0,$key),"xmm3");
    &xor    ($magic,0x30);

&set_label("schedule_go");
    &cmp    ($round,192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
    &mov    ($round,10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
    &movdqu ("xmm0",&QWP(8,$inp));              # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &movdqa ("xmm6","xmm0");                    # save short part
    &pxor   ("xmm4","xmm4");                    # clear 4
    &movhlps("xmm6","xmm4");                    # clobber low side with zeros
    &mov    ($round,4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0","xmm6",8);
    &call   ("_vpaes_schedule_mangle");         # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");         # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
    &movdqu ("xmm0",&QWP(16,$inp));             # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &mov    ($round,7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");         # output low result
    &movdqa ("xmm6","xmm0");                    # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round. swap xmm7 and xmm6
    &pshufd ("xmm0","xmm0",0xFF);
    &movdqa (&QWP(20,"esp"),"xmm7");
    &movdqa ("xmm7","xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7",&QWP(20,"esp"));

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
    # schedule last round key from xmm0
    &lea    ($base,&DWP($k_deskew,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm0","xmm1");                    # output permute
    &lea    ($base,&DWP($k_opt,$const));        # prepare to output transform
    &add    ($key,32);

&set_label("schedule_mangle_last_dec");
    &add    ($key,-16);
    &pxor   ("xmm0",&QWP($k_s63,$const));
    &call   ("_vpaes_schedule_transform");      # output transform
    &movdqu (&QWP(0,$key),"xmm0");              # save last key

    # cleanup: wipe every xmm register that held key material
    &pxor   ("xmm0","xmm0");
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm2","xmm2");
    &pxor   ("xmm3","xmm3");
    &pxor   ("xmm4","xmm4");
    &pxor   ("xmm5","xmm5");
    &pxor   ("xmm6","xmm6");
    &pxor   ("xmm7","xmm7");
    &ret    ();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  No zero register is needed as input: there is no %xmm13 in 32-bit
##  mode, so the routine clears %xmm1 itself before the final movhlps.
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1.
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm1","xmm6",0x80);               # d c 0 0 -> c 0 0 0
    &pshufd ("xmm0","xmm7",0xFE);               # b a _ _ -> b b b a
    &pxor   ("xmm6","xmm1");                    # -> c+d c 0 0
    &pxor   ("xmm1","xmm1");                    # 1 = 0
    &pxor   ("xmm6","xmm0");                    # -> b+c+d b+c b a
    &movdqa ("xmm0","xmm6");
    &movhlps("xmm6","xmm1");                    # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
580e1051a39Sopenharmony_ci## Clobbers %xmm1-%xmm5. 581e1051a39Sopenharmony_ci## 582e1051a39Sopenharmony_ci&function_begin_B("_vpaes_schedule_round"); 583e1051a39Sopenharmony_ci # extract rcon from xmm8 584e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 585e1051a39Sopenharmony_ci &pxor ("xmm1","xmm1"); 586e1051a39Sopenharmony_ci &palignr("xmm1","xmm2",15); 587e1051a39Sopenharmony_ci &palignr("xmm2","xmm2",15); 588e1051a39Sopenharmony_ci &pxor ("xmm7","xmm1"); 589e1051a39Sopenharmony_ci 590e1051a39Sopenharmony_ci # rotate 591e1051a39Sopenharmony_ci &pshufd ("xmm0","xmm0",0xFF); 592e1051a39Sopenharmony_ci &palignr("xmm0","xmm0",1); 593e1051a39Sopenharmony_ci 594e1051a39Sopenharmony_ci # fall through... 595e1051a39Sopenharmony_ci &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 596e1051a39Sopenharmony_ci 597e1051a39Sopenharmony_ci # low round: same as high round, but no rotation and no rcon. 598e1051a39Sopenharmony_ci&set_label("_vpaes_schedule_low_round"); 599e1051a39Sopenharmony_ci # smear xmm7 600e1051a39Sopenharmony_ci &movdqa ("xmm1","xmm7"); 601e1051a39Sopenharmony_ci &pslldq ("xmm7",4); 602e1051a39Sopenharmony_ci &pxor ("xmm7","xmm1"); 603e1051a39Sopenharmony_ci &movdqa ("xmm1","xmm7"); 604e1051a39Sopenharmony_ci &pslldq ("xmm7",8); 605e1051a39Sopenharmony_ci &pxor ("xmm7","xmm1"); 606e1051a39Sopenharmony_ci &pxor ("xmm7",&QWP($k_s63,$const)); 607e1051a39Sopenharmony_ci 608e1051a39Sopenharmony_ci # subbyte 609e1051a39Sopenharmony_ci &movdqa ("xmm4",&QWP($k_s0F,$const)); 610e1051a39Sopenharmony_ci &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j 611e1051a39Sopenharmony_ci &movdqa ("xmm1","xmm4"); 612e1051a39Sopenharmony_ci &pandn ("xmm1","xmm0"); 613e1051a39Sopenharmony_ci &psrld ("xmm1",4); # 1 = i 614e1051a39Sopenharmony_ci &pand ("xmm0","xmm4"); # 0 = k 615e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k 616e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm0"); # 2 = a/k 617e1051a39Sopenharmony_ci &pxor ("xmm0","xmm1"); # 0 = j 
618e1051a39Sopenharmony_ci &movdqa ("xmm3","xmm5"); # 3 : 1/i 619e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); # 3 = 1/i 620e1051a39Sopenharmony_ci &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k 621e1051a39Sopenharmony_ci &movdqa ("xmm4","xmm5"); # 4 : 1/j 622e1051a39Sopenharmony_ci &pshufb ("xmm4","xmm0"); # 4 = 1/j 623e1051a39Sopenharmony_ci &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k 624e1051a39Sopenharmony_ci &movdqa ("xmm2","xmm5"); # 2 : 1/iak 625e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm3"); # 2 = 1/iak 626e1051a39Sopenharmony_ci &pxor ("xmm2","xmm0"); # 2 = io 627e1051a39Sopenharmony_ci &movdqa ("xmm3","xmm5"); # 3 : 1/jak 628e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm4"); # 3 = 1/jak 629e1051a39Sopenharmony_ci &pxor ("xmm3","xmm1"); # 3 = jo 630e1051a39Sopenharmony_ci &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou 631e1051a39Sopenharmony_ci &pshufb ("xmm4","xmm2"); # 4 = sbou 632e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot 633e1051a39Sopenharmony_ci &pshufb ("xmm0","xmm3"); # 0 = sb1t 634e1051a39Sopenharmony_ci &pxor ("xmm0","xmm4"); # 0 = sbox output 635e1051a39Sopenharmony_ci 636e1051a39Sopenharmony_ci # add in smeared stuff 637e1051a39Sopenharmony_ci &pxor ("xmm0","xmm7"); 638e1051a39Sopenharmony_ci &movdqa ("xmm7","xmm0"); 639e1051a39Sopenharmony_ci &ret (); 640e1051a39Sopenharmony_ci&function_end_B("_vpaes_schedule_round"); 641e1051a39Sopenharmony_ci 642e1051a39Sopenharmony_ci## 643e1051a39Sopenharmony_ci## .aes_schedule_transform 644e1051a39Sopenharmony_ci## 645e1051a39Sopenharmony_ci## Linear-transform %xmm0 according to tables at (%ebx) 646e1051a39Sopenharmony_ci## 647e1051a39Sopenharmony_ci## Output in %xmm0 648e1051a39Sopenharmony_ci## Clobbers %xmm1, %xmm2 649e1051a39Sopenharmony_ci## 650e1051a39Sopenharmony_ci&function_begin_B("_vpaes_schedule_transform"); 651e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP($k_s0F,$const)); 652e1051a39Sopenharmony_ci &movdqa ("xmm1","xmm2"); 653e1051a39Sopenharmony_ci &pandn 
("xmm1","xmm0"); 654e1051a39Sopenharmony_ci &psrld ("xmm1",4); 655e1051a39Sopenharmony_ci &pand ("xmm0","xmm2"); 656e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0,$base)); 657e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm0"); 658e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(16,$base)); 659e1051a39Sopenharmony_ci &pshufb ("xmm0","xmm1"); 660e1051a39Sopenharmony_ci &pxor ("xmm0","xmm2"); 661e1051a39Sopenharmony_ci &ret (); 662e1051a39Sopenharmony_ci&function_end_B("_vpaes_schedule_transform"); 663e1051a39Sopenharmony_ci 664e1051a39Sopenharmony_ci## 665e1051a39Sopenharmony_ci## .aes_schedule_mangle 666e1051a39Sopenharmony_ci## 667e1051a39Sopenharmony_ci## Mangle xmm0 from (basis-transformed) standard version 668e1051a39Sopenharmony_ci## to our version. 669e1051a39Sopenharmony_ci## 670e1051a39Sopenharmony_ci## On encrypt, 671e1051a39Sopenharmony_ci## xor with 0x63 672e1051a39Sopenharmony_ci## multiply by circulant 0,1,1,1 673e1051a39Sopenharmony_ci## apply shiftrows transform 674e1051a39Sopenharmony_ci## 675e1051a39Sopenharmony_ci## On decrypt, 676e1051a39Sopenharmony_ci## xor with 0x63 677e1051a39Sopenharmony_ci## multiply by "inverse mixcolumns" circulant E,B,D,9 678e1051a39Sopenharmony_ci## deskew 679e1051a39Sopenharmony_ci## apply shiftrows transform 680e1051a39Sopenharmony_ci## 681e1051a39Sopenharmony_ci## 682e1051a39Sopenharmony_ci## Writes out to (%edx), and increments or decrements it 683e1051a39Sopenharmony_ci## Keeps track of round number mod 4 in %ecx 684e1051a39Sopenharmony_ci## Preserves xmm0 685e1051a39Sopenharmony_ci## Clobbers xmm1-xmm5 686e1051a39Sopenharmony_ci## 687e1051a39Sopenharmony_ci&function_begin_B("_vpaes_schedule_mangle"); 688e1051a39Sopenharmony_ci &movdqa ("xmm4","xmm0"); # save xmm0 for later 689e1051a39Sopenharmony_ci &movdqa ("xmm5",&QWP($k_mc_forward,$const)); 690e1051a39Sopenharmony_ci &test ($out,$out); 691e1051a39Sopenharmony_ci &jnz (&label("schedule_mangle_dec")); 692e1051a39Sopenharmony_ci 693e1051a39Sopenharmony_ci # encrypting 
694e1051a39Sopenharmony_ci &add ($key,16); 695e1051a39Sopenharmony_ci &pxor ("xmm4",&QWP($k_s63,$const)); 696e1051a39Sopenharmony_ci &pshufb ("xmm4","xmm5"); 697e1051a39Sopenharmony_ci &movdqa ("xmm3","xmm4"); 698e1051a39Sopenharmony_ci &pshufb ("xmm4","xmm5"); 699e1051a39Sopenharmony_ci &pxor ("xmm3","xmm4"); 700e1051a39Sopenharmony_ci &pshufb ("xmm4","xmm5"); 701e1051a39Sopenharmony_ci &pxor ("xmm3","xmm4"); 702e1051a39Sopenharmony_ci 703e1051a39Sopenharmony_ci &jmp (&label("schedule_mangle_both")); 704e1051a39Sopenharmony_ci 705e1051a39Sopenharmony_ci&set_label("schedule_mangle_dec",16); 706e1051a39Sopenharmony_ci # inverse mix columns 707e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP($k_s0F,$const)); 708e1051a39Sopenharmony_ci &lea ($inp,&DWP($k_dksd,$const)); 709e1051a39Sopenharmony_ci &movdqa ("xmm1","xmm2"); 710e1051a39Sopenharmony_ci &pandn ("xmm1","xmm4"); 711e1051a39Sopenharmony_ci &psrld ("xmm1",4); # 1 = hi 712e1051a39Sopenharmony_ci &pand ("xmm4","xmm2"); # 4 = lo 713e1051a39Sopenharmony_ci 714e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0,$inp)); 715e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm4"); 716e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x10,$inp)); 717e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); 718e1051a39Sopenharmony_ci &pxor ("xmm3","xmm2"); 719e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm5"); 720e1051a39Sopenharmony_ci 721e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x20,$inp)); 722e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm4"); 723e1051a39Sopenharmony_ci &pxor ("xmm2","xmm3"); 724e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x30,$inp)); 725e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); 726e1051a39Sopenharmony_ci &pxor ("xmm3","xmm2"); 727e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm5"); 728e1051a39Sopenharmony_ci 729e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x40,$inp)); 730e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm4"); 731e1051a39Sopenharmony_ci &pxor ("xmm2","xmm3"); 732e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x50,$inp)); 
733e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); 734e1051a39Sopenharmony_ci &pxor ("xmm3","xmm2"); 735e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm5"); 736e1051a39Sopenharmony_ci 737e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x60,$inp)); 738e1051a39Sopenharmony_ci &pshufb ("xmm2","xmm4"); 739e1051a39Sopenharmony_ci &pxor ("xmm2","xmm3"); 740e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x70,$inp)); 741e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); 742e1051a39Sopenharmony_ci &pxor ("xmm3","xmm2"); 743e1051a39Sopenharmony_ci 744e1051a39Sopenharmony_ci &add ($key,-16); 745e1051a39Sopenharmony_ci 746e1051a39Sopenharmony_ci&set_label("schedule_mangle_both"); 747e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); 748e1051a39Sopenharmony_ci &pshufb ("xmm3","xmm1"); 749e1051a39Sopenharmony_ci &add ($magic,-16); 750e1051a39Sopenharmony_ci &and ($magic,0x30); 751e1051a39Sopenharmony_ci &movdqu (&QWP(0,$key),"xmm3"); 752e1051a39Sopenharmony_ci &ret (); 753e1051a39Sopenharmony_ci&function_end_B("_vpaes_schedule_mangle"); 754e1051a39Sopenharmony_ci 755e1051a39Sopenharmony_ci# 756e1051a39Sopenharmony_ci# Interface to OpenSSL 757e1051a39Sopenharmony_ci# 758e1051a39Sopenharmony_ci&function_begin("${PREFIX}_set_encrypt_key"); 759e1051a39Sopenharmony_ci &mov ($inp,&wparam(0)); # inp 760e1051a39Sopenharmony_ci &lea ($base,&DWP(-56,"esp")); 761e1051a39Sopenharmony_ci &mov ($round,&wparam(1)); # bits 762e1051a39Sopenharmony_ci &and ($base,-16); 763e1051a39Sopenharmony_ci &mov ($key,&wparam(2)); # key 764e1051a39Sopenharmony_ci &xchg ($base,"esp"); # alloca 765e1051a39Sopenharmony_ci &mov (&DWP(48,"esp"),$base); 766e1051a39Sopenharmony_ci 767e1051a39Sopenharmony_ci &mov ($base,$round); 768e1051a39Sopenharmony_ci &shr ($base,5); 769e1051a39Sopenharmony_ci &add ($base,5); 770e1051a39Sopenharmony_ci &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; 771e1051a39Sopenharmony_ci &mov ($magic,0x30); 772e1051a39Sopenharmony_ci &mov ($out,0); 
773e1051a39Sopenharmony_ci 774e1051a39Sopenharmony_ci &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); 775e1051a39Sopenharmony_ci &call ("_vpaes_schedule_core"); 776e1051a39Sopenharmony_ci&set_label("pic_point"); 777e1051a39Sopenharmony_ci 778e1051a39Sopenharmony_ci &mov ("esp",&DWP(48,"esp")); 779e1051a39Sopenharmony_ci &xor ("eax","eax"); 780e1051a39Sopenharmony_ci&function_end("${PREFIX}_set_encrypt_key"); 781e1051a39Sopenharmony_ci 782e1051a39Sopenharmony_ci&function_begin("${PREFIX}_set_decrypt_key"); 783e1051a39Sopenharmony_ci &mov ($inp,&wparam(0)); # inp 784e1051a39Sopenharmony_ci &lea ($base,&DWP(-56,"esp")); 785e1051a39Sopenharmony_ci &mov ($round,&wparam(1)); # bits 786e1051a39Sopenharmony_ci &and ($base,-16); 787e1051a39Sopenharmony_ci &mov ($key,&wparam(2)); # key 788e1051a39Sopenharmony_ci &xchg ($base,"esp"); # alloca 789e1051a39Sopenharmony_ci &mov (&DWP(48,"esp"),$base); 790e1051a39Sopenharmony_ci 791e1051a39Sopenharmony_ci &mov ($base,$round); 792e1051a39Sopenharmony_ci &shr ($base,5); 793e1051a39Sopenharmony_ci &add ($base,5); 794e1051a39Sopenharmony_ci &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; 795e1051a39Sopenharmony_ci &shl ($base,4); 796e1051a39Sopenharmony_ci &lea ($key,&DWP(16,$key,$base)); 797e1051a39Sopenharmony_ci 798e1051a39Sopenharmony_ci &mov ($out,1); 799e1051a39Sopenharmony_ci &mov ($magic,$round); 800e1051a39Sopenharmony_ci &shr ($magic,1); 801e1051a39Sopenharmony_ci &and ($magic,32); 802e1051a39Sopenharmony_ci &xor ($magic,32); # nbist==192?0:32; 803e1051a39Sopenharmony_ci 804e1051a39Sopenharmony_ci &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); 805e1051a39Sopenharmony_ci &call ("_vpaes_schedule_core"); 806e1051a39Sopenharmony_ci&set_label("pic_point"); 807e1051a39Sopenharmony_ci 808e1051a39Sopenharmony_ci &mov ("esp",&DWP(48,"esp")); 809e1051a39Sopenharmony_ci &xor ("eax","eax"); 810e1051a39Sopenharmony_ci&function_end("${PREFIX}_set_decrypt_key"); 
#
# ${PREFIX}_encrypt(inp, out, key)
#
# Single-block encryption.  Calls _vpaes_preheat first (defined earlier
# in this file; presumably it finishes the $const address and loads the
# constant registers -- confirm there), then builds a 16-byte-aligned
# frame with the caller's %esp saved at 48(%esp).  The 16 input bytes
# are loaded and stored with movdqu, i.e. with unaligned moves.
#
&function_begin("${PREFIX}_encrypt");
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# remember caller's %esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

#
# ${PREFIX}_decrypt(inp, out, key)
#
# Single-block decryption; identical in structure to ${PREFIX}_encrypt
# but calls _vpaes_decrypt_core.
#
&function_begin("${PREFIX}_decrypt");
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# remember caller's %esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

#
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# CBC over whole 16-byte blocks only:
#   - len < 16 jumps straight to cbc_abort: nothing is processed and
#     the IV is not written back;
#   - otherwise blocks are processed until the remaining length drops
#     below 16; a trailing partial block is ignored;
#   - enc == 0 selects decryption, anything else encryption;
#   - the final chaining value is written back to ivp.
#
# Frame layout (after the alloca below):
#    0(%esp)  out - inp, so output is addressed as (saved_out + $inp)
#    4(%esp)  key
#    8(%esp)  ivp
#   16(%esp)  IV for the current block    (decrypt loop only)
#   32(%esp)  ciphertext = next IV        (decrypt loop only)
#   48(%esp)  caller's %esp
#
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
	&lea	($base,&DWP(-56,"esp"));
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);
	&mov	(&DWP(48,"esp"),$base);		# remember caller's %esp

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&cmp	($magic,0);
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext becomes next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";