1370b324cSopenharmony_ci/* AesOpt.c -- AES optimized code for x86 AES hardware instructions 2370b324cSopenharmony_ci2023-04-02 : Igor Pavlov : Public domain */ 3370b324cSopenharmony_ci 4370b324cSopenharmony_ci#include "Precomp.h" 5370b324cSopenharmony_ci 6370b324cSopenharmony_ci#include "Aes.h" 7370b324cSopenharmony_ci#include "CpuArch.h" 8370b324cSopenharmony_ci 9370b324cSopenharmony_ci#ifdef MY_CPU_X86_OR_AMD64 10370b324cSopenharmony_ci 11370b324cSopenharmony_ci #if defined(__INTEL_COMPILER) 12370b324cSopenharmony_ci #if (__INTEL_COMPILER >= 1110) 13370b324cSopenharmony_ci #define USE_INTEL_AES 14370b324cSopenharmony_ci #if (__INTEL_COMPILER >= 1900) 15370b324cSopenharmony_ci #define USE_INTEL_VAES 16370b324cSopenharmony_ci #endif 17370b324cSopenharmony_ci #endif 18370b324cSopenharmony_ci #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ 19370b324cSopenharmony_ci || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) 20370b324cSopenharmony_ci #define USE_INTEL_AES 21370b324cSopenharmony_ci #if !defined(__AES__) 22370b324cSopenharmony_ci #define ATTRIB_AES __attribute__((__target__("aes"))) 23370b324cSopenharmony_ci #endif 24370b324cSopenharmony_ci #if defined(__clang__) && (__clang_major__ >= 8) \ 25370b324cSopenharmony_ci || defined(__GNUC__) && (__GNUC__ >= 8) 26370b324cSopenharmony_ci #define USE_INTEL_VAES 27370b324cSopenharmony_ci #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__) 28370b324cSopenharmony_ci #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2"))) 29370b324cSopenharmony_ci #endif 30370b324cSopenharmony_ci #endif 31370b324cSopenharmony_ci #elif defined(_MSC_VER) 32370b324cSopenharmony_ci #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) 33370b324cSopenharmony_ci #define USE_INTEL_AES 34370b324cSopenharmony_ci #if (_MSC_VER >= 1910) 35370b324cSopenharmony_ci #define USE_INTEL_VAES 36370b324cSopenharmony_ci #endif 37370b324cSopenharmony_ci #endif 38370b324cSopenharmony_ci #endif 39370b324cSopenharmony_ci 40370b324cSopenharmony_ci#ifndef ATTRIB_AES 41370b324cSopenharmony_ci #define ATTRIB_AES 42370b324cSopenharmony_ci#endif 43370b324cSopenharmony_ci#ifndef ATTRIB_VAES 44370b324cSopenharmony_ci #define ATTRIB_VAES 45370b324cSopenharmony_ci#endif 46370b324cSopenharmony_ci 47370b324cSopenharmony_ci 48370b324cSopenharmony_ci#ifdef USE_INTEL_AES 49370b324cSopenharmony_ci 50370b324cSopenharmony_ci#include <wmmintrin.h> 51370b324cSopenharmony_ci 52370b324cSopenharmony_ci#ifndef USE_INTEL_VAES 53370b324cSopenharmony_ci#define AES_TYPE_keys UInt32 54370b324cSopenharmony_ci#define AES_TYPE_data Byte 55370b324cSopenharmony_ci// #define AES_TYPE_keys __m128i 56370b324cSopenharmony_ci// #define AES_TYPE_data __m128i 57370b324cSopenharmony_ci#endif 58370b324cSopenharmony_ci 59370b324cSopenharmony_ci#define AES_FUNC_START(name) \ 60370b324cSopenharmony_ci void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) 61370b324cSopenharmony_ci // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) 62370b324cSopenharmony_ci 63370b324cSopenharmony_ci#define AES_FUNC_START2(name) \ 64370b324cSopenharmony_ciAES_FUNC_START (name); \ 65370b324cSopenharmony_ciATTRIB_AES \ 66370b324cSopenharmony_ciAES_FUNC_START (name) 67370b324cSopenharmony_ci 68370b324cSopenharmony_ci#define MM_OP(op, dest, src) dest = op(dest, src); 69370b324cSopenharmony_ci#define MM_OP_m(op, src) MM_OP(op, m, src) 70370b324cSopenharmony_ci 71370b324cSopenharmony_ci#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) 72370b324cSopenharmony_ci#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) 73370b324cSopenharmony_ci 74370b324cSopenharmony_ci 75370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Encode_HW) 76370b324cSopenharmony_ci{ 77370b324cSopenharmony_ci __m128i *p = (__m128i *)(void *)ivAes; 78370b324cSopenharmony_ci __m128i *data = (__m128i *)(void *)data8; 79370b324cSopenharmony_ci __m128i m = *p; 80370b324cSopenharmony_ci const __m128i k0 = p[2]; 81370b324cSopenharmony_ci const __m128i k1 = p[3]; 82370b324cSopenharmony_ci const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; 83370b324cSopenharmony_ci for (; numBlocks != 0; numBlocks--, data++) 84370b324cSopenharmony_ci { 85370b324cSopenharmony_ci UInt32 r = numRounds2; 86370b324cSopenharmony_ci const __m128i *w = p + 4; 87370b324cSopenharmony_ci __m128i temp = *data; 88370b324cSopenharmony_ci MM_XOR (temp, k0) 89370b324cSopenharmony_ci MM_XOR (m, temp) 90370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, k1) 91370b324cSopenharmony_ci do 92370b324cSopenharmony_ci { 93370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[0]) 94370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[1]) 95370b324cSopenharmony_ci w += 2; 96370b324cSopenharmony_ci } 97370b324cSopenharmony_ci while (--r); 98370b324cSopenharmony_ci MM_OP_m (_mm_aesenclast_si128, w[0]) 99370b324cSopenharmony_ci *data = m; 100370b324cSopenharmony_ci } 101370b324cSopenharmony_ci *p = m; 102370b324cSopenharmony_ci} 103370b324cSopenharmony_ci 104370b324cSopenharmony_ci 105370b324cSopenharmony_ci#define WOP_1(op) 106370b324cSopenharmony_ci#define WOP_2(op) WOP_1 (op) op (m1, 1) 107370b324cSopenharmony_ci#define WOP_3(op) WOP_2 (op) op (m2, 2) 108370b324cSopenharmony_ci#define WOP_4(op) WOP_3 (op) op (m3, 3) 109370b324cSopenharmony_ci#ifdef MY_CPU_AMD64 110370b324cSopenharmony_ci#define WOP_5(op) WOP_4 (op) op (m4, 4) 111370b324cSopenharmony_ci#define WOP_6(op) WOP_5 (op) op (m5, 5) 112370b324cSopenharmony_ci#define WOP_7(op) WOP_6 (op) op (m6, 6) 113370b324cSopenharmony_ci#define WOP_8(op) WOP_7 (op) op (m7, 7) 114370b324cSopenharmony_ci#endif 115370b324cSopenharmony_ci/* 116370b324cSopenharmony_ci#define WOP_9(op) WOP_8 (op) op (m8, 8); 117370b324cSopenharmony_ci#define WOP_10(op) WOP_9 (op) op (m9, 9); 118370b324cSopenharmony_ci#define WOP_11(op) WOP_10(op) op (m10, 10); 119370b324cSopenharmony_ci#define WOP_12(op) WOP_11(op) op (m11, 11); 120370b324cSopenharmony_ci#define WOP_13(op) WOP_12(op) op (m12, 12); 121370b324cSopenharmony_ci#define WOP_14(op) WOP_13(op) op (m13, 13); 122370b324cSopenharmony_ci*/ 123370b324cSopenharmony_ci 124370b324cSopenharmony_ci#ifdef MY_CPU_AMD64 125370b324cSopenharmony_ci #define NUM_WAYS 8 126370b324cSopenharmony_ci #define WOP_M1 WOP_8 127370b324cSopenharmony_ci#else 128370b324cSopenharmony_ci #define NUM_WAYS 4 129370b324cSopenharmony_ci #define WOP_M1 WOP_4 130370b324cSopenharmony_ci#endif 131370b324cSopenharmony_ci 132370b324cSopenharmony_ci#define WOP(op) op (m0, 0) WOP_M1(op) 133370b324cSopenharmony_ci 134370b324cSopenharmony_ci 135370b324cSopenharmony_ci#define DECLARE_VAR(reg, ii) __m128i reg; 136370b324cSopenharmony_ci#define LOAD_data( reg, ii) reg = data[ii]; 137370b324cSopenharmony_ci#define STORE_data( reg, ii) data[ii] = reg; 138370b324cSopenharmony_ci#if (NUM_WAYS > 1) 139370b324cSopenharmony_ci#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) 140370b324cSopenharmony_ci#endif 141370b324cSopenharmony_ci 142370b324cSopenharmony_ci#define AVX_DECLARE_VAR(reg, ii) __m256i reg; 143370b324cSopenharmony_ci#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; 144370b324cSopenharmony_ci#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; 145370b324cSopenharmony_ci#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) 146370b324cSopenharmony_ci 147370b324cSopenharmony_ci#define MM_OP_key(op, reg) MM_OP(op, reg, key); 148370b324cSopenharmony_ci 149370b324cSopenharmony_ci#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) 150370b324cSopenharmony_ci#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg) 151370b324cSopenharmony_ci#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg) 152370b324cSopenharmony_ci#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) 153370b324cSopenharmony_ci#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) 154370b324cSopenharmony_ci 155370b324cSopenharmony_ci 156370b324cSopenharmony_ci#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) 157370b324cSopenharmony_ci#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) 158370b324cSopenharmony_ci#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) 159370b324cSopenharmony_ci#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) 160370b324cSopenharmony_ci#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) 161370b324cSopenharmony_ci 162370b324cSopenharmony_ci#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; 163370b324cSopenharmony_ci#define CTR_END( reg, ii) MM_XOR (data[ii], reg) 164370b324cSopenharmony_ci 165370b324cSopenharmony_ci#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); 166370b324cSopenharmony_ci#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) 167370b324cSopenharmony_ci 168370b324cSopenharmony_ci#define WOP_KEY(op, n) { \ 169370b324cSopenharmony_ci const __m128i key = w[n]; \ 170370b324cSopenharmony_ci WOP(op); } 171370b324cSopenharmony_ci 172370b324cSopenharmony_ci#define AVX_WOP_KEY(op, n) { \ 173370b324cSopenharmony_ci const __m256i key = w[n]; \ 174370b324cSopenharmony_ci WOP(op); } 175370b324cSopenharmony_ci 176370b324cSopenharmony_ci 177370b324cSopenharmony_ci#define WIDE_LOOP_START \ 178370b324cSopenharmony_ci dataEnd = data + numBlocks; \ 179370b324cSopenharmony_ci if (numBlocks >= NUM_WAYS) \ 180370b324cSopenharmony_ci { dataEnd -= NUM_WAYS; do { \ 181370b324cSopenharmony_ci 182370b324cSopenharmony_ci 183370b324cSopenharmony_ci#define WIDE_LOOP_END \ 184370b324cSopenharmony_ci data += NUM_WAYS; \ 185370b324cSopenharmony_ci } while (data <= dataEnd); \ 186370b324cSopenharmony_ci dataEnd += NUM_WAYS; } \ 187370b324cSopenharmony_ci 188370b324cSopenharmony_ci 189370b324cSopenharmony_ci#define SINGLE_LOOP \ 190370b324cSopenharmony_ci for (; data < dataEnd; data++) 191370b324cSopenharmony_ci 192370b324cSopenharmony_ci 193370b324cSopenharmony_ci#define NUM_AES_KEYS_MAX 15 194370b324cSopenharmony_ci 195370b324cSopenharmony_ci#define WIDE_LOOP_START_AVX(OP) \ 196370b324cSopenharmony_ci dataEnd = data + numBlocks; \ 197370b324cSopenharmony_ci if (numBlocks >= NUM_WAYS * 2) \ 198370b324cSopenharmony_ci { __m256i keys[NUM_AES_KEYS_MAX]; \ 199370b324cSopenharmony_ci UInt32 ii; \ 200370b324cSopenharmony_ci OP \ 201370b324cSopenharmony_ci for (ii = 0; ii < numRounds; ii++) \ 202370b324cSopenharmony_ci keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ 203370b324cSopenharmony_ci dataEnd -= NUM_WAYS * 2; do { \ 204370b324cSopenharmony_ci 205370b324cSopenharmony_ci 206370b324cSopenharmony_ci#define WIDE_LOOP_END_AVX(OP) \ 207370b324cSopenharmony_ci data += NUM_WAYS * 2; \ 208370b324cSopenharmony_ci } while (data <= dataEnd); \ 209370b324cSopenharmony_ci dataEnd += NUM_WAYS * 2; \ 210370b324cSopenharmony_ci OP \ 211370b324cSopenharmony_ci _mm256_zeroupper(); \ 212370b324cSopenharmony_ci } \ 213370b324cSopenharmony_ci 214370b324cSopenharmony_ci/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, 215370b324cSopenharmony_ci MSVC still can insert vzeroupper instruction. */ 216370b324cSopenharmony_ci 217370b324cSopenharmony_ci 218370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Decode_HW) 219370b324cSopenharmony_ci{ 220370b324cSopenharmony_ci __m128i *p = (__m128i *)(void *)ivAes; 221370b324cSopenharmony_ci __m128i *data = (__m128i *)(void *)data8; 222370b324cSopenharmony_ci __m128i iv = *p; 223370b324cSopenharmony_ci const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; 224370b324cSopenharmony_ci const __m128i *dataEnd; 225370b324cSopenharmony_ci p += 2; 226370b324cSopenharmony_ci 227370b324cSopenharmony_ci WIDE_LOOP_START 228370b324cSopenharmony_ci { 229370b324cSopenharmony_ci const __m128i *w = wStart; 230370b324cSopenharmony_ci 231370b324cSopenharmony_ci WOP (DECLARE_VAR) 232370b324cSopenharmony_ci WOP (LOAD_data) 233370b324cSopenharmony_ci WOP_KEY (AES_XOR, 1) 234370b324cSopenharmony_ci 235370b324cSopenharmony_ci do 236370b324cSopenharmony_ci { 237370b324cSopenharmony_ci WOP_KEY (AES_DEC, 0) 238370b324cSopenharmony_ci w--; 239370b324cSopenharmony_ci } 240370b324cSopenharmony_ci while (w != p); 241370b324cSopenharmony_ci WOP_KEY (AES_DEC_LAST, 0) 242370b324cSopenharmony_ci 243370b324cSopenharmony_ci MM_XOR (m0, iv) 244370b324cSopenharmony_ci WOP_M1 (XOR_data_M1) 245370b324cSopenharmony_ci iv = data[NUM_WAYS - 1]; 246370b324cSopenharmony_ci WOP (STORE_data) 247370b324cSopenharmony_ci } 248370b324cSopenharmony_ci WIDE_LOOP_END 249370b324cSopenharmony_ci 250370b324cSopenharmony_ci SINGLE_LOOP 251370b324cSopenharmony_ci { 252370b324cSopenharmony_ci const __m128i *w = wStart - 1; 253370b324cSopenharmony_ci __m128i m = _mm_xor_si128 (w[2], *data); 254370b324cSopenharmony_ci do 255370b324cSopenharmony_ci { 256370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[1]) 257370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[0]) 258370b324cSopenharmony_ci w -= 2; 259370b324cSopenharmony_ci } 260370b324cSopenharmony_ci while (w != p); 261370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[1]) 262370b324cSopenharmony_ci MM_OP_m (_mm_aesdeclast_si128, w[0]) 263370b324cSopenharmony_ci 264370b324cSopenharmony_ci MM_XOR (m, iv) 265370b324cSopenharmony_ci iv = *data; 266370b324cSopenharmony_ci *data = m; 267370b324cSopenharmony_ci } 268370b324cSopenharmony_ci 269370b324cSopenharmony_ci p[-2] = iv; 270370b324cSopenharmony_ci} 271370b324cSopenharmony_ci 272370b324cSopenharmony_ci 273370b324cSopenharmony_ciAES_FUNC_START2 (AesCtr_Code_HW) 274370b324cSopenharmony_ci{ 275370b324cSopenharmony_ci __m128i *p = (__m128i *)(void *)ivAes; 276370b324cSopenharmony_ci __m128i *data = (__m128i *)(void *)data8; 277370b324cSopenharmony_ci __m128i ctr = *p; 278370b324cSopenharmony_ci UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; 279370b324cSopenharmony_ci const __m128i *dataEnd; 280370b324cSopenharmony_ci __m128i one = _mm_cvtsi32_si128(1); 281370b324cSopenharmony_ci 282370b324cSopenharmony_ci p += 2; 283370b324cSopenharmony_ci 284370b324cSopenharmony_ci WIDE_LOOP_START 285370b324cSopenharmony_ci { 286370b324cSopenharmony_ci const __m128i *w = p; 287370b324cSopenharmony_ci UInt32 r = numRoundsMinus2; 288370b324cSopenharmony_ci WOP (DECLARE_VAR) 289370b324cSopenharmony_ci WOP (CTR_START) 290370b324cSopenharmony_ci WOP_KEY (AES_XOR, 0) 291370b324cSopenharmony_ci w += 1; 292370b324cSopenharmony_ci do 293370b324cSopenharmony_ci { 294370b324cSopenharmony_ci WOP_KEY (AES_ENC, 0) 295370b324cSopenharmony_ci w += 1; 296370b324cSopenharmony_ci } 297370b324cSopenharmony_ci while (--r); 298370b324cSopenharmony_ci WOP_KEY (AES_ENC_LAST, 0) 299370b324cSopenharmony_ci 300370b324cSopenharmony_ci WOP (CTR_END) 301370b324cSopenharmony_ci } 302370b324cSopenharmony_ci WIDE_LOOP_END 303370b324cSopenharmony_ci 304370b324cSopenharmony_ci SINGLE_LOOP 305370b324cSopenharmony_ci { 306370b324cSopenharmony_ci UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; 307370b324cSopenharmony_ci const __m128i *w = p; 308370b324cSopenharmony_ci __m128i m; 309370b324cSopenharmony_ci MM_OP (_mm_add_epi64, ctr, one) 310370b324cSopenharmony_ci m = _mm_xor_si128 (ctr, p[0]); 311370b324cSopenharmony_ci w += 1; 312370b324cSopenharmony_ci do 313370b324cSopenharmony_ci { 314370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[0]) 315370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[1]) 316370b324cSopenharmony_ci w += 2; 317370b324cSopenharmony_ci } 318370b324cSopenharmony_ci while (--numRounds2); 319370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[0]) 320370b324cSopenharmony_ci MM_OP_m (_mm_aesenclast_si128, w[1]) 321370b324cSopenharmony_ci MM_XOR (*data, m) 322370b324cSopenharmony_ci } 323370b324cSopenharmony_ci 324370b324cSopenharmony_ci p[-2] = ctr; 325370b324cSopenharmony_ci} 326370b324cSopenharmony_ci 327370b324cSopenharmony_ci 328370b324cSopenharmony_ci 329370b324cSopenharmony_ci#ifdef USE_INTEL_VAES 330370b324cSopenharmony_ci 331370b324cSopenharmony_ci/* 332370b324cSopenharmony_ciGCC before 2013-Jun: 333370b324cSopenharmony_ci <immintrin.h>: 334370b324cSopenharmony_ci #ifdef __AVX__ 335370b324cSopenharmony_ci #include <avxintrin.h> 336370b324cSopenharmony_ci #endif 337370b324cSopenharmony_ciGCC after 2013-Jun: 338370b324cSopenharmony_ci <immintrin.h>: 339370b324cSopenharmony_ci #include <avxintrin.h> 340370b324cSopenharmony_ciCLANG 3.8+: 341370b324cSopenharmony_ci{ 342370b324cSopenharmony_ci <immintrin.h>: 343370b324cSopenharmony_ci #if !defined(_MSC_VER) || defined(__AVX__) 344370b324cSopenharmony_ci #include <avxintrin.h> 345370b324cSopenharmony_ci #endif 346370b324cSopenharmony_ci 347370b324cSopenharmony_ci if (the compiler is clang for Windows and if global arch is not set for __AVX__) 348370b324cSopenharmony_ci [ if (defined(_MSC_VER) && !defined(__AVX__)) ] 349370b324cSopenharmony_ci { 350370b324cSopenharmony_ci <immintrin.h> doesn't include <avxintrin.h> 351370b324cSopenharmony_ci and we have 2 ways to fix it: 352370b324cSopenharmony_ci 1) we can define required __AVX__ before <immintrin.h> 353370b324cSopenharmony_ci or 354370b324cSopenharmony_ci 2) we can include <avxintrin.h> after <immintrin.h> 355370b324cSopenharmony_ci } 356370b324cSopenharmony_ci} 357370b324cSopenharmony_ci 358370b324cSopenharmony_ciIf we include <avxintrin.h> manually for GCC/CLANG, it's 359370b324cSopenharmony_cirequired that <immintrin.h> must be included before <avxintrin.h>. 360370b324cSopenharmony_ci*/ 361370b324cSopenharmony_ci 362370b324cSopenharmony_ci/* 363370b324cSopenharmony_ci#if defined(__clang__) && defined(_MSC_VER) 364370b324cSopenharmony_ci#define __AVX__ 365370b324cSopenharmony_ci#define __AVX2__ 366370b324cSopenharmony_ci#define __VAES__ 367370b324cSopenharmony_ci#endif 368370b324cSopenharmony_ci*/ 369370b324cSopenharmony_ci 370370b324cSopenharmony_ci#include <immintrin.h> 371370b324cSopenharmony_ci#if defined(__clang__) && defined(_MSC_VER) 372370b324cSopenharmony_ci #if !defined(__AVX__) 373370b324cSopenharmony_ci #include <avxintrin.h> 374370b324cSopenharmony_ci #endif 375370b324cSopenharmony_ci #if !defined(__AVX2__) 376370b324cSopenharmony_ci #include <avx2intrin.h> 377370b324cSopenharmony_ci #endif 378370b324cSopenharmony_ci #if !defined(__VAES__) 379370b324cSopenharmony_ci #include <vaesintrin.h> 380370b324cSopenharmony_ci #endif 381370b324cSopenharmony_ci#endif // __clang__ && _MSC_VER 382370b324cSopenharmony_ci 383370b324cSopenharmony_ci 384370b324cSopenharmony_ci#define VAES_FUNC_START2(name) \ 385370b324cSopenharmony_ciAES_FUNC_START (name); \ 386370b324cSopenharmony_ciATTRIB_VAES \ 387370b324cSopenharmony_ciAES_FUNC_START (name) 388370b324cSopenharmony_ci 389370b324cSopenharmony_ciVAES_FUNC_START2 (AesCbc_Decode_HW_256) 390370b324cSopenharmony_ci{ 391370b324cSopenharmony_ci __m128i *p = (__m128i *)(void *)ivAes; 392370b324cSopenharmony_ci __m128i *data = (__m128i *)(void *)data8; 393370b324cSopenharmony_ci __m128i iv = *p; 394370b324cSopenharmony_ci const __m128i *dataEnd; 395370b324cSopenharmony_ci UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 396370b324cSopenharmony_ci p += 2; 397370b324cSopenharmony_ci 398370b324cSopenharmony_ci WIDE_LOOP_START_AVX(;) 399370b324cSopenharmony_ci { 400370b324cSopenharmony_ci const __m256i *w = keys + numRounds - 2; 401370b324cSopenharmony_ci 402370b324cSopenharmony_ci WOP (AVX_DECLARE_VAR) 403370b324cSopenharmony_ci WOP (AVX_LOAD_data) 404370b324cSopenharmony_ci AVX_WOP_KEY (AVX_AES_XOR, 1) 405370b324cSopenharmony_ci 406370b324cSopenharmony_ci do 407370b324cSopenharmony_ci { 408370b324cSopenharmony_ci AVX_WOP_KEY (AVX_AES_DEC, 0) 409370b324cSopenharmony_ci w--; 410370b324cSopenharmony_ci } 411370b324cSopenharmony_ci while (w != keys); 412370b324cSopenharmony_ci AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) 413370b324cSopenharmony_ci 414370b324cSopenharmony_ci AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) 415370b324cSopenharmony_ci WOP_M1 (AVX_XOR_data_M1) 416370b324cSopenharmony_ci iv = data[NUM_WAYS * 2 - 1]; 417370b324cSopenharmony_ci WOP (AVX_STORE_data) 418370b324cSopenharmony_ci } 419370b324cSopenharmony_ci WIDE_LOOP_END_AVX(;) 420370b324cSopenharmony_ci 421370b324cSopenharmony_ci SINGLE_LOOP 422370b324cSopenharmony_ci { 423370b324cSopenharmony_ci const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; 424370b324cSopenharmony_ci __m128i m = _mm_xor_si128 (w[2], *data); 425370b324cSopenharmony_ci do 426370b324cSopenharmony_ci { 427370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[1]) 428370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[0]) 429370b324cSopenharmony_ci w -= 2; 430370b324cSopenharmony_ci } 431370b324cSopenharmony_ci while (w != p); 432370b324cSopenharmony_ci MM_OP_m (_mm_aesdec_si128, w[1]) 433370b324cSopenharmony_ci MM_OP_m (_mm_aesdeclast_si128, w[0]) 434370b324cSopenharmony_ci 435370b324cSopenharmony_ci MM_XOR (m, iv) 436370b324cSopenharmony_ci iv = *data; 437370b324cSopenharmony_ci *data = m; 438370b324cSopenharmony_ci } 439370b324cSopenharmony_ci 440370b324cSopenharmony_ci p[-2] = iv; 441370b324cSopenharmony_ci} 442370b324cSopenharmony_ci 443370b324cSopenharmony_ci 444370b324cSopenharmony_ci/* 445370b324cSopenharmony_ciSSE2: _mm_cvtsi32_si128 : movd 446370b324cSopenharmony_ciAVX: _mm256_setr_m128i : vinsertf128 447370b324cSopenharmony_ciAVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm 448370b324cSopenharmony_ci _mm256_extracti128_si256 : vextracti128 449370b324cSopenharmony_ci _mm256_broadcastsi128_si256 : vbroadcasti128 450370b324cSopenharmony_ci*/ 451370b324cSopenharmony_ci 452370b324cSopenharmony_ci#define AVX_CTR_LOOP_START \ 453370b324cSopenharmony_ci ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ 454370b324cSopenharmony_ci two = _mm256_setr_m128i(one, one); \ 455370b324cSopenharmony_ci two = _mm256_add_epi64(two, two); \ 456370b324cSopenharmony_ci 457370b324cSopenharmony_ci// two = _mm256_setr_epi64x(2, 0, 2, 0); 458370b324cSopenharmony_ci 459370b324cSopenharmony_ci#define AVX_CTR_LOOP_ENC \ 460370b324cSopenharmony_ci ctr = _mm256_extracti128_si256 (ctr2, 1); \ 461370b324cSopenharmony_ci 462370b324cSopenharmony_ciVAES_FUNC_START2 (AesCtr_Code_HW_256) 463370b324cSopenharmony_ci{ 464370b324cSopenharmony_ci __m128i *p = (__m128i *)(void *)ivAes; 465370b324cSopenharmony_ci __m128i *data = (__m128i *)(void *)data8; 466370b324cSopenharmony_ci __m128i ctr = *p; 467370b324cSopenharmony_ci UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 468370b324cSopenharmony_ci const __m128i *dataEnd; 469370b324cSopenharmony_ci __m128i one = _mm_cvtsi32_si128(1); 470370b324cSopenharmony_ci __m256i ctr2, two; 471370b324cSopenharmony_ci p += 2; 472370b324cSopenharmony_ci 473370b324cSopenharmony_ci WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START) 474370b324cSopenharmony_ci { 475370b324cSopenharmony_ci const __m256i *w = keys; 476370b324cSopenharmony_ci UInt32 r = numRounds - 2; 477370b324cSopenharmony_ci WOP (AVX_DECLARE_VAR) 478370b324cSopenharmony_ci AVX_WOP_KEY (AVX_CTR_START, 0) 479370b324cSopenharmony_ci 480370b324cSopenharmony_ci w += 1; 481370b324cSopenharmony_ci do 482370b324cSopenharmony_ci { 483370b324cSopenharmony_ci AVX_WOP_KEY (AVX_AES_ENC, 0) 484370b324cSopenharmony_ci w += 1; 485370b324cSopenharmony_ci } 486370b324cSopenharmony_ci while (--r); 487370b324cSopenharmony_ci AVX_WOP_KEY (AVX_AES_ENC_LAST, 0) 488370b324cSopenharmony_ci 489370b324cSopenharmony_ci WOP (AVX_CTR_END) 490370b324cSopenharmony_ci } 491370b324cSopenharmony_ci WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC) 492370b324cSopenharmony_ci 493370b324cSopenharmony_ci SINGLE_LOOP 494370b324cSopenharmony_ci { 495370b324cSopenharmony_ci UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; 496370b324cSopenharmony_ci const __m128i *w = p; 497370b324cSopenharmony_ci __m128i m; 498370b324cSopenharmony_ci MM_OP (_mm_add_epi64, ctr, one) 499370b324cSopenharmony_ci m = _mm_xor_si128 (ctr, p[0]); 500370b324cSopenharmony_ci w += 1; 501370b324cSopenharmony_ci do 502370b324cSopenharmony_ci { 503370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[0]) 504370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[1]) 505370b324cSopenharmony_ci w += 2; 506370b324cSopenharmony_ci } 507370b324cSopenharmony_ci while (--numRounds2); 508370b324cSopenharmony_ci MM_OP_m (_mm_aesenc_si128, w[0]) 509370b324cSopenharmony_ci MM_OP_m (_mm_aesenclast_si128, w[1]) 510370b324cSopenharmony_ci MM_XOR (*data, m) 511370b324cSopenharmony_ci } 512370b324cSopenharmony_ci 513370b324cSopenharmony_ci p[-2] = ctr; 514370b324cSopenharmony_ci} 515370b324cSopenharmony_ci 516370b324cSopenharmony_ci#endif // USE_INTEL_VAES 517370b324cSopenharmony_ci 518370b324cSopenharmony_ci#else // USE_INTEL_AES 519370b324cSopenharmony_ci 520370b324cSopenharmony_ci/* no USE_INTEL_AES */ 521370b324cSopenharmony_ci 522370b324cSopenharmony_ci#pragma message("AES HW_SW stub was used") 523370b324cSopenharmony_ci 524370b324cSopenharmony_ci#define AES_TYPE_keys UInt32 525370b324cSopenharmony_ci#define AES_TYPE_data Byte 526370b324cSopenharmony_ci 527370b324cSopenharmony_ci#define AES_FUNC_START(name) \ 528370b324cSopenharmony_ci void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ 529370b324cSopenharmony_ci 530370b324cSopenharmony_ci#define AES_COMPAT_STUB(name) \ 531370b324cSopenharmony_ci AES_FUNC_START(name); \ 532370b324cSopenharmony_ci AES_FUNC_START(name ## _HW) \ 533370b324cSopenharmony_ci { name(p, data, numBlocks); } 534370b324cSopenharmony_ci 535370b324cSopenharmony_ciAES_COMPAT_STUB (AesCbc_Encode) 536370b324cSopenharmony_ciAES_COMPAT_STUB (AesCbc_Decode) 537370b324cSopenharmony_ciAES_COMPAT_STUB (AesCtr_Code) 538370b324cSopenharmony_ci 539370b324cSopenharmony_ci#endif // USE_INTEL_AES 540370b324cSopenharmony_ci 541370b324cSopenharmony_ci 542370b324cSopenharmony_ci#ifndef USE_INTEL_VAES 543370b324cSopenharmony_ci 544370b324cSopenharmony_ci#pragma message("VAES HW_SW stub was used") 545370b324cSopenharmony_ci 546370b324cSopenharmony_ci#define VAES_COMPAT_STUB(name) \ 547370b324cSopenharmony_ci void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ 548370b324cSopenharmony_ci void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ 549370b324cSopenharmony_ci { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } 550370b324cSopenharmony_ci 551370b324cSopenharmony_ciVAES_COMPAT_STUB (AesCbc_Decode_HW) 552370b324cSopenharmony_ciVAES_COMPAT_STUB (AesCtr_Code_HW) 553370b324cSopenharmony_ci 554370b324cSopenharmony_ci#endif // ! USE_INTEL_VAES 555370b324cSopenharmony_ci 556370b324cSopenharmony_ci 557370b324cSopenharmony_ci#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) 558370b324cSopenharmony_ci 559370b324cSopenharmony_ci #if defined(__clang__) 560370b324cSopenharmony_ci #if (__clang_major__ >= 8) // fix that check 561370b324cSopenharmony_ci #define USE_HW_AES 562370b324cSopenharmony_ci #endif 563370b324cSopenharmony_ci #elif defined(__GNUC__) 564370b324cSopenharmony_ci #if (__GNUC__ >= 6) // fix that check 565370b324cSopenharmony_ci #define USE_HW_AES 566370b324cSopenharmony_ci #endif 567370b324cSopenharmony_ci #elif defined(_MSC_VER) 568370b324cSopenharmony_ci #if _MSC_VER >= 1910 569370b324cSopenharmony_ci #define USE_HW_AES 570370b324cSopenharmony_ci #endif 571370b324cSopenharmony_ci #endif 572370b324cSopenharmony_ci 573370b324cSopenharmony_ci#ifdef USE_HW_AES 574370b324cSopenharmony_ci 575370b324cSopenharmony_ci// #pragma message("=== AES HW === ") 576370b324cSopenharmony_ci 577370b324cSopenharmony_ci#if defined(__clang__) || defined(__GNUC__) 578370b324cSopenharmony_ci #ifdef MY_CPU_ARM64 579370b324cSopenharmony_ci #define ATTRIB_AES __attribute__((__target__("+crypto"))) 580370b324cSopenharmony_ci #else 581370b324cSopenharmony_ci #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 582370b324cSopenharmony_ci #endif 583370b324cSopenharmony_ci#else 584370b324cSopenharmony_ci // _MSC_VER 585370b324cSopenharmony_ci // for arm32 586370b324cSopenharmony_ci #define _ARM_USE_NEW_NEON_INTRINSICS 587370b324cSopenharmony_ci#endif 588370b324cSopenharmony_ci 589370b324cSopenharmony_ci#ifndef ATTRIB_AES 590370b324cSopenharmony_ci #define ATTRIB_AES 591370b324cSopenharmony_ci#endif 592370b324cSopenharmony_ci 593370b324cSopenharmony_ci#if defined(_MSC_VER) && defined(MY_CPU_ARM64) 594370b324cSopenharmony_ci#include <arm64_neon.h> 595370b324cSopenharmony_ci#else 596370b324cSopenharmony_ci#include <arm_neon.h> 597370b324cSopenharmony_ci#endif 598370b324cSopenharmony_ci 599370b324cSopenharmony_citypedef uint8x16_t v128; 600370b324cSopenharmony_ci 601370b324cSopenharmony_ci#define AES_FUNC_START(name) \ 602370b324cSopenharmony_ci void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) 603370b324cSopenharmony_ci // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks) 604370b324cSopenharmony_ci 605370b324cSopenharmony_ci#define AES_FUNC_START2(name) \ 606370b324cSopenharmony_ciAES_FUNC_START (name); \ 607370b324cSopenharmony_ciATTRIB_AES \ 608370b324cSopenharmony_ciAES_FUNC_START (name) 609370b324cSopenharmony_ci 610370b324cSopenharmony_ci#define MM_OP(op, dest, src) dest = op(dest, src); 611370b324cSopenharmony_ci#define MM_OP_m(op, src) MM_OP(op, m, src) 612370b324cSopenharmony_ci#define MM_OP1_m(op) m = op(m); 613370b324cSopenharmony_ci 614370b324cSopenharmony_ci#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src) 615370b324cSopenharmony_ci#define MM_XOR_m( src) MM_XOR(m, src) 616370b324cSopenharmony_ci 617370b324cSopenharmony_ci#define AES_E_m(k) MM_OP_m (vaeseq_u8, k) 618370b324cSopenharmony_ci#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8) 619370b324cSopenharmony_ci 620370b324cSopenharmony_ci 621370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Encode_HW) 622370b324cSopenharmony_ci{ 623370b324cSopenharmony_ci v128 *p = (v128*)(void*)ivAes; 624370b324cSopenharmony_ci v128 *data = (v128*)(void*)data8; 625370b324cSopenharmony_ci v128 m = *p; 626370b324cSopenharmony_ci const v128 k0 = p[2]; 627370b324cSopenharmony_ci const v128 k1 = p[3]; 628370b324cSopenharmony_ci const v128 k2 = p[4]; 629370b324cSopenharmony_ci const v128 k3 = p[5]; 630370b324cSopenharmony_ci const v128 k4 = p[6]; 631370b324cSopenharmony_ci const v128 k5 = p[7]; 632370b324cSopenharmony_ci const v128 k6 = p[8]; 633370b324cSopenharmony_ci const v128 k7 = p[9]; 634370b324cSopenharmony_ci const v128 k8 = p[10]; 635370b324cSopenharmony_ci const v128 k9 = p[11]; 636370b324cSopenharmony_ci const UInt32 numRounds2 = *(const UInt32 *)(p + 1); 637370b324cSopenharmony_ci const v128 *w = p + ((size_t)numRounds2 * 2); 638370b324cSopenharmony_ci const v128 k_z1 = w[1]; 639370b324cSopenharmony_ci const v128 k_z0 = w[2]; 640370b324cSopenharmony_ci for (; numBlocks != 0; numBlocks--, data++) 641370b324cSopenharmony_ci { 642370b324cSopenharmony_ci MM_XOR_m (*data); 643370b324cSopenharmony_ci AES_E_MC_m (k0) 644370b324cSopenharmony_ci AES_E_MC_m (k1) 645370b324cSopenharmony_ci AES_E_MC_m (k2) 646370b324cSopenharmony_ci AES_E_MC_m (k3) 647370b324cSopenharmony_ci AES_E_MC_m (k4) 648370b324cSopenharmony_ci AES_E_MC_m (k5) 649370b324cSopenharmony_ci AES_E_MC_m (k6) 650370b324cSopenharmony_ci AES_E_MC_m (k7) 651370b324cSopenharmony_ci AES_E_MC_m (k8) 652370b324cSopenharmony_ci if (numRounds2 >= 6) 653370b324cSopenharmony_ci { 654370b324cSopenharmony_ci AES_E_MC_m (k9) 655370b324cSopenharmony_ci AES_E_MC_m (p[12]) 656370b324cSopenharmony_ci if (numRounds2 != 6) 657370b324cSopenharmony_ci { 658370b324cSopenharmony_ci AES_E_MC_m (p[13]) 659370b324cSopenharmony_ci AES_E_MC_m (p[14]) 660370b324cSopenharmony_ci } 661370b324cSopenharmony_ci } 662370b324cSopenharmony_ci AES_E_m (k_z1) 663370b324cSopenharmony_ci MM_XOR_m (k_z0); 664370b324cSopenharmony_ci *data = m; 665370b324cSopenharmony_ci } 666370b324cSopenharmony_ci *p = m; 667370b324cSopenharmony_ci} 668370b324cSopenharmony_ci 669370b324cSopenharmony_ci 670370b324cSopenharmony_ci#define WOP_1(op) 671370b324cSopenharmony_ci#define WOP_2(op) WOP_1 (op) op (m1, 1) 672370b324cSopenharmony_ci#define WOP_3(op) WOP_2 (op) op (m2, 2) 673370b324cSopenharmony_ci#define WOP_4(op) WOP_3 (op) op (m3, 3) 674370b324cSopenharmony_ci#define WOP_5(op) WOP_4 (op) op (m4, 4) 675370b324cSopenharmony_ci#define WOP_6(op) WOP_5 (op) op (m5, 5) 676370b324cSopenharmony_ci#define WOP_7(op) WOP_6 (op) op (m6, 6) 677370b324cSopenharmony_ci#define WOP_8(op) WOP_7 (op) op (m7, 7) 678370b324cSopenharmony_ci 679370b324cSopenharmony_ci #define NUM_WAYS 8 680370b324cSopenharmony_ci #define WOP_M1 WOP_8 681370b324cSopenharmony_ci 682370b324cSopenharmony_ci#define WOP(op) op (m0, 0) WOP_M1(op) 683370b324cSopenharmony_ci 684370b324cSopenharmony_ci#define DECLARE_VAR(reg, ii) v128 reg; 685370b324cSopenharmony_ci#define LOAD_data( reg, ii) reg = data[ii]; 686370b324cSopenharmony_ci#define STORE_data( reg, ii) data[ii] = reg; 687370b324cSopenharmony_ci#if (NUM_WAYS > 1) 688370b324cSopenharmony_ci#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) 689370b324cSopenharmony_ci#endif 690370b324cSopenharmony_ci 691370b324cSopenharmony_ci#define MM_OP_key(op, reg) MM_OP (op, reg, key) 692370b324cSopenharmony_ci 693370b324cSopenharmony_ci#define AES_D_m(k) MM_OP_m (vaesdq_u8, k) 694370b324cSopenharmony_ci#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8) 695370b324cSopenharmony_ci 696370b324cSopenharmony_ci#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) 697370b324cSopenharmony_ci#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) 698370b324cSopenharmony_ci#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) 699370b324cSopenharmony_ci 700370b324cSopenharmony_ci#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg); 701370b324cSopenharmony_ci#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg); 702370b324cSopenharmony_ci 703370b324cSopenharmony_ci#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr); 704370b324cSopenharmony_ci#define CTR_END( reg, ii) MM_XOR (data[ii], reg) 705370b324cSopenharmony_ci 706370b324cSopenharmony_ci#define WOP_KEY(op, n) { \ 707370b324cSopenharmony_ci const v128 key = w[n]; \ 708370b324cSopenharmony_ci WOP(op) } 709370b324cSopenharmony_ci 710370b324cSopenharmony_ci#define WIDE_LOOP_START \ 711370b324cSopenharmony_ci dataEnd = data + numBlocks; \ 712370b324cSopenharmony_ci if (numBlocks >= NUM_WAYS) \ 713370b324cSopenharmony_ci { dataEnd -= NUM_WAYS; do { \ 714370b324cSopenharmony_ci 715370b324cSopenharmony_ci#define WIDE_LOOP_END \ 716370b324cSopenharmony_ci data += NUM_WAYS; \ 717370b324cSopenharmony_ci } while (data <= dataEnd); \ 718370b324cSopenharmony_ci dataEnd += NUM_WAYS; } \ 719370b324cSopenharmony_ci 720370b324cSopenharmony_ci#define SINGLE_LOOP \ 721370b324cSopenharmony_ci for (; data < dataEnd; data++) 722370b324cSopenharmony_ci 723370b324cSopenharmony_ci 724370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Decode_HW) 725370b324cSopenharmony_ci{ 726370b324cSopenharmony_ci v128 *p = (v128*)(void*)ivAes; 727370b324cSopenharmony_ci v128 *data = (v128*)(void*)data8; 728370b324cSopenharmony_ci v128 iv = *p; 729370b324cSopenharmony_ci const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 730370b324cSopenharmony_ci const v128 *dataEnd; 731370b324cSopenharmony_ci p += 2; 732370b324cSopenharmony_ci 733370b324cSopenharmony_ci WIDE_LOOP_START 734370b324cSopenharmony_ci { 735370b324cSopenharmony_ci const v128 *w = wStart; 736370b324cSopenharmony_ci WOP (DECLARE_VAR) 737370b324cSopenharmony_ci WOP (LOAD_data) 738370b324cSopenharmony_ci WOP_KEY (AES_D_IMC, 2) 739370b324cSopenharmony_ci do 740370b324cSopenharmony_ci { 741370b324cSopenharmony_ci WOP_KEY (AES_D_IMC, 1) 742370b324cSopenharmony_ci WOP_KEY (AES_D_IMC, 0) 743370b324cSopenharmony_ci w -= 2; 744370b324cSopenharmony_ci } 745370b324cSopenharmony_ci while (w != p); 746370b324cSopenharmony_ci WOP_KEY (AES_D, 1) 747370b324cSopenharmony_ci WOP_KEY (AES_XOR, 0) 748370b324cSopenharmony_ci MM_XOR (m0, iv); 749370b324cSopenharmony_ci WOP_M1 (XOR_data_M1) 750370b324cSopenharmony_ci iv = data[NUM_WAYS - 1]; 751370b324cSopenharmony_ci WOP (STORE_data) 752370b324cSopenharmony_ci } 753370b324cSopenharmony_ci WIDE_LOOP_END 754370b324cSopenharmony_ci 755370b324cSopenharmony_ci SINGLE_LOOP 756370b324cSopenharmony_ci { 757370b324cSopenharmony_ci const v128 *w = wStart; 758370b324cSopenharmony_ci v128 m = *data; 759370b324cSopenharmony_ci AES_D_IMC_m (w[2]) 760370b324cSopenharmony_ci do 761370b324cSopenharmony_ci { 762370b324cSopenharmony_ci AES_D_IMC_m (w[1]); 763370b324cSopenharmony_ci AES_D_IMC_m (w[0]); 764370b324cSopenharmony_ci w -= 2; 765370b324cSopenharmony_ci } 766370b324cSopenharmony_ci while (w != p); 767370b324cSopenharmony_ci AES_D_m (w[1]); 768370b324cSopenharmony_ci MM_XOR_m (w[0]); 769370b324cSopenharmony_ci MM_XOR_m (iv); 770370b324cSopenharmony_ci iv = *data; 771370b324cSopenharmony_ci *data = m; 772370b324cSopenharmony_ci } 773370b324cSopenharmony_ci 774370b324cSopenharmony_ci p[-2] = iv; 775370b324cSopenharmony_ci} 776370b324cSopenharmony_ci 777370b324cSopenharmony_ci 778370b324cSopenharmony_ciAES_FUNC_START2 (AesCtr_Code_HW) 779370b324cSopenharmony_ci{ 780370b324cSopenharmony_ci v128 *p = (v128*)(void*)ivAes; 781370b324cSopenharmony_ci v128 *data = (v128*)(void*)data8; 782370b324cSopenharmony_ci uint64x2_t ctr = vreinterpretq_u64_u8(*p); 783370b324cSopenharmony_ci const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 784370b324cSopenharmony_ci const v128 *dataEnd; 785370b324cSopenharmony_ci uint64x2_t one = vdupq_n_u64(0); 786370b324cSopenharmony_ci one = vsetq_lane_u64(1, one, 0); 787370b324cSopenharmony_ci p += 2; 788370b324cSopenharmony_ci 789370b324cSopenharmony_ci WIDE_LOOP_START 790370b324cSopenharmony_ci { 791370b324cSopenharmony_ci const v128 *w = p; 792370b324cSopenharmony_ci WOP (DECLARE_VAR) 793370b324cSopenharmony_ci WOP (CTR_START) 794370b324cSopenharmony_ci do 795370b324cSopenharmony_ci { 796370b324cSopenharmony_ci WOP_KEY (AES_E_MC, 0) 797370b324cSopenharmony_ci WOP_KEY (AES_E_MC, 1) 798370b324cSopenharmony_ci w += 2; 799370b324cSopenharmony_ci } 800370b324cSopenharmony_ci while (w != wEnd); 801370b324cSopenharmony_ci WOP_KEY (AES_E_MC, 0) 802370b324cSopenharmony_ci WOP_KEY (AES_E, 1) 803370b324cSopenharmony_ci WOP_KEY (AES_XOR, 2) 804370b324cSopenharmony_ci WOP (CTR_END) 805370b324cSopenharmony_ci } 806370b324cSopenharmony_ci WIDE_LOOP_END 807370b324cSopenharmony_ci 808370b324cSopenharmony_ci SINGLE_LOOP 809370b324cSopenharmony_ci { 810370b324cSopenharmony_ci const v128 *w = p; 811370b324cSopenharmony_ci v128 m; 812370b324cSopenharmony_ci CTR_START (m, 0); 813370b324cSopenharmony_ci do 814370b324cSopenharmony_ci { 815370b324cSopenharmony_ci AES_E_MC_m (w[0]); 816370b324cSopenharmony_ci AES_E_MC_m (w[1]); 817370b324cSopenharmony_ci w += 2; 818370b324cSopenharmony_ci } 819370b324cSopenharmony_ci while (w != wEnd); 820370b324cSopenharmony_ci AES_E_MC_m (w[0]) 821370b324cSopenharmony_ci AES_E_m (w[1]) 822370b324cSopenharmony_ci MM_XOR_m (w[2]) 823370b324cSopenharmony_ci CTR_END (m, 0) 824370b324cSopenharmony_ci } 825370b324cSopenharmony_ci 826370b324cSopenharmony_ci p[-2] = vreinterpretq_u8_u64(ctr); 827370b324cSopenharmony_ci} 828370b324cSopenharmony_ci 829370b324cSopenharmony_ci#endif // USE_HW_AES 830370b324cSopenharmony_ci 831370b324cSopenharmony_ci#endif // MY_CPU_ARM_OR_ARM64 832370b324cSopenharmony_ci 833370b324cSopenharmony_ci#undef NUM_WAYS 834370b324cSopenharmony_ci#undef WOP_M1 835370b324cSopenharmony_ci#undef WOP 836370b324cSopenharmony_ci#undef DECLARE_VAR 837370b324cSopenharmony_ci#undef LOAD_data 838370b324cSopenharmony_ci#undef STORE_data 839370b324cSopenharmony_ci#undef USE_INTEL_AES 840370b324cSopenharmony_ci#undef USE_HW_AES 841