/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2023-04-02 : Igor Pavlov : Public domain */
3370b324cSopenharmony_ci
4370b324cSopenharmony_ci#include "Precomp.h"
5370b324cSopenharmony_ci
6370b324cSopenharmony_ci#include "Aes.h"
7370b324cSopenharmony_ci#include "CpuArch.h"
8370b324cSopenharmony_ci
9370b324cSopenharmony_ci#ifdef MY_CPU_X86_OR_AMD64
10370b324cSopenharmony_ci
/*
  Toolchain probe for the x86 path.
    USE_INTEL_AES  : the compiler is new enough for the 128-bit AES intrinsics.
    USE_INTEL_VAES : the compiler is new enough for the 256-bit VAES intrinsics.
  For GCC/Clang, when the required ISA macros (__AES__, __VAES__, __AVX__,
  __AVX2__) are not predefined by the global compiler options, ATTRIB_AES /
  ATTRIB_VAES expand to a per-function target attribute instead.
  Both ATTRIB_* macros are always defined after this section (possibly empty).
*/
#if defined(__INTEL_COMPILER)
  #if (__INTEL_COMPILER >= 1110)
    #define USE_INTEL_AES
    #if (__INTEL_COMPILER >= 1900)
      #define USE_INTEL_VAES
    #endif
  #endif
#elif (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8))) \
   || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)))
  #define USE_INTEL_AES
  #if !defined(__AES__)
    #define ATTRIB_AES __attribute__((__target__("aes")))
  #endif
  #if (defined(__clang__) && (__clang_major__ >= 8)) \
      || (defined(__GNUC__) && (__GNUC__ >= 8))
    #define USE_INTEL_VAES
    #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
      #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
    #endif
  #endif
#elif defined(_MSC_VER)
  #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
    #define USE_INTEL_AES
    #if (_MSC_VER >= 1910)
      #define USE_INTEL_VAES
    #endif
  #endif
#endif

/* Fall back to empty attributes when no per-function target is needed. */
#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif
46370b324cSopenharmony_ci
47370b324cSopenharmony_ci
48370b324cSopenharmony_ci#ifdef USE_INTEL_AES
49370b324cSopenharmony_ci
50370b324cSopenharmony_ci#include <wmmintrin.h>
51370b324cSopenharmony_ci
/* Parameter types used by the *_256 fallback stubs when VAES code is not built. */
#ifndef USE_INTEL_VAES
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

/* Common signature of every AES routine in this file:
     ivAes     : context block (IV / counter, round-count word, round keys)
     data8     : buffer processed in place
     numBlocks : number of 16-byte blocks in data8 */
#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

/* Emits a prototype followed by the definition header carrying ATTRIB_AES. */
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

/* dest = op(dest, src);  -- NOTE: the expansion already ends with ';'. */
#define MM_OP(op, dest, src)  dest = op(dest, src);
/* Same, with the destination fixed to the local variable named 'm'. */
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src)
#define AVX_XOR(dest, src)    MM_OP(_mm256_xor_si256, dest, src)
73370b324cSopenharmony_ci
74370b324cSopenharmony_ci
75370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Encode_HW)
76370b324cSopenharmony_ci{
77370b324cSopenharmony_ci  __m128i *p = (__m128i *)(void *)ivAes;
78370b324cSopenharmony_ci  __m128i *data = (__m128i *)(void *)data8;
79370b324cSopenharmony_ci  __m128i m = *p;
80370b324cSopenharmony_ci  const __m128i k0 = p[2];
81370b324cSopenharmony_ci  const __m128i k1 = p[3];
82370b324cSopenharmony_ci  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
83370b324cSopenharmony_ci  for (; numBlocks != 0; numBlocks--, data++)
84370b324cSopenharmony_ci  {
85370b324cSopenharmony_ci    UInt32 r = numRounds2;
86370b324cSopenharmony_ci    const __m128i *w = p + 4;
87370b324cSopenharmony_ci    __m128i temp = *data;
88370b324cSopenharmony_ci    MM_XOR (temp, k0)
89370b324cSopenharmony_ci    MM_XOR (m, temp)
90370b324cSopenharmony_ci    MM_OP_m (_mm_aesenc_si128, k1)
91370b324cSopenharmony_ci    do
92370b324cSopenharmony_ci    {
93370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[0])
94370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[1])
95370b324cSopenharmony_ci      w += 2;
96370b324cSopenharmony_ci    }
97370b324cSopenharmony_ci    while (--r);
98370b324cSopenharmony_ci    MM_OP_m (_mm_aesenclast_si128, w[0])
99370b324cSopenharmony_ci    *data = m;
100370b324cSopenharmony_ci  }
101370b324cSopenharmony_ci  *p = m;
102370b324cSopenharmony_ci}
103370b324cSopenharmony_ci
104370b324cSopenharmony_ci
/*
  "Wide operation" helpers.
  WOP_n(op) expands to:  op(m1, 1) op(m2, 2) ... op(m<n-1>, n-1)
  WOP(op)   expands to:  op(m0, 0) followed by WOP_M1(op),
  i.e. op is applied once per parallel lane, NUM_WAYS times in total.
  NUM_WAYS is 8 when MY_CPU_AMD64 is defined and 4 otherwise.
*/
#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)
133370b324cSopenharmony_ci
134370b324cSopenharmony_ci
/*
  Per-lane operations intended to be passed to WOP() / WOP_M1().
  Each takes (reg, ii): the lane variable and its index. They rely on locals
  of the expanding function: 'data', 'key', 'ctr', 'one', 'ctr2', 'two'.
*/

/* 128-bit lanes: declare / load / store, and xor with the preceding data block. */
#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
#endif

/* 256-bit lanes: 'data' is reinterpreted as an array of __m256i (two blocks each).
   AVX_XOR_data_M1 reads the 256-bit word that starts one 16-byte block earlier. */
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))

/* reg = op(reg, key);  'key' is the local introduced by WOP_KEY / AVX_WOP_KEY. */
#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)


#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)

/* CTR mode: advance the counter, copy it into the lane; later xor the
   resulting keystream into the data block. */
#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

/* 256-bit CTR: ctr2 holds two counters and is advanced by 'two';
   the lane is initialised with ctr2 ^ key. */
#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)

/* Load round key w[n] once into 'key', then apply op to every lane. */
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }
175370b324cSopenharmony_ci
176370b324cSopenharmony_ci
/*
  Loop scaffolding. The expanding function must provide the locals
  'data', 'dataEnd' and 'numBlocks' (plus 'numRounds' and 'p' for the AVX form).

  WIDE_LOOP_START { body } WIDE_LOOP_END
      runs body once per NUM_WAYS blocks while at least NUM_WAYS blocks remain,
      advancing 'data' by NUM_WAYS each time.
  SINGLE_LOOP { body }
      runs body once per remaining block, advancing 'data' by one.
*/
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {


#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }


#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


#define NUM_AES_KEYS_MAX 15

/*
  AVX variant: handles NUM_WAYS * 2 blocks per iteration.
  Before the loop it runs OP and copies the first 'numRounds' round keys from
  p[] into the local array keys[], each broadcast to both 128-bit halves.
  After the loop it runs the OP given to WIDE_LOOP_END_AVX and calls
  _mm256_zeroupper().
*/
#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do {


#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */
216370b324cSopenharmony_ci
217370b324cSopenharmony_ci
218370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Decode_HW)
219370b324cSopenharmony_ci{
220370b324cSopenharmony_ci  __m128i *p = (__m128i *)(void *)ivAes;
221370b324cSopenharmony_ci  __m128i *data = (__m128i *)(void *)data8;
222370b324cSopenharmony_ci  __m128i iv = *p;
223370b324cSopenharmony_ci  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
224370b324cSopenharmony_ci  const __m128i *dataEnd;
225370b324cSopenharmony_ci  p += 2;
226370b324cSopenharmony_ci
227370b324cSopenharmony_ci  WIDE_LOOP_START
228370b324cSopenharmony_ci  {
229370b324cSopenharmony_ci    const __m128i *w = wStart;
230370b324cSopenharmony_ci
231370b324cSopenharmony_ci    WOP (DECLARE_VAR)
232370b324cSopenharmony_ci    WOP (LOAD_data)
233370b324cSopenharmony_ci    WOP_KEY (AES_XOR, 1)
234370b324cSopenharmony_ci
235370b324cSopenharmony_ci    do
236370b324cSopenharmony_ci    {
237370b324cSopenharmony_ci      WOP_KEY (AES_DEC, 0)
238370b324cSopenharmony_ci      w--;
239370b324cSopenharmony_ci    }
240370b324cSopenharmony_ci    while (w != p);
241370b324cSopenharmony_ci    WOP_KEY (AES_DEC_LAST, 0)
242370b324cSopenharmony_ci
243370b324cSopenharmony_ci    MM_XOR (m0, iv)
244370b324cSopenharmony_ci    WOP_M1 (XOR_data_M1)
245370b324cSopenharmony_ci    iv = data[NUM_WAYS - 1];
246370b324cSopenharmony_ci    WOP (STORE_data)
247370b324cSopenharmony_ci  }
248370b324cSopenharmony_ci  WIDE_LOOP_END
249370b324cSopenharmony_ci
250370b324cSopenharmony_ci  SINGLE_LOOP
251370b324cSopenharmony_ci  {
252370b324cSopenharmony_ci    const __m128i *w = wStart - 1;
253370b324cSopenharmony_ci    __m128i m = _mm_xor_si128 (w[2], *data);
254370b324cSopenharmony_ci    do
255370b324cSopenharmony_ci    {
256370b324cSopenharmony_ci      MM_OP_m (_mm_aesdec_si128, w[1])
257370b324cSopenharmony_ci      MM_OP_m (_mm_aesdec_si128, w[0])
258370b324cSopenharmony_ci      w -= 2;
259370b324cSopenharmony_ci    }
260370b324cSopenharmony_ci    while (w != p);
261370b324cSopenharmony_ci    MM_OP_m (_mm_aesdec_si128,     w[1])
262370b324cSopenharmony_ci    MM_OP_m (_mm_aesdeclast_si128, w[0])
263370b324cSopenharmony_ci
264370b324cSopenharmony_ci    MM_XOR (m, iv)
265370b324cSopenharmony_ci    iv = *data;
266370b324cSopenharmony_ci    *data = m;
267370b324cSopenharmony_ci  }
268370b324cSopenharmony_ci
269370b324cSopenharmony_ci  p[-2] = iv;
270370b324cSopenharmony_ci}
271370b324cSopenharmony_ci
272370b324cSopenharmony_ci
273370b324cSopenharmony_ciAES_FUNC_START2 (AesCtr_Code_HW)
274370b324cSopenharmony_ci{
275370b324cSopenharmony_ci  __m128i *p = (__m128i *)(void *)ivAes;
276370b324cSopenharmony_ci  __m128i *data = (__m128i *)(void *)data8;
277370b324cSopenharmony_ci  __m128i ctr = *p;
278370b324cSopenharmony_ci  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
279370b324cSopenharmony_ci  const __m128i *dataEnd;
280370b324cSopenharmony_ci  __m128i one = _mm_cvtsi32_si128(1);
281370b324cSopenharmony_ci
282370b324cSopenharmony_ci  p += 2;
283370b324cSopenharmony_ci
284370b324cSopenharmony_ci  WIDE_LOOP_START
285370b324cSopenharmony_ci  {
286370b324cSopenharmony_ci    const __m128i *w = p;
287370b324cSopenharmony_ci    UInt32 r = numRoundsMinus2;
288370b324cSopenharmony_ci    WOP (DECLARE_VAR)
289370b324cSopenharmony_ci    WOP (CTR_START)
290370b324cSopenharmony_ci    WOP_KEY (AES_XOR, 0)
291370b324cSopenharmony_ci    w += 1;
292370b324cSopenharmony_ci    do
293370b324cSopenharmony_ci    {
294370b324cSopenharmony_ci      WOP_KEY (AES_ENC, 0)
295370b324cSopenharmony_ci      w += 1;
296370b324cSopenharmony_ci    }
297370b324cSopenharmony_ci    while (--r);
298370b324cSopenharmony_ci    WOP_KEY (AES_ENC_LAST, 0)
299370b324cSopenharmony_ci
300370b324cSopenharmony_ci    WOP (CTR_END)
301370b324cSopenharmony_ci  }
302370b324cSopenharmony_ci  WIDE_LOOP_END
303370b324cSopenharmony_ci
304370b324cSopenharmony_ci  SINGLE_LOOP
305370b324cSopenharmony_ci  {
306370b324cSopenharmony_ci    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
307370b324cSopenharmony_ci    const __m128i *w = p;
308370b324cSopenharmony_ci    __m128i m;
309370b324cSopenharmony_ci    MM_OP (_mm_add_epi64, ctr, one)
310370b324cSopenharmony_ci    m = _mm_xor_si128 (ctr, p[0]);
311370b324cSopenharmony_ci    w += 1;
312370b324cSopenharmony_ci    do
313370b324cSopenharmony_ci    {
314370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[0])
315370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[1])
316370b324cSopenharmony_ci      w += 2;
317370b324cSopenharmony_ci    }
318370b324cSopenharmony_ci    while (--numRounds2);
319370b324cSopenharmony_ci    MM_OP_m (_mm_aesenc_si128,     w[0])
320370b324cSopenharmony_ci    MM_OP_m (_mm_aesenclast_si128, w[1])
321370b324cSopenharmony_ci    MM_XOR (*data, m)
322370b324cSopenharmony_ci  }
323370b324cSopenharmony_ci
324370b324cSopenharmony_ci  p[-2] = ctr;
325370b324cSopenharmony_ci}
326370b324cSopenharmony_ci
327370b324cSopenharmony_ci
328370b324cSopenharmony_ci
329370b324cSopenharmony_ci#ifdef USE_INTEL_VAES
330370b324cSopenharmony_ci
/*
GCC before 2013-Jun:
  <immintrin.h>:
    #ifdef __AVX__
     #include <avxintrin.h>
    #endif
GCC after 2013-Jun:
  <immintrin.h>:
    #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
    #if !defined(_MSC_VER) || defined(__AVX__)
      #include <avxintrin.h>
    #endif

  if (the compiler is clang for Windows and if global arch is not set for __AVX__)
    [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>
    and we have 2 ways to fix it:
      1) we can define required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG, it's
required that <immintrin.h> must be included before <avxintrin.h>.
*/
361370b324cSopenharmony_ci
/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/
369370b324cSopenharmony_ci
370370b324cSopenharmony_ci#include <immintrin.h>
371370b324cSopenharmony_ci#if defined(__clang__) && defined(_MSC_VER)
372370b324cSopenharmony_ci  #if !defined(__AVX__)
373370b324cSopenharmony_ci    #include <avxintrin.h>
374370b324cSopenharmony_ci  #endif
375370b324cSopenharmony_ci  #if !defined(__AVX2__)
376370b324cSopenharmony_ci    #include <avx2intrin.h>
377370b324cSopenharmony_ci  #endif
378370b324cSopenharmony_ci  #if !defined(__VAES__)
379370b324cSopenharmony_ci    #include <vaesintrin.h>
380370b324cSopenharmony_ci  #endif
381370b324cSopenharmony_ci#endif  // __clang__ && _MSC_VER
382370b324cSopenharmony_ci
383370b324cSopenharmony_ci
384370b324cSopenharmony_ci#define VAES_FUNC_START2(name) \
385370b324cSopenharmony_ciAES_FUNC_START (name); \
386370b324cSopenharmony_ciATTRIB_VAES \
387370b324cSopenharmony_ciAES_FUNC_START (name)
388370b324cSopenharmony_ci
389370b324cSopenharmony_ciVAES_FUNC_START2 (AesCbc_Decode_HW_256)
390370b324cSopenharmony_ci{
391370b324cSopenharmony_ci  __m128i *p = (__m128i *)(void *)ivAes;
392370b324cSopenharmony_ci  __m128i *data = (__m128i *)(void *)data8;
393370b324cSopenharmony_ci  __m128i iv = *p;
394370b324cSopenharmony_ci  const __m128i *dataEnd;
395370b324cSopenharmony_ci  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
396370b324cSopenharmony_ci  p += 2;
397370b324cSopenharmony_ci
398370b324cSopenharmony_ci  WIDE_LOOP_START_AVX(;)
399370b324cSopenharmony_ci  {
400370b324cSopenharmony_ci    const __m256i *w = keys + numRounds - 2;
401370b324cSopenharmony_ci
402370b324cSopenharmony_ci    WOP (AVX_DECLARE_VAR)
403370b324cSopenharmony_ci    WOP (AVX_LOAD_data)
404370b324cSopenharmony_ci    AVX_WOP_KEY (AVX_AES_XOR, 1)
405370b324cSopenharmony_ci
406370b324cSopenharmony_ci    do
407370b324cSopenharmony_ci    {
408370b324cSopenharmony_ci      AVX_WOP_KEY (AVX_AES_DEC, 0)
409370b324cSopenharmony_ci      w--;
410370b324cSopenharmony_ci    }
411370b324cSopenharmony_ci    while (w != keys);
412370b324cSopenharmony_ci    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
413370b324cSopenharmony_ci
414370b324cSopenharmony_ci    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
415370b324cSopenharmony_ci    WOP_M1 (AVX_XOR_data_M1)
416370b324cSopenharmony_ci    iv = data[NUM_WAYS * 2 - 1];
417370b324cSopenharmony_ci    WOP (AVX_STORE_data)
418370b324cSopenharmony_ci  }
419370b324cSopenharmony_ci  WIDE_LOOP_END_AVX(;)
420370b324cSopenharmony_ci
421370b324cSopenharmony_ci  SINGLE_LOOP
422370b324cSopenharmony_ci  {
423370b324cSopenharmony_ci    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
424370b324cSopenharmony_ci    __m128i m = _mm_xor_si128 (w[2], *data);
425370b324cSopenharmony_ci    do
426370b324cSopenharmony_ci    {
427370b324cSopenharmony_ci      MM_OP_m (_mm_aesdec_si128, w[1])
428370b324cSopenharmony_ci      MM_OP_m (_mm_aesdec_si128, w[0])
429370b324cSopenharmony_ci      w -= 2;
430370b324cSopenharmony_ci    }
431370b324cSopenharmony_ci    while (w != p);
432370b324cSopenharmony_ci    MM_OP_m (_mm_aesdec_si128,     w[1])
433370b324cSopenharmony_ci    MM_OP_m (_mm_aesdeclast_si128, w[0])
434370b324cSopenharmony_ci
435370b324cSopenharmony_ci    MM_XOR (m, iv)
436370b324cSopenharmony_ci    iv = *data;
437370b324cSopenharmony_ci    *data = m;
438370b324cSopenharmony_ci  }
439370b324cSopenharmony_ci
440370b324cSopenharmony_ci  p[-2] = iv;
441370b324cSopenharmony_ci}
442370b324cSopenharmony_ci
443370b324cSopenharmony_ci
444370b324cSopenharmony_ci/*
445370b324cSopenharmony_ciSSE2: _mm_cvtsi32_si128 : movd
446370b324cSopenharmony_ciAVX:  _mm256_setr_m128i            : vinsertf128
447370b324cSopenharmony_ciAVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
448370b324cSopenharmony_ci      _mm256_extracti128_si256     : vextracti128
449370b324cSopenharmony_ci      _mm256_broadcastsi128_si256  : vbroadcasti128
450370b324cSopenharmony_ci*/
451370b324cSopenharmony_ci
452370b324cSopenharmony_ci#define AVX_CTR_LOOP_START  \
453370b324cSopenharmony_ci    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
454370b324cSopenharmony_ci    two = _mm256_setr_m128i(one, one); \
455370b324cSopenharmony_ci    two = _mm256_add_epi64(two, two); \
456370b324cSopenharmony_ci
457370b324cSopenharmony_ci// two = _mm256_setr_epi64x(2, 0, 2, 0);
458370b324cSopenharmony_ci
459370b324cSopenharmony_ci#define AVX_CTR_LOOP_ENC  \
460370b324cSopenharmony_ci    ctr = _mm256_extracti128_si256 (ctr2, 1); \
461370b324cSopenharmony_ci
462370b324cSopenharmony_ciVAES_FUNC_START2 (AesCtr_Code_HW_256)
463370b324cSopenharmony_ci{
464370b324cSopenharmony_ci  __m128i *p = (__m128i *)(void *)ivAes;
465370b324cSopenharmony_ci  __m128i *data = (__m128i *)(void *)data8;
466370b324cSopenharmony_ci  __m128i ctr = *p;
467370b324cSopenharmony_ci  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
468370b324cSopenharmony_ci  const __m128i *dataEnd;
469370b324cSopenharmony_ci  __m128i one = _mm_cvtsi32_si128(1);
470370b324cSopenharmony_ci  __m256i ctr2, two;
471370b324cSopenharmony_ci  p += 2;
472370b324cSopenharmony_ci
473370b324cSopenharmony_ci  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
474370b324cSopenharmony_ci  {
475370b324cSopenharmony_ci    const __m256i *w = keys;
476370b324cSopenharmony_ci    UInt32 r = numRounds - 2;
477370b324cSopenharmony_ci    WOP (AVX_DECLARE_VAR)
478370b324cSopenharmony_ci    AVX_WOP_KEY (AVX_CTR_START, 0)
479370b324cSopenharmony_ci
480370b324cSopenharmony_ci    w += 1;
481370b324cSopenharmony_ci    do
482370b324cSopenharmony_ci    {
483370b324cSopenharmony_ci      AVX_WOP_KEY (AVX_AES_ENC, 0)
484370b324cSopenharmony_ci      w += 1;
485370b324cSopenharmony_ci    }
486370b324cSopenharmony_ci    while (--r);
487370b324cSopenharmony_ci    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)
488370b324cSopenharmony_ci
489370b324cSopenharmony_ci    WOP (AVX_CTR_END)
490370b324cSopenharmony_ci  }
491370b324cSopenharmony_ci  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)
492370b324cSopenharmony_ci
493370b324cSopenharmony_ci  SINGLE_LOOP
494370b324cSopenharmony_ci  {
495370b324cSopenharmony_ci    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
496370b324cSopenharmony_ci    const __m128i *w = p;
497370b324cSopenharmony_ci    __m128i m;
498370b324cSopenharmony_ci    MM_OP (_mm_add_epi64, ctr, one)
499370b324cSopenharmony_ci    m = _mm_xor_si128 (ctr, p[0]);
500370b324cSopenharmony_ci    w += 1;
501370b324cSopenharmony_ci    do
502370b324cSopenharmony_ci    {
503370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[0])
504370b324cSopenharmony_ci      MM_OP_m (_mm_aesenc_si128, w[1])
505370b324cSopenharmony_ci      w += 2;
506370b324cSopenharmony_ci    }
507370b324cSopenharmony_ci    while (--numRounds2);
508370b324cSopenharmony_ci    MM_OP_m (_mm_aesenc_si128,     w[0])
509370b324cSopenharmony_ci    MM_OP_m (_mm_aesenclast_si128, w[1])
510370b324cSopenharmony_ci    MM_XOR (*data, m)
511370b324cSopenharmony_ci  }
512370b324cSopenharmony_ci
513370b324cSopenharmony_ci  p[-2] = ctr;
514370b324cSopenharmony_ci}
515370b324cSopenharmony_ci
516370b324cSopenharmony_ci#endif // USE_INTEL_VAES
517370b324cSopenharmony_ci
518370b324cSopenharmony_ci#else // USE_INTEL_AES
519370b324cSopenharmony_ci
520370b324cSopenharmony_ci/* no USE_INTEL_AES */
521370b324cSopenharmony_ci
522370b324cSopenharmony_ci#pragma message("AES  HW_SW stub was used")
523370b324cSopenharmony_ci
524370b324cSopenharmony_ci#define AES_TYPE_keys UInt32
525370b324cSopenharmony_ci#define AES_TYPE_data Byte
526370b324cSopenharmony_ci
527370b324cSopenharmony_ci#define AES_FUNC_START(name) \
528370b324cSopenharmony_ci    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
529370b324cSopenharmony_ci
530370b324cSopenharmony_ci#define AES_COMPAT_STUB(name) \
531370b324cSopenharmony_ci    AES_FUNC_START(name); \
532370b324cSopenharmony_ci    AES_FUNC_START(name ## _HW) \
533370b324cSopenharmony_ci    { name(p, data, numBlocks); }
534370b324cSopenharmony_ci
535370b324cSopenharmony_ciAES_COMPAT_STUB (AesCbc_Encode)
536370b324cSopenharmony_ciAES_COMPAT_STUB (AesCbc_Decode)
537370b324cSopenharmony_ciAES_COMPAT_STUB (AesCtr_Code)
538370b324cSopenharmony_ci
539370b324cSopenharmony_ci#endif // USE_INTEL_AES
540370b324cSopenharmony_ci
541370b324cSopenharmony_ci
542370b324cSopenharmony_ci#ifndef USE_INTEL_VAES
543370b324cSopenharmony_ci
544370b324cSopenharmony_ci#pragma message("VAES HW_SW stub was used")
545370b324cSopenharmony_ci
546370b324cSopenharmony_ci#define VAES_COMPAT_STUB(name) \
547370b324cSopenharmony_ci    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
548370b324cSopenharmony_ci    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
549370b324cSopenharmony_ci    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
550370b324cSopenharmony_ci
551370b324cSopenharmony_ciVAES_COMPAT_STUB (AesCbc_Decode_HW)
552370b324cSopenharmony_ciVAES_COMPAT_STUB (AesCtr_Code_HW)
553370b324cSopenharmony_ci
554370b324cSopenharmony_ci#endif // ! USE_INTEL_VAES
555370b324cSopenharmony_ci
556370b324cSopenharmony_ci
557370b324cSopenharmony_ci#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
558370b324cSopenharmony_ci
/* Toolchain probe for the ARM path: USE_HW_AES is defined when the compiler
   version passes the checks below. The thresholds are marked as provisional
   by the original author ("fix that check"). */
#if defined(__clang__)
  #if (__clang_major__ >= 8) // fix that check
    #define USE_HW_AES
  #endif
#elif defined(__GNUC__)
  #if (__GNUC__ >= 6) // fix that check
    #define USE_HW_AES
  #endif
#elif defined(_MSC_VER)
  #if _MSC_VER >= 1910
    #define USE_HW_AES
  #endif
#endif
572370b324cSopenharmony_ci
573370b324cSopenharmony_ci#ifdef USE_HW_AES
574370b324cSopenharmony_ci
575370b324cSopenharmony_ci// #pragma message("=== AES HW === ")
576370b324cSopenharmony_ci
/* Per-function target attribute for GCC/Clang on ARM; for other compilers
   (the MSVC branch) ATTRIB_AES ends up empty. */
#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
592370b324cSopenharmony_ci
593370b324cSopenharmony_ci#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
594370b324cSopenharmony_ci#include <arm64_neon.h>
595370b324cSopenharmony_ci#else
596370b324cSopenharmony_ci#include <arm_neon.h>
597370b324cSopenharmony_ci#endif
598370b324cSopenharmony_ci
599370b324cSopenharmony_citypedef uint8x16_t v128;
600370b324cSopenharmony_ci
601370b324cSopenharmony_ci#define AES_FUNC_START(name) \
602370b324cSopenharmony_ci    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
603370b324cSopenharmony_ci    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)
604370b324cSopenharmony_ci
605370b324cSopenharmony_ci#define AES_FUNC_START2(name) \
606370b324cSopenharmony_ciAES_FUNC_START (name); \
607370b324cSopenharmony_ciATTRIB_AES \
608370b324cSopenharmony_ciAES_FUNC_START (name)
609370b324cSopenharmony_ci
610370b324cSopenharmony_ci#define MM_OP(op, dest, src)  dest = op(dest, src);
611370b324cSopenharmony_ci#define MM_OP_m(op, src)      MM_OP(op, m, src)
612370b324cSopenharmony_ci#define MM_OP1_m(op)          m = op(m);
613370b324cSopenharmony_ci
614370b324cSopenharmony_ci#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
615370b324cSopenharmony_ci#define MM_XOR_m( src)        MM_XOR(m, src)
616370b324cSopenharmony_ci
617370b324cSopenharmony_ci#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
618370b324cSopenharmony_ci#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
619370b324cSopenharmony_ci
620370b324cSopenharmony_ci
/* CBC-mode encryption, in place, one 16-byte block per loop iteration.

   Names used but not declared in this body (ivAes, data8, numBlocks) are
   presumably the parameters introduced by AES_FUNC_START - confirm there.

   Layout read from ivAes, viewed as an array p of v128:
     p[0]   chaining value: loaded into m on entry, written back on exit
     p[1]   first 32 bits are read as numRounds2
     p[2].. round keys

   Each block is XORed into m, run through the round sequence, and the result
   is stored back over the input block. */
621370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Encode_HW)
622370b324cSopenharmony_ci{
623370b324cSopenharmony_ci  v128 *p = (v128*)(void*)ivAes;
624370b324cSopenharmony_ci  v128 *data = (v128*)(void*)data8;
  // m must keep this exact name: the *_m macros refer to it textually.
625370b324cSopenharmony_ci  v128 m = *p;
  // Keys p[2]..p[11] are copied into locals once, before the block loop.
626370b324cSopenharmony_ci  const v128 k0 = p[2];
627370b324cSopenharmony_ci  const v128 k1 = p[3];
628370b324cSopenharmony_ci  const v128 k2 = p[4];
629370b324cSopenharmony_ci  const v128 k3 = p[5];
630370b324cSopenharmony_ci  const v128 k4 = p[6];
631370b324cSopenharmony_ci  const v128 k5 = p[7];
632370b324cSopenharmony_ci  const v128 k6 = p[8];
633370b324cSopenharmony_ci  const v128 k7 = p[9];
634370b324cSopenharmony_ci  const v128 k8 = p[10];
635370b324cSopenharmony_ci  const v128 k9 = p[11];
  // NOTE(review): numRounds2 looks like half the AES round count
  // (5 / 6 / 7 for 128 / 192 / 256-bit keys) - confirm with the key setup code.
636370b324cSopenharmony_ci  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  // The last two keys are located relative to numRounds2:
  // k_z1 = p[2 * numRounds2 + 1], k_z0 = p[2 * numRounds2 + 2]
  // (for numRounds2 == 5 that is p[11] and p[12]).
637370b324cSopenharmony_ci  const v128 *w = p + ((size_t)numRounds2 * 2);
638370b324cSopenharmony_ci  const v128 k_z1 = w[1];
639370b324cSopenharmony_ci  const v128 k_z0 = w[2];
640370b324cSopenharmony_ci  for (; numBlocks != 0; numBlocks--, data++)
641370b324cSopenharmony_ci  {
    // CBC chaining: fold the input block into the previous output (or IV).
642370b324cSopenharmony_ci    MM_XOR_m (*data);
    // Nine rounds that are executed for every value of numRounds2.
643370b324cSopenharmony_ci    AES_E_MC_m (k0)
644370b324cSopenharmony_ci    AES_E_MC_m (k1)
645370b324cSopenharmony_ci    AES_E_MC_m (k2)
646370b324cSopenharmony_ci    AES_E_MC_m (k3)
647370b324cSopenharmony_ci    AES_E_MC_m (k4)
648370b324cSopenharmony_ci    AES_E_MC_m (k5)
649370b324cSopenharmony_ci    AES_E_MC_m (k6)
650370b324cSopenharmony_ci    AES_E_MC_m (k7)
651370b324cSopenharmony_ci    AES_E_MC_m (k8)
    // Two extra rounds when numRounds2 >= 6, and two more when it is not 6.
652370b324cSopenharmony_ci    if (numRounds2 >= 6)
653370b324cSopenharmony_ci    {
654370b324cSopenharmony_ci      AES_E_MC_m (k9)
655370b324cSopenharmony_ci      AES_E_MC_m (p[12])
656370b324cSopenharmony_ci      if (numRounds2 != 6)
657370b324cSopenharmony_ci      {
658370b324cSopenharmony_ci        AES_E_MC_m (p[13])
659370b324cSopenharmony_ci        AES_E_MC_m (p[14])
660370b324cSopenharmony_ci      }
661370b324cSopenharmony_ci    }
    // Final round: vaeseq_u8 without the vaesmcq_u8 step, then XOR with the
    // last key.
662370b324cSopenharmony_ci    AES_E_m  (k_z1)
663370b324cSopenharmony_ci    MM_XOR_m (k_z0);
    // Output overwrites the input block; m carries over as the next chain value.
664370b324cSopenharmony_ci    *data = m;
665370b324cSopenharmony_ci  }
  // Persist the chaining value so a later call continues the same stream.
666370b324cSopenharmony_ci  *p = m;
667370b324cSopenharmony_ci}
668370b324cSopenharmony_ci
669370b324cSopenharmony_ci
/* WOP_n(op) expands to op(m1, 1) ... op(m<n-1>, n-1); WOP_1 is empty.
   They are chained so that each level adds one more lane. */
670370b324cSopenharmony_ci#define WOP_1(op)
671370b324cSopenharmony_ci#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
672370b324cSopenharmony_ci#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
673370b324cSopenharmony_ci#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
674370b324cSopenharmony_ci#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
675370b324cSopenharmony_ci#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
676370b324cSopenharmony_ci#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
677370b324cSopenharmony_ci#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
678370b324cSopenharmony_ci
// Number of blocks handled per wide-loop iteration; WOP_M1 covers lanes 1..7.
679370b324cSopenharmony_ci  #define NUM_WAYS      8
680370b324cSopenharmony_ci  #define WOP_M1    WOP_8
681370b324cSopenharmony_ci
// WOP(op): apply op(reg, index) to all lanes m0..m7, lane 0 first.
682370b324cSopenharmony_ci#define WOP(op)  op (m0, 0)   WOP_M1(op)
683370b324cSopenharmony_ci
// Per-lane operations intended to be passed to WOP / WOP_M1.
// They rely on a pointer named data being in scope at the point of use.
684370b324cSopenharmony_ci#define DECLARE_VAR(reg, ii)  v128 reg;
685370b324cSopenharmony_ci#define LOAD_data(  reg, ii)  reg = data[ii];
686370b324cSopenharmony_ci#define STORE_data( reg, ii)  data[ii] = reg;
// XOR lane ii with the block one position earlier in data (used via WOP_M1,
// so ii starts at 1).
687370b324cSopenharmony_ci#if (NUM_WAYS > 1)
688370b324cSopenharmony_ci#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
689370b324cSopenharmony_ci#endif
690370b324cSopenharmony_ci
// reg = op(reg, key), where key is a variable supplied by WOP_KEY.
691370b324cSopenharmony_ci#define MM_OP_key(op, reg)  MM_OP (op, reg, key)
692370b324cSopenharmony_ci
// Single-block (variable m) decrypt helpers:
// AES_D_m     : m = vaesdq_u8(m, k)
// AES_D_IMC_m : AES_D_m followed by m = vaesimcq_u8(m)
693370b324cSopenharmony_ci#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
694370b324cSopenharmony_ci#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)
695370b324cSopenharmony_ci
// Per-lane forms using the current key; the ii argument is accepted only to
// match the op(reg, ii) shape expected by WOP and is not used.
696370b324cSopenharmony_ci#define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
697370b324cSopenharmony_ci#define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
698370b324cSopenharmony_ci#define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)
699370b324cSopenharmony_ci
// Same as AES_D / AES_E with the (inverse) MixColumns intrinsic applied after.
700370b324cSopenharmony_ci#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
701370b324cSopenharmony_ci#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);
702370b324cSopenharmony_ci
// CTR_START: ctr = vaddq_u64(ctr, one), then reg receives the updated counter
//            reinterpreted as bytes (the counter is incremented before use).
// CTR_END  : data[ii] ^= reg.
// Both expect variables ctr, one and data in the enclosing scope.
703370b324cSopenharmony_ci#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
704370b324cSopenharmony_ci#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)
705370b324cSopenharmony_ci
// WOP_KEY(op, n): read key w[n] once into a block-scoped const and apply op to
// every lane. Expects a pointer named w in the enclosing scope.
706370b324cSopenharmony_ci#define WOP_KEY(op, n) { \
707370b324cSopenharmony_ci    const v128 key = w[n]; \
708370b324cSopenharmony_ci    WOP(op) }
709370b324cSopenharmony_ci
// WIDE_LOOP_START / WIDE_LOOP_END bracket a body that consumes NUM_WAYS blocks
// per iteration. dataEnd is set to data + numBlocks; if at least NUM_WAYS
// blocks exist, dataEnd is temporarily lowered by NUM_WAYS so the do-while
// test (data <= dataEnd) means "a full group is still available", and it is
// restored after the loop. Expects data, dataEnd and numBlocks in scope.
710370b324cSopenharmony_ci#define WIDE_LOOP_START  \
711370b324cSopenharmony_ci    dataEnd = data + numBlocks;  \
712370b324cSopenharmony_ci    if (numBlocks >= NUM_WAYS)  \
713370b324cSopenharmony_ci    { dataEnd -= NUM_WAYS; do {  \
714370b324cSopenharmony_ci
// Closes the do-while and the 'if' opened by WIDE_LOOP_START.
715370b324cSopenharmony_ci#define WIDE_LOOP_END  \
716370b324cSopenharmony_ci    data += NUM_WAYS;  \
717370b324cSopenharmony_ci    } while (data <= dataEnd);  \
718370b324cSopenharmony_ci    dataEnd += NUM_WAYS; }  \
719370b324cSopenharmony_ci
// Loop header for whatever blocks remain after the wide loop, one at a time.
720370b324cSopenharmony_ci#define SINGLE_LOOP  \
721370b324cSopenharmony_ci    for (; data < dataEnd; data++)
722370b324cSopenharmony_ci
723370b324cSopenharmony_ci
/* CBC-mode decryption, in place: groups of NUM_WAYS blocks first, then any
   remaining blocks one at a time.

   Names used but not declared in this body (ivAes, data8, numBlocks) are
   presumably the parameters introduced by AES_FUNC_START - confirm there.

   ivAes is viewed as an array of v128: element 0 is the IV / chaining value,
   the first 32 bits of element 1 give the offset (times 2) to wStart, and the
   keys start at element 2. Keys are consumed from wStart downwards. */
724370b324cSopenharmony_ciAES_FUNC_START2 (AesCbc_Decode_HW)
725370b324cSopenharmony_ci{
726370b324cSopenharmony_ci  v128 *p = (v128*)(void*)ivAes;
727370b324cSopenharmony_ci  v128 *data = (v128*)(void*)data8;
728370b324cSopenharmony_ci  v128 iv = *p;
729370b324cSopenharmony_ci  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
730370b324cSopenharmony_ci  const v128 *dataEnd;
  // From here on p points at the first key; the IV slot is reachable as p[-2].
731370b324cSopenharmony_ci  p += 2;
732370b324cSopenharmony_ci
733370b324cSopenharmony_ci  WIDE_LOOP_START
734370b324cSopenharmony_ci  {
735370b324cSopenharmony_ci    const v128 *w = wStart;
    // Declare m0..m7 and load NUM_WAYS input blocks into them.
736370b324cSopenharmony_ci    WOP (DECLARE_VAR)
737370b324cSopenharmony_ci    WOP (LOAD_data)
    // First key is w[2]; the loop then takes w[1], w[0] and steps w down by
    // two until it reaches the first key (p).
738370b324cSopenharmony_ci    WOP_KEY (AES_D_IMC, 2)
739370b324cSopenharmony_ci    do
740370b324cSopenharmony_ci    {
741370b324cSopenharmony_ci      WOP_KEY (AES_D_IMC, 1)
742370b324cSopenharmony_ci      WOP_KEY (AES_D_IMC, 0)
743370b324cSopenharmony_ci      w -= 2;
744370b324cSopenharmony_ci    }
745370b324cSopenharmony_ci    while (w != p);
    // Last round has no vaesimcq_u8 step; the final key is applied with XOR.
746370b324cSopenharmony_ci    WOP_KEY (AES_D,   1)
747370b324cSopenharmony_ci    WOP_KEY (AES_XOR, 0)
    // CBC un-chaining: lane 0 uses the carried iv, lanes 1..7 use the input
    // block just before them, which is still intact in data[] because the
    // stores happen afterwards.
748370b324cSopenharmony_ci    MM_XOR (m0, iv);
749370b324cSopenharmony_ci    WOP_M1 (XOR_data_M1)
    // Save the last input block as the next iv before it is overwritten.
750370b324cSopenharmony_ci    iv = data[NUM_WAYS - 1];
751370b324cSopenharmony_ci    WOP (STORE_data)
752370b324cSopenharmony_ci  }
753370b324cSopenharmony_ci  WIDE_LOOP_END
754370b324cSopenharmony_ci
  // Same round sequence for the remaining blocks, using the single variable m.
755370b324cSopenharmony_ci  SINGLE_LOOP
756370b324cSopenharmony_ci  {
757370b324cSopenharmony_ci    const v128 *w = wStart;
758370b324cSopenharmony_ci    v128 m = *data;
759370b324cSopenharmony_ci    AES_D_IMC_m (w[2])
760370b324cSopenharmony_ci    do
761370b324cSopenharmony_ci    {
762370b324cSopenharmony_ci      AES_D_IMC_m (w[1]);
763370b324cSopenharmony_ci      AES_D_IMC_m (w[0]);
764370b324cSopenharmony_ci      w -= 2;
765370b324cSopenharmony_ci    }
766370b324cSopenharmony_ci    while (w != p);
767370b324cSopenharmony_ci    AES_D_m  (w[1]);
768370b324cSopenharmony_ci    MM_XOR_m (w[0]);
769370b324cSopenharmony_ci    MM_XOR_m (iv);
    // Read the input block as the next iv before storing the output over it.
770370b324cSopenharmony_ci    iv = *data;
771370b324cSopenharmony_ci    *data = m;
772370b324cSopenharmony_ci  }
773370b324cSopenharmony_ci
  // Write the chaining value back to element 0 of ivAes.
774370b324cSopenharmony_ci  p[-2] = iv;
775370b324cSopenharmony_ci}
776370b324cSopenharmony_ci
777370b324cSopenharmony_ci
/* CTR-mode processing, in place: each data block is XORed with the encrypted
   counter. Groups of NUM_WAYS blocks first, then remaining blocks one by one.

   Names used but not declared in this body (ivAes, data8, numBlocks) are
   presumably the parameters introduced by AES_FUNC_START - confirm there.

   ivAes is viewed as an array of v128: element 0 holds the counter, the first
   32 bits of element 1 give the offset (times 2) to wEnd, and the keys start
   at element 2. The counter is incremented before every block, so the first
   block is processed with (stored counter + 1). */
778370b324cSopenharmony_ciAES_FUNC_START2 (AesCtr_Code_HW)
779370b324cSopenharmony_ci{
780370b324cSopenharmony_ci  v128 *p = (v128*)(void*)ivAes;
781370b324cSopenharmony_ci  v128 *data = (v128*)(void*)data8;
  // The counter is handled as two 64-bit lanes.
782370b324cSopenharmony_ci  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
783370b324cSopenharmony_ci  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
784370b324cSopenharmony_ci  const v128 *dataEnd;
  // one = { 1, 0 }: adding it changes lane 0 only (vaddq_u64 adds per lane),
  // so there is no carry from lane 0 into lane 1.
785370b324cSopenharmony_ci  uint64x2_t one = vdupq_n_u64(0);
786370b324cSopenharmony_ci  one = vsetq_lane_u64(1, one, 0);
  // From here on p points at the first key; the counter slot is p[-2].
787370b324cSopenharmony_ci  p += 2;
788370b324cSopenharmony_ci
789370b324cSopenharmony_ci  WIDE_LOOP_START
790370b324cSopenharmony_ci  {
791370b324cSopenharmony_ci    const v128 *w = p;
    // Declare m0..m7 and fill them with NUM_WAYS consecutive counter values.
792370b324cSopenharmony_ci    WOP (DECLARE_VAR)
793370b324cSopenharmony_ci    WOP (CTR_START)
    // Two rounds per iteration, walking the keys upwards until w == wEnd.
794370b324cSopenharmony_ci    do
795370b324cSopenharmony_ci    {
796370b324cSopenharmony_ci      WOP_KEY (AES_E_MC, 0)
797370b324cSopenharmony_ci      WOP_KEY (AES_E_MC, 1)
798370b324cSopenharmony_ci      w += 2;
799370b324cSopenharmony_ci    }
800370b324cSopenharmony_ci    while (w != wEnd);
    // Tail: one more full round, the last round without vaesmcq_u8, then XOR
    // with the final key.
801370b324cSopenharmony_ci    WOP_KEY (AES_E_MC, 0)
802370b324cSopenharmony_ci    WOP_KEY (AES_E,    1)
803370b324cSopenharmony_ci    WOP_KEY (AES_XOR,  2)
    // data[i] ^= encrypted counter i.
804370b324cSopenharmony_ci    WOP (CTR_END)
805370b324cSopenharmony_ci  }
806370b324cSopenharmony_ci  WIDE_LOOP_END
807370b324cSopenharmony_ci
  // Same sequence for the remaining blocks, using the single variable m.
808370b324cSopenharmony_ci  SINGLE_LOOP
809370b324cSopenharmony_ci  {
810370b324cSopenharmony_ci    const v128 *w = p;
811370b324cSopenharmony_ci    v128 m;
812370b324cSopenharmony_ci    CTR_START (m, 0);
813370b324cSopenharmony_ci    do
814370b324cSopenharmony_ci    {
815370b324cSopenharmony_ci      AES_E_MC_m (w[0]);
816370b324cSopenharmony_ci      AES_E_MC_m (w[1]);
817370b324cSopenharmony_ci      w += 2;
818370b324cSopenharmony_ci    }
819370b324cSopenharmony_ci    while (w != wEnd);
820370b324cSopenharmony_ci    AES_E_MC_m (w[0])
821370b324cSopenharmony_ci    AES_E_m    (w[1])
822370b324cSopenharmony_ci    MM_XOR_m   (w[2])
    // Index 0 here means the current block, since data advances every pass.
823370b324cSopenharmony_ci    CTR_END (m, 0)
824370b324cSopenharmony_ci  }
825370b324cSopenharmony_ci
  // Store the advanced counter back into element 0 of ivAes.
826370b324cSopenharmony_ci  p[-2] = vreinterpretq_u8_u64(ctr);
827370b324cSopenharmony_ci}
828370b324cSopenharmony_ci
829370b324cSopenharmony_ci#endif // USE_HW_AES
830370b324cSopenharmony_ci
831370b324cSopenharmony_ci#endif // MY_CPU_ARM_OR_ARM64
832370b324cSopenharmony_ci
// Undefine the helper and configuration macros set up earlier in this file.
// This runs outside the two conditionals closed above, so it applies on every
// build configuration.
833370b324cSopenharmony_ci#undef NUM_WAYS
834370b324cSopenharmony_ci#undef WOP_M1
835370b324cSopenharmony_ci#undef WOP
836370b324cSopenharmony_ci#undef DECLARE_VAR
837370b324cSopenharmony_ci#undef LOAD_data
838370b324cSopenharmony_ci#undef STORE_data
839370b324cSopenharmony_ci#undef USE_INTEL_AES
840370b324cSopenharmony_ci#undef USE_HW_AES
841