/*
 *  Armv8-A Cryptographic Extension support functions for AArch64
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES    1
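/* Since the crypto extensions were not enabled by -march, also request the
 * per-function target attribute/pragma that is applied further down in this
 * file. */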
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_AESCE_HAVE_CODE)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies it,
 *       please update this check and the documentation of `MBEDTLS_AESCE_C`
 *       in `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies it, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#    endif
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
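/* Fallback HWCAP bit definitions for older libc headers. These values match
 * the Linux UAPI definitions: HWCAP_NEON and HWCAP2_AES for 32-bit Arm,
 * HWCAP_ASIMD and HWCAP_AES for AArch64. */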
#if !defined(HWCAP_NEON)
#define HWCAP_NEON  (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES  (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES   (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid repeated calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to non-atomic (torn) updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
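/* Each round uses vaeseq_u8(), which performs AddRoundKey (XOR with the round
 * key loaded from `keys`) together with SubBytes and ShiftRows, followed by
 * vaesmcq_u8() for MixColumns. */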
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
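    /* AES-256 (14 rounds) falls straight through the labels below; AES-192
     * (12 rounds) enters at rounds_12 and AES-128 (10 rounds) at rounds_10,
     * skipping the rounds their shorter key schedules do not need. */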
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately, which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
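/* The decryption key schedule is the encryption schedule in reverse order,
 * with the inverse MixColumns applied to every round key except the first
 * and the last one copied (see the note above AESCE_DECRYPT_ROUND). */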
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
#endif

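/* Rotate a 32-bit word right by 8 bits: a cyclic byte rotation, used as the
 * RotWord step of the key schedule below. */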
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
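    /* For example, AES-128 has Nk = 4, so Nr = 10 and the schedule holds
     * Nb*(Nr+1) = 4*11 = 44 32-bit words; AES-256 has Nk = 8, Nr = 14 and
     * 60 words. */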
    const size_t key_len_in_words = key_bit_length / 32;    /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types that are
 * listed in [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1).
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5 and Clang on 32-bit Arm */

#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;
}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers lack some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error without the cast: it treats poly64_t
 * and poly64x1_t as distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` if the cast is present. Clang
 * accepts the code with or without the cast, treating poly64_t and
 * poly64x1_t as the same type, so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128-bit carry-less polynomial multiplication in GF(2^128),
 * defined by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64x64->128-bit polynomial multipliers, so multiplying two
 * 128-bit operands takes four 64-bit multiplies.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the
 * 256-bit result represented by three 128-bit vectors (a code-size
 * optimization).
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
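/*
 * Writing a = a1*x^64 + a0 and b = b1*x^64 + b0 (coefficients in GF(2)),
 *     a*b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0),
 * which is exactly the high/middle/low split computed below.
 */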
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);                       /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);                        /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8);                      /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);                       /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);                        /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);                         /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
 * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write
 * the 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
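/*
 * The reduction constant below is built from the byte 0x87, whose bit pattern
 * 1000 0111 encodes r(z) = z^7 + z^2 + z + 1.
 */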
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent the compiler from
     * loading MODULO from memory. This applies to GNUC-compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];            /* h3:h2:00:00                          */
    m = input.val[1];            /*   :m2:m1:00                          */
    l = input.val[2];            /*   :  :l1:l0                          */
    c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
    d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
    e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
    f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
    g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
    n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
    o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
    return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
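/*
 * GCM's bit ordering within each byte is the reverse of the convention used
 * by the polynomial-multiply instructions, hence the vrbitq_u8() bit reversal
 * of the inputs and of the result.
 */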
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */