1a8e1175bSopenharmony_ci/* 2a8e1175bSopenharmony_ci * Armv8-A Cryptographic Extension support functions for Aarch64 3a8e1175bSopenharmony_ci * 4a8e1175bSopenharmony_ci * Copyright The Mbed TLS Contributors 5a8e1175bSopenharmony_ci * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 6a8e1175bSopenharmony_ci */ 7a8e1175bSopenharmony_ci 8a8e1175bSopenharmony_ci#if defined(__clang__) && (__clang_major__ >= 4) 9a8e1175bSopenharmony_ci 10a8e1175bSopenharmony_ci/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if, 11a8e1175bSopenharmony_ci * but that is defined by build_info.h, and we need this block to happen first. */ 12a8e1175bSopenharmony_ci#if defined(__ARM_ARCH) 13a8e1175bSopenharmony_ci#if __ARM_ARCH >= 8 14a8e1175bSopenharmony_ci#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A 15a8e1175bSopenharmony_ci#endif 16a8e1175bSopenharmony_ci#endif 17a8e1175bSopenharmony_ci 18a8e1175bSopenharmony_ci#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO) 19a8e1175bSopenharmony_ci/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged. 20a8e1175bSopenharmony_ci * 21a8e1175bSopenharmony_ci * The intrinsic declaration are guarded by predefined ACLE macros in clang: 22a8e1175bSopenharmony_ci * these are normally only enabled by the -march option on the command line. 23a8e1175bSopenharmony_ci * By defining the macros ourselves we gain access to those declarations without 24a8e1175bSopenharmony_ci * requiring -march on the command line. 25a8e1175bSopenharmony_ci * 26a8e1175bSopenharmony_ci * `arm_neon.h` is included by common.h, so we put these defines 27a8e1175bSopenharmony_ci * at the top of this file, before any includes. 28a8e1175bSopenharmony_ci */ 29a8e1175bSopenharmony_ci#define __ARM_FEATURE_CRYPTO 1 30a8e1175bSopenharmony_ci/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions 31a8e1175bSopenharmony_ci * 32a8e1175bSopenharmony_ci * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it 33a8e1175bSopenharmony_ci * for older compilers. 34a8e1175bSopenharmony_ci */ 35a8e1175bSopenharmony_ci#define __ARM_FEATURE_AES 1 36a8e1175bSopenharmony_ci#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG 37a8e1175bSopenharmony_ci#endif 38a8e1175bSopenharmony_ci 39a8e1175bSopenharmony_ci#endif /* defined(__clang__) && (__clang_major__ >= 4) */ 40a8e1175bSopenharmony_ci 41a8e1175bSopenharmony_ci#include <string.h> 42a8e1175bSopenharmony_ci#include "common.h" 43a8e1175bSopenharmony_ci 44a8e1175bSopenharmony_ci#if defined(MBEDTLS_AESCE_C) 45a8e1175bSopenharmony_ci 46a8e1175bSopenharmony_ci#include "aesce.h" 47a8e1175bSopenharmony_ci 48a8e1175bSopenharmony_ci#if defined(MBEDTLS_AESCE_HAVE_CODE) 49a8e1175bSopenharmony_ci 50a8e1175bSopenharmony_ci/* Compiler version checks. */ 51a8e1175bSopenharmony_ci#if defined(__clang__) 52a8e1175bSopenharmony_ci# if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11) 53a8e1175bSopenharmony_ci# error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0." 54a8e1175bSopenharmony_ci# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4) 55a8e1175bSopenharmony_ci# error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0." 56a8e1175bSopenharmony_ci# endif 57a8e1175bSopenharmony_ci#elif defined(__GNUC__) 58a8e1175bSopenharmony_ci# if __GNUC__ < 6 59a8e1175bSopenharmony_ci# error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0." 60a8e1175bSopenharmony_ci# endif 61a8e1175bSopenharmony_ci#elif defined(_MSC_VER) 62a8e1175bSopenharmony_ci/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verified that, 63a8e1175bSopenharmony_ci * please update this and document of `MBEDTLS_AESCE_C` in 64a8e1175bSopenharmony_ci * `mbedtls_config.h`. */ 65a8e1175bSopenharmony_ci# if _MSC_VER < 1929 66a8e1175bSopenharmony_ci# error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2." 67a8e1175bSopenharmony_ci# endif 68a8e1175bSopenharmony_ci#elif defined(__ARMCC_VERSION) 69a8e1175bSopenharmony_ci# if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002) 70a8e1175bSopenharmony_ci/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20. 71a8e1175bSopenharmony_ci * If someone verified that, please update this and document of 72a8e1175bSopenharmony_ci * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */ 73a8e1175bSopenharmony_ci# error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20." 74a8e1175bSopenharmony_ci# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000) 75a8e1175bSopenharmony_ci# error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6." 76a8e1175bSopenharmony_ci# endif 77a8e1175bSopenharmony_ci#endif 78a8e1175bSopenharmony_ci 79a8e1175bSopenharmony_ci#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \ 80a8e1175bSopenharmony_ci defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG) 81a8e1175bSopenharmony_ci# if defined(__ARMCOMPILER_VERSION) 82a8e1175bSopenharmony_ci# if __ARMCOMPILER_VERSION <= 6090000 83a8e1175bSopenharmony_ci# error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C" 84a8e1175bSopenharmony_ci# else 85a8e1175bSopenharmony_ci# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function) 86a8e1175bSopenharmony_ci# define MBEDTLS_POP_TARGET_PRAGMA 87a8e1175bSopenharmony_ci# endif 88a8e1175bSopenharmony_ci# elif defined(__clang__) 89a8e1175bSopenharmony_ci# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function) 90a8e1175bSopenharmony_ci# define MBEDTLS_POP_TARGET_PRAGMA 91a8e1175bSopenharmony_ci# elif defined(__GNUC__) 92a8e1175bSopenharmony_ci# pragma GCC push_options 93a8e1175bSopenharmony_ci# pragma GCC target ("+crypto") 94a8e1175bSopenharmony_ci# define MBEDTLS_POP_TARGET_PRAGMA 95a8e1175bSopenharmony_ci# elif defined(_MSC_VER) 96a8e1175bSopenharmony_ci# error "Required feature(__ARM_FEATURE_AES) is not enabled." 97a8e1175bSopenharmony_ci# endif 98a8e1175bSopenharmony_ci#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) || 99a8e1175bSopenharmony_ci MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */ 100a8e1175bSopenharmony_ci 101a8e1175bSopenharmony_ci#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) 102a8e1175bSopenharmony_ci 103a8e1175bSopenharmony_ci#include <sys/auxv.h> 104a8e1175bSopenharmony_ci#if !defined(HWCAP_NEON) 105a8e1175bSopenharmony_ci#define HWCAP_NEON (1 << 12) 106a8e1175bSopenharmony_ci#endif 107a8e1175bSopenharmony_ci#if !defined(HWCAP2_AES) 108a8e1175bSopenharmony_ci#define HWCAP2_AES (1 << 0) 109a8e1175bSopenharmony_ci#endif 110a8e1175bSopenharmony_ci#if !defined(HWCAP_AES) 111a8e1175bSopenharmony_ci#define HWCAP_AES (1 << 3) 112a8e1175bSopenharmony_ci#endif 113a8e1175bSopenharmony_ci#if !defined(HWCAP_ASIMD) 114a8e1175bSopenharmony_ci#define HWCAP_ASIMD (1 << 1) 115a8e1175bSopenharmony_ci#endif 116a8e1175bSopenharmony_ci 117a8e1175bSopenharmony_cisigned char mbedtls_aesce_has_support_result = -1; 118a8e1175bSopenharmony_ci 119a8e1175bSopenharmony_ci#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) 120a8e1175bSopenharmony_ci/* 121a8e1175bSopenharmony_ci * AES instruction support detection routine 122a8e1175bSopenharmony_ci */ 123a8e1175bSopenharmony_ciint mbedtls_aesce_has_support_impl(void) 124a8e1175bSopenharmony_ci{ 125a8e1175bSopenharmony_ci /* To avoid many calls to getauxval, cache the result. This is 126a8e1175bSopenharmony_ci * thread-safe, because we store the result in a char so cannot 127a8e1175bSopenharmony_ci * be vulnerable to non-atomic updates. 128a8e1175bSopenharmony_ci * It is possible that we could end up setting result more than 129a8e1175bSopenharmony_ci * once, but that is harmless. 130a8e1175bSopenharmony_ci */ 131a8e1175bSopenharmony_ci if (mbedtls_aesce_has_support_result == -1) { 132a8e1175bSopenharmony_ci#if defined(MBEDTLS_ARCH_IS_ARM32) 133a8e1175bSopenharmony_ci unsigned long auxval = getauxval(AT_HWCAP); 134a8e1175bSopenharmony_ci unsigned long auxval2 = getauxval(AT_HWCAP2); 135a8e1175bSopenharmony_ci if (((auxval & HWCAP_NEON) == HWCAP_NEON) && 136a8e1175bSopenharmony_ci ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) { 137a8e1175bSopenharmony_ci mbedtls_aesce_has_support_result = 1; 138a8e1175bSopenharmony_ci } else { 139a8e1175bSopenharmony_ci mbedtls_aesce_has_support_result = 0; 140a8e1175bSopenharmony_ci } 141a8e1175bSopenharmony_ci#else 142a8e1175bSopenharmony_ci unsigned long auxval = getauxval(AT_HWCAP); 143a8e1175bSopenharmony_ci if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == 144a8e1175bSopenharmony_ci (HWCAP_ASIMD | HWCAP_AES)) { 145a8e1175bSopenharmony_ci mbedtls_aesce_has_support_result = 1; 146a8e1175bSopenharmony_ci } else { 147a8e1175bSopenharmony_ci mbedtls_aesce_has_support_result = 0; 148a8e1175bSopenharmony_ci } 149a8e1175bSopenharmony_ci#endif 150a8e1175bSopenharmony_ci } 151a8e1175bSopenharmony_ci return mbedtls_aesce_has_support_result; 152a8e1175bSopenharmony_ci} 153a8e1175bSopenharmony_ci#endif 154a8e1175bSopenharmony_ci 155a8e1175bSopenharmony_ci#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */ 156a8e1175bSopenharmony_ci 157a8e1175bSopenharmony_ci/* Single round of AESCE encryption */ 158a8e1175bSopenharmony_ci#define AESCE_ENCRYPT_ROUND \ 159a8e1175bSopenharmony_ci block = vaeseq_u8(block, vld1q_u8(keys)); \ 160a8e1175bSopenharmony_ci block = vaesmcq_u8(block); \ 161a8e1175bSopenharmony_ci keys += 16 162a8e1175bSopenharmony_ci/* Two rounds of AESCE encryption */ 163a8e1175bSopenharmony_ci#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND 164a8e1175bSopenharmony_ci 165a8e1175bSopenharmony_ciMBEDTLS_OPTIMIZE_FOR_PERFORMANCE 166a8e1175bSopenharmony_cistatic uint8x16_t aesce_encrypt_block(uint8x16_t block, 167a8e1175bSopenharmony_ci unsigned char *keys, 168a8e1175bSopenharmony_ci int rounds) 169a8e1175bSopenharmony_ci{ 170a8e1175bSopenharmony_ci /* 10, 12 or 14 rounds. Unroll loop. */ 171a8e1175bSopenharmony_ci if (rounds == 10) { 172a8e1175bSopenharmony_ci goto rounds_10; 173a8e1175bSopenharmony_ci } 174a8e1175bSopenharmony_ci if (rounds == 12) { 175a8e1175bSopenharmony_ci goto rounds_12; 176a8e1175bSopenharmony_ci } 177a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 178a8e1175bSopenharmony_cirounds_12: 179a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 180a8e1175bSopenharmony_cirounds_10: 181a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 182a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 183a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 184a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND_X2; 185a8e1175bSopenharmony_ci AESCE_ENCRYPT_ROUND; 186a8e1175bSopenharmony_ci 187a8e1175bSopenharmony_ci /* AES AddRoundKey for the previous round. 188a8e1175bSopenharmony_ci * SubBytes, ShiftRows for the final round. */ 189a8e1175bSopenharmony_ci block = vaeseq_u8(block, vld1q_u8(keys)); 190a8e1175bSopenharmony_ci keys += 16; 191a8e1175bSopenharmony_ci 192a8e1175bSopenharmony_ci /* Final round: no MixColumns */ 193a8e1175bSopenharmony_ci 194a8e1175bSopenharmony_ci /* Final AddRoundKey */ 195a8e1175bSopenharmony_ci block = veorq_u8(block, vld1q_u8(keys)); 196a8e1175bSopenharmony_ci 197a8e1175bSopenharmony_ci return block; 198a8e1175bSopenharmony_ci} 199a8e1175bSopenharmony_ci 200a8e1175bSopenharmony_ci/* Single round of AESCE decryption 201a8e1175bSopenharmony_ci * 202a8e1175bSopenharmony_ci * AES AddRoundKey, SubBytes, ShiftRows 203a8e1175bSopenharmony_ci * 204a8e1175bSopenharmony_ci * block = vaesdq_u8(block, vld1q_u8(keys)); 205a8e1175bSopenharmony_ci * 206a8e1175bSopenharmony_ci * AES inverse MixColumns for the next round. 207a8e1175bSopenharmony_ci * 208a8e1175bSopenharmony_ci * This means that we switch the order of the inverse AddRoundKey and 209a8e1175bSopenharmony_ci * inverse MixColumns operations. We have to do this as AddRoundKey is 210a8e1175bSopenharmony_ci * done in an atomic instruction together with the inverses of SubBytes 211a8e1175bSopenharmony_ci * and ShiftRows. 212a8e1175bSopenharmony_ci * 213a8e1175bSopenharmony_ci * It works because MixColumns is a linear operation over GF(2^8) and 214a8e1175bSopenharmony_ci * AddRoundKey is an exclusive or, which is equivalent to addition over 215a8e1175bSopenharmony_ci * GF(2^8). (The inverse of MixColumns needs to be applied to the 216a8e1175bSopenharmony_ci * affected round keys separately which has been done when the 217a8e1175bSopenharmony_ci * decryption round keys were calculated.) 218a8e1175bSopenharmony_ci * 219a8e1175bSopenharmony_ci * block = vaesimcq_u8(block); 220a8e1175bSopenharmony_ci */ 221a8e1175bSopenharmony_ci#define AESCE_DECRYPT_ROUND \ 222a8e1175bSopenharmony_ci block = vaesdq_u8(block, vld1q_u8(keys)); \ 223a8e1175bSopenharmony_ci block = vaesimcq_u8(block); \ 224a8e1175bSopenharmony_ci keys += 16 225a8e1175bSopenharmony_ci/* Two rounds of AESCE decryption */ 226a8e1175bSopenharmony_ci#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND 227a8e1175bSopenharmony_ci 228a8e1175bSopenharmony_ci#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 229a8e1175bSopenharmony_cistatic uint8x16_t aesce_decrypt_block(uint8x16_t block, 230a8e1175bSopenharmony_ci unsigned char *keys, 231a8e1175bSopenharmony_ci int rounds) 232a8e1175bSopenharmony_ci{ 233a8e1175bSopenharmony_ci /* 10, 12 or 14 rounds. Unroll loop. */ 234a8e1175bSopenharmony_ci if (rounds == 10) { 235a8e1175bSopenharmony_ci goto rounds_10; 236a8e1175bSopenharmony_ci } 237a8e1175bSopenharmony_ci if (rounds == 12) { 238a8e1175bSopenharmony_ci goto rounds_12; 239a8e1175bSopenharmony_ci } 240a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 241a8e1175bSopenharmony_cirounds_12: 242a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 243a8e1175bSopenharmony_cirounds_10: 244a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 245a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 246a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 247a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND_X2; 248a8e1175bSopenharmony_ci AESCE_DECRYPT_ROUND; 249a8e1175bSopenharmony_ci 250a8e1175bSopenharmony_ci /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the 251a8e1175bSopenharmony_ci * last full round. */ 252a8e1175bSopenharmony_ci block = vaesdq_u8(block, vld1q_u8(keys)); 253a8e1175bSopenharmony_ci keys += 16; 254a8e1175bSopenharmony_ci 255a8e1175bSopenharmony_ci /* Inverse AddRoundKey for inverting the initial round key addition. */ 256a8e1175bSopenharmony_ci block = veorq_u8(block, vld1q_u8(keys)); 257a8e1175bSopenharmony_ci 258a8e1175bSopenharmony_ci return block; 259a8e1175bSopenharmony_ci} 260a8e1175bSopenharmony_ci#endif 261a8e1175bSopenharmony_ci 262a8e1175bSopenharmony_ci/* 263a8e1175bSopenharmony_ci * AES-ECB block en(de)cryption 264a8e1175bSopenharmony_ci */ 265a8e1175bSopenharmony_ciint mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, 266a8e1175bSopenharmony_ci int mode, 267a8e1175bSopenharmony_ci const unsigned char input[16], 268a8e1175bSopenharmony_ci unsigned char output[16]) 269a8e1175bSopenharmony_ci{ 270a8e1175bSopenharmony_ci uint8x16_t block = vld1q_u8(&input[0]); 271a8e1175bSopenharmony_ci unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset); 272a8e1175bSopenharmony_ci 273a8e1175bSopenharmony_ci#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 274a8e1175bSopenharmony_ci if (mode == MBEDTLS_AES_DECRYPT) { 275a8e1175bSopenharmony_ci block = aesce_decrypt_block(block, keys, ctx->nr); 276a8e1175bSopenharmony_ci } else 277a8e1175bSopenharmony_ci#else 278a8e1175bSopenharmony_ci (void) mode; 279a8e1175bSopenharmony_ci#endif 280a8e1175bSopenharmony_ci { 281a8e1175bSopenharmony_ci block = aesce_encrypt_block(block, keys, ctx->nr); 282a8e1175bSopenharmony_ci } 283a8e1175bSopenharmony_ci vst1q_u8(&output[0], block); 284a8e1175bSopenharmony_ci 285a8e1175bSopenharmony_ci return 0; 286a8e1175bSopenharmony_ci} 287a8e1175bSopenharmony_ci 288a8e1175bSopenharmony_ci/* 289a8e1175bSopenharmony_ci * Compute decryption round keys from encryption round keys 290a8e1175bSopenharmony_ci */ 291a8e1175bSopenharmony_ci#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT) 292a8e1175bSopenharmony_civoid mbedtls_aesce_inverse_key(unsigned char *invkey, 293a8e1175bSopenharmony_ci const unsigned char *fwdkey, 294a8e1175bSopenharmony_ci int nr) 295a8e1175bSopenharmony_ci{ 296a8e1175bSopenharmony_ci int i, j; 297a8e1175bSopenharmony_ci j = nr; 298a8e1175bSopenharmony_ci vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16)); 299a8e1175bSopenharmony_ci for (i = 1, j--; j > 0; i++, j--) { 300a8e1175bSopenharmony_ci vst1q_u8(invkey + i * 16, 301a8e1175bSopenharmony_ci vaesimcq_u8(vld1q_u8(fwdkey + j * 16))); 302a8e1175bSopenharmony_ci } 303a8e1175bSopenharmony_ci vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16)); 304a8e1175bSopenharmony_ci 305a8e1175bSopenharmony_ci} 306a8e1175bSopenharmony_ci#endif 307a8e1175bSopenharmony_ci 308a8e1175bSopenharmony_cistatic inline uint32_t aes_rot_word(uint32_t word) 309a8e1175bSopenharmony_ci{ 310a8e1175bSopenharmony_ci return (word << (32 - 8)) | (word >> 8); 311a8e1175bSopenharmony_ci} 312a8e1175bSopenharmony_ci 313a8e1175bSopenharmony_cistatic inline uint32_t aes_sub_word(uint32_t in) 314a8e1175bSopenharmony_ci{ 315a8e1175bSopenharmony_ci uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in)); 316a8e1175bSopenharmony_ci uint8x16_t zero = vdupq_n_u8(0); 317a8e1175bSopenharmony_ci 318a8e1175bSopenharmony_ci /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields 319a8e1175bSopenharmony_ci * the correct result as ShiftRows doesn't change the first row. */ 320a8e1175bSopenharmony_ci v = vaeseq_u8(zero, v); 321a8e1175bSopenharmony_ci return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0); 322a8e1175bSopenharmony_ci} 323a8e1175bSopenharmony_ci 324a8e1175bSopenharmony_ci/* 325a8e1175bSopenharmony_ci * Key expansion function 326a8e1175bSopenharmony_ci */ 327a8e1175bSopenharmony_cistatic void aesce_setkey_enc(unsigned char *rk, 328a8e1175bSopenharmony_ci const unsigned char *key, 329a8e1175bSopenharmony_ci const size_t key_bit_length) 330a8e1175bSopenharmony_ci{ 331a8e1175bSopenharmony_ci static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 332a8e1175bSopenharmony_ci 0x20, 0x40, 0x80, 0x1b, 0x36 }; 333a8e1175bSopenharmony_ci /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf 334a8e1175bSopenharmony_ci * - Section 5, Nr = Nk + 6 335a8e1175bSopenharmony_ci * - Section 5.2, the length of round keys is Nb*(Nr+1) 336a8e1175bSopenharmony_ci */ 337a8e1175bSopenharmony_ci const size_t key_len_in_words = key_bit_length / 32; /* Nk */ 338a8e1175bSopenharmony_ci const size_t round_key_len_in_words = 4; /* Nb */ 339a8e1175bSopenharmony_ci const size_t rounds_needed = key_len_in_words + 6; /* Nr */ 340a8e1175bSopenharmony_ci const size_t round_keys_len_in_words = 341a8e1175bSopenharmony_ci round_key_len_in_words * (rounds_needed + 1); /* Nb*(Nr+1) */ 342a8e1175bSopenharmony_ci const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words; 343a8e1175bSopenharmony_ci 344a8e1175bSopenharmony_ci memcpy(rk, key, key_len_in_words * 4); 345a8e1175bSopenharmony_ci 346a8e1175bSopenharmony_ci for (uint32_t *rki = (uint32_t *) rk; 347a8e1175bSopenharmony_ci rki + key_len_in_words < rko_end; 348a8e1175bSopenharmony_ci rki += key_len_in_words) { 349a8e1175bSopenharmony_ci 350a8e1175bSopenharmony_ci size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words; 351a8e1175bSopenharmony_ci uint32_t *rko; 352a8e1175bSopenharmony_ci rko = rki + key_len_in_words; 353a8e1175bSopenharmony_ci rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1])); 354a8e1175bSopenharmony_ci rko[0] ^= rcon[iteration] ^ rki[0]; 355a8e1175bSopenharmony_ci rko[1] = rko[0] ^ rki[1]; 356a8e1175bSopenharmony_ci rko[2] = rko[1] ^ rki[2]; 357a8e1175bSopenharmony_ci rko[3] = rko[2] ^ rki[3]; 358a8e1175bSopenharmony_ci if (rko + key_len_in_words > rko_end) { 359a8e1175bSopenharmony_ci /* Do not write overflow words.*/ 360a8e1175bSopenharmony_ci continue; 361a8e1175bSopenharmony_ci } 362a8e1175bSopenharmony_ci#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH) 363a8e1175bSopenharmony_ci switch (key_bit_length) { 364a8e1175bSopenharmony_ci case 128: 365a8e1175bSopenharmony_ci break; 366a8e1175bSopenharmony_ci case 192: 367a8e1175bSopenharmony_ci rko[4] = rko[3] ^ rki[4]; 368a8e1175bSopenharmony_ci rko[5] = rko[4] ^ rki[5]; 369a8e1175bSopenharmony_ci break; 370a8e1175bSopenharmony_ci case 256: 371a8e1175bSopenharmony_ci rko[4] = aes_sub_word(rko[3]) ^ rki[4]; 372a8e1175bSopenharmony_ci rko[5] = rko[4] ^ rki[5]; 373a8e1175bSopenharmony_ci rko[6] = rko[5] ^ rki[6]; 374a8e1175bSopenharmony_ci rko[7] = rko[6] ^ rki[7]; 375a8e1175bSopenharmony_ci break; 376a8e1175bSopenharmony_ci } 377a8e1175bSopenharmony_ci#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */ 378a8e1175bSopenharmony_ci } 379a8e1175bSopenharmony_ci} 380a8e1175bSopenharmony_ci 381a8e1175bSopenharmony_ci/* 382a8e1175bSopenharmony_ci * Key expansion, wrapper 383a8e1175bSopenharmony_ci */ 384a8e1175bSopenharmony_ciint mbedtls_aesce_setkey_enc(unsigned char *rk, 385a8e1175bSopenharmony_ci const unsigned char *key, 386a8e1175bSopenharmony_ci size_t bits) 387a8e1175bSopenharmony_ci{ 388a8e1175bSopenharmony_ci switch (bits) { 389a8e1175bSopenharmony_ci case 128: 390a8e1175bSopenharmony_ci case 192: 391a8e1175bSopenharmony_ci case 256: 392a8e1175bSopenharmony_ci aesce_setkey_enc(rk, key, bits); 393a8e1175bSopenharmony_ci break; 394a8e1175bSopenharmony_ci default: 395a8e1175bSopenharmony_ci return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH; 396a8e1175bSopenharmony_ci } 397a8e1175bSopenharmony_ci 398a8e1175bSopenharmony_ci return 0; 399a8e1175bSopenharmony_ci} 400a8e1175bSopenharmony_ci 401a8e1175bSopenharmony_ci#if defined(MBEDTLS_GCM_C) 402a8e1175bSopenharmony_ci 403a8e1175bSopenharmony_ci#if defined(MBEDTLS_ARCH_IS_ARM32) 404a8e1175bSopenharmony_ci 405a8e1175bSopenharmony_ci#if defined(__clang__) 406a8e1175bSopenharmony_ci/* On clang for A32/T32, work around some missing intrinsics and types which are listed in 407a8e1175bSopenharmony_ci * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1) 408a8e1175bSopenharmony_ci * These are only required for GCM. 409a8e1175bSopenharmony_ci */ 410a8e1175bSopenharmony_ci#define vreinterpretq_u64_p64(a) ((uint64x2_t) a) 411a8e1175bSopenharmony_ci 412a8e1175bSopenharmony_citypedef uint8x16_t poly128_t; 413a8e1175bSopenharmony_ci 414a8e1175bSopenharmony_cistatic inline poly128_t vmull_p64(poly64_t a, poly64_t b) 415a8e1175bSopenharmony_ci{ 416a8e1175bSopenharmony_ci poly128_t r; 417a8e1175bSopenharmony_ci asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :); 418a8e1175bSopenharmony_ci return r; 419a8e1175bSopenharmony_ci} 420a8e1175bSopenharmony_ci 421a8e1175bSopenharmony_ci/* This is set to cause some more missing intrinsics to be defined below */ 422a8e1175bSopenharmony_ci#define COMMON_MISSING_INTRINSICS 423a8e1175bSopenharmony_ci 424a8e1175bSopenharmony_cistatic inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b) 425a8e1175bSopenharmony_ci{ 426a8e1175bSopenharmony_ci return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)), 427a8e1175bSopenharmony_ci (poly64_t) (vget_high_u64((uint64x2_t) b))); 428a8e1175bSopenharmony_ci} 429a8e1175bSopenharmony_ci 430a8e1175bSopenharmony_ci#endif /* defined(__clang__) */ 431a8e1175bSopenharmony_ci 432a8e1175bSopenharmony_cistatic inline uint8x16_t vrbitq_u8(uint8x16_t x) 433a8e1175bSopenharmony_ci{ 434a8e1175bSopenharmony_ci /* There is no vrbitq_u8 instruction in A32/T32, so provide 435a8e1175bSopenharmony_ci * an equivalent non-Neon implementation. Reverse bit order in each 436a8e1175bSopenharmony_ci * byte with 4x rbit, rev. */ 437a8e1175bSopenharmony_ci asm ("ldm %[p], { r2-r5 } \n\t" 438a8e1175bSopenharmony_ci "rbit r2, r2 \n\t" 439a8e1175bSopenharmony_ci "rev r2, r2 \n\t" 440a8e1175bSopenharmony_ci "rbit r3, r3 \n\t" 441a8e1175bSopenharmony_ci "rev r3, r3 \n\t" 442a8e1175bSopenharmony_ci "rbit r4, r4 \n\t" 443a8e1175bSopenharmony_ci "rev r4, r4 \n\t" 444a8e1175bSopenharmony_ci "rbit r5, r5 \n\t" 445a8e1175bSopenharmony_ci "rev r5, r5 \n\t" 446a8e1175bSopenharmony_ci "stm %[p], { r2-r5 } \n\t" 447a8e1175bSopenharmony_ci : 448a8e1175bSopenharmony_ci /* Output: 16 bytes of memory pointed to by &x */ 449a8e1175bSopenharmony_ci "+m" (*(uint8_t(*)[16]) &x) 450a8e1175bSopenharmony_ci : 451a8e1175bSopenharmony_ci [p] "r" (&x) 452a8e1175bSopenharmony_ci : 453a8e1175bSopenharmony_ci "r2", "r3", "r4", "r5" 454a8e1175bSopenharmony_ci ); 455a8e1175bSopenharmony_ci return x; 456a8e1175bSopenharmony_ci} 457a8e1175bSopenharmony_ci 458a8e1175bSopenharmony_ci#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */ 459a8e1175bSopenharmony_ci 460a8e1175bSopenharmony_ci#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5 461a8e1175bSopenharmony_ci/* Some intrinsics are not available for GCC 5.X. */ 462a8e1175bSopenharmony_ci#define COMMON_MISSING_INTRINSICS 463a8e1175bSopenharmony_ci#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */ 464a8e1175bSopenharmony_ci 465a8e1175bSopenharmony_ci 466a8e1175bSopenharmony_ci#if defined(COMMON_MISSING_INTRINSICS) 467a8e1175bSopenharmony_ci 468a8e1175bSopenharmony_ci/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */ 469a8e1175bSopenharmony_ci 470a8e1175bSopenharmony_ci#define vreinterpretq_p64_u8(a) ((poly64x2_t) a) 471a8e1175bSopenharmony_ci#define vreinterpretq_u8_p128(a) ((uint8x16_t) a) 472a8e1175bSopenharmony_ci 473a8e1175bSopenharmony_cistatic inline poly64x1_t vget_low_p64(poly64x2_t a) 474a8e1175bSopenharmony_ci{ 475a8e1175bSopenharmony_ci uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a)); 476a8e1175bSopenharmony_ci return (poly64x1_t) r; 477a8e1175bSopenharmony_ci 478a8e1175bSopenharmony_ci} 479a8e1175bSopenharmony_ci 480a8e1175bSopenharmony_ci#endif /* COMMON_MISSING_INTRINSICS */ 481a8e1175bSopenharmony_ci 482a8e1175bSopenharmony_ci/* vmull_p64/vmull_high_p64 wrappers. 483a8e1175bSopenharmony_ci * 484a8e1175bSopenharmony_ci * Older compilers miss some intrinsic functions for `poly*_t`. We use 485a8e1175bSopenharmony_ci * uint8x16_t and uint8x16x3_t as input/output parameters. 486a8e1175bSopenharmony_ci */ 487a8e1175bSopenharmony_ci#if defined(MBEDTLS_COMPILER_IS_GCC) 488a8e1175bSopenharmony_ci/* GCC reports incompatible type error without cast. GCC think poly64_t and 489a8e1175bSopenharmony_ci * poly64x1_t are different, that is different with MSVC and Clang. */ 490a8e1175bSopenharmony_ci#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b) 491a8e1175bSopenharmony_ci#else 492a8e1175bSopenharmony_ci/* MSVC reports `error C2440: 'type cast'` with cast. Clang does not report 493a8e1175bSopenharmony_ci * error with/without cast. And I think poly64_t and poly64x1_t are same, no 494a8e1175bSopenharmony_ci * cast for clang also. */ 495a8e1175bSopenharmony_ci#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b) 496a8e1175bSopenharmony_ci#endif /* MBEDTLS_COMPILER_IS_GCC */ 497a8e1175bSopenharmony_ci 498a8e1175bSopenharmony_cistatic inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b) 499a8e1175bSopenharmony_ci{ 500a8e1175bSopenharmony_ci 501a8e1175bSopenharmony_ci return vreinterpretq_u8_p128( 502a8e1175bSopenharmony_ci MBEDTLS_VMULL_P64( 503a8e1175bSopenharmony_ci (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)), 504a8e1175bSopenharmony_ci (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b)) 505a8e1175bSopenharmony_ci )); 506a8e1175bSopenharmony_ci} 507a8e1175bSopenharmony_ci 508a8e1175bSopenharmony_cistatic inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b) 509a8e1175bSopenharmony_ci{ 510a8e1175bSopenharmony_ci return vreinterpretq_u8_p128( 511a8e1175bSopenharmony_ci vmull_high_p64(vreinterpretq_p64_u8(a), 512a8e1175bSopenharmony_ci vreinterpretq_p64_u8(b))); 513a8e1175bSopenharmony_ci} 514a8e1175bSopenharmony_ci 515a8e1175bSopenharmony_ci/* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by 516a8e1175bSopenharmony_ci * `x^128 + x^7 + x^2 + x + 1`. 517a8e1175bSopenharmony_ci * 518a8e1175bSopenharmony_ci * Arm64 only has 64b->128b polynomial multipliers, we need to do 4 64b 519a8e1175bSopenharmony_ci * multiplies to generate a 128b. 520a8e1175bSopenharmony_ci * 521a8e1175bSopenharmony_ci * `poly_mult_128` executes polynomial multiplication and outputs 256b that 522a8e1175bSopenharmony_ci * represented by 3 128b due to code size optimization. 523a8e1175bSopenharmony_ci * 524a8e1175bSopenharmony_ci * Output layout: 525a8e1175bSopenharmony_ci * | | | | 526a8e1175bSopenharmony_ci * |------------|-------------|-------------| 527a8e1175bSopenharmony_ci * | ret.val[0] | h3:h2:00:00 | high 128b | 528a8e1175bSopenharmony_ci * | ret.val[1] | :m2:m1:00 | middle 128b | 529a8e1175bSopenharmony_ci * | ret.val[2] | : :l1:l0 | low 128b | 530a8e1175bSopenharmony_ci */ 531a8e1175bSopenharmony_cistatic inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b) 532a8e1175bSopenharmony_ci{ 533a8e1175bSopenharmony_ci uint8x16x3_t ret; 534a8e1175bSopenharmony_ci uint8x16_t h, m, l; /* retval high/middle/low */ 535a8e1175bSopenharmony_ci uint8x16_t c, d, e; 536a8e1175bSopenharmony_ci 537a8e1175bSopenharmony_ci h = pmull_high(a, b); /* h3:h2:00:00 = a1*b1 */ 538a8e1175bSopenharmony_ci l = pmull_low(a, b); /* : :l1:l0 = a0*b0 */ 539a8e1175bSopenharmony_ci c = vextq_u8(b, b, 8); /* :c1:c0 = b0:b1 */ 540a8e1175bSopenharmony_ci d = pmull_high(a, c); /* :d2:d1:00 = a1*b0 */ 541a8e1175bSopenharmony_ci e = pmull_low(a, c); /* :e2:e1:00 = a0*b1 */ 542a8e1175bSopenharmony_ci m = veorq_u8(d, e); /* :m2:m1:00 = d + e */ 543a8e1175bSopenharmony_ci 544a8e1175bSopenharmony_ci ret.val[0] = h; 545a8e1175bSopenharmony_ci ret.val[1] = m; 546a8e1175bSopenharmony_ci ret.val[2] = l; 547a8e1175bSopenharmony_ci return ret; 548a8e1175bSopenharmony_ci} 549a8e1175bSopenharmony_ci 550a8e1175bSopenharmony_ci/* 551a8e1175bSopenharmony_ci * Modulo reduction. 552a8e1175bSopenharmony_ci * 553a8e1175bSopenharmony_ci * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8 554a8e1175bSopenharmony_ci * 555a8e1175bSopenharmony_ci * Section 4.3 556a8e1175bSopenharmony_ci * 557a8e1175bSopenharmony_ci * Modular reduction is slightly more complex. Write the GCM modulus as f(z) = 558a8e1175bSopenharmony_ci * z^128 +r(z), where r(z) = z^7+z^2+z+ 1. The well known approach is to 559a8e1175bSopenharmony_ci * consider that z^128 ≡r(z) (mod z^128 +r(z)), allowing us to write the 256-bit 560a8e1175bSopenharmony_ci * operand to be reduced as a(z) = h(z)z^128 +l(z)≡h(z)r(z) + l(z). That is, we 561a8e1175bSopenharmony_ci * simply multiply the higher part of the operand by r(z) and add it to l(z). If 562a8e1175bSopenharmony_ci * the result is still larger than 128 bits, we reduce again. 563a8e1175bSopenharmony_ci */ 564a8e1175bSopenharmony_cistatic inline uint8x16_t poly_mult_reduce(uint8x16x3_t input) 565a8e1175bSopenharmony_ci{ 566a8e1175bSopenharmony_ci uint8x16_t const ZERO = vdupq_n_u8(0); 567a8e1175bSopenharmony_ci 568a8e1175bSopenharmony_ci uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87)); 569a8e1175bSopenharmony_ci#if defined(__GNUC__) 570a8e1175bSopenharmony_ci /* use 'asm' as an optimisation barrier to prevent loading MODULO from 571a8e1175bSopenharmony_ci * memory. It is for GNUC compatible compilers. 572a8e1175bSopenharmony_ci */ 573a8e1175bSopenharmony_ci asm volatile ("" : "+w" (r)); 574a8e1175bSopenharmony_ci#endif 575a8e1175bSopenharmony_ci uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8)); 576a8e1175bSopenharmony_ci uint8x16_t h, m, l; /* input high/middle/low 128b */ 577a8e1175bSopenharmony_ci uint8x16_t c, d, e, f, g, n, o; 578a8e1175bSopenharmony_ci h = input.val[0]; /* h3:h2:00:00 */ 579a8e1175bSopenharmony_ci m = input.val[1]; /* :m2:m1:00 */ 580a8e1175bSopenharmony_ci l = input.val[2]; /* : :l1:l0 */ 581a8e1175bSopenharmony_ci c = pmull_high(h, MODULO); /* :c2:c1:00 = reduction of h3 */ 582a8e1175bSopenharmony_ci d = pmull_low(h, MODULO); /* : :d1:d0 = reduction of h2 */ 583a8e1175bSopenharmony_ci e = veorq_u8(c, m); /* :e2:e1:00 = m2:m1:00 + c2:c1:00 */ 584a8e1175bSopenharmony_ci f = pmull_high(e, MODULO); /* : :f1:f0 = reduction of e2 */ 585a8e1175bSopenharmony_ci g = vextq_u8(ZERO, e, 8); /* : :g1:00 = e1:00 */ 586a8e1175bSopenharmony_ci n = veorq_u8(d, l); /* : :n1:n0 = d1:d0 + l1:l0 */ 587a8e1175bSopenharmony_ci o = veorq_u8(n, f); /* o1:o0 = f1:f0 + n1:n0 */ 588a8e1175bSopenharmony_ci return veorq_u8(o, g); /* = o1:o0 + g1:00 */ 589a8e1175bSopenharmony_ci} 590a8e1175bSopenharmony_ci 591a8e1175bSopenharmony_ci/* 592a8e1175bSopenharmony_ci * GCM multiplication: c = a times b in GF(2^128) 593a8e1175bSopenharmony_ci */ 594a8e1175bSopenharmony_civoid mbedtls_aesce_gcm_mult(unsigned char c[16], 595a8e1175bSopenharmony_ci const unsigned char a[16], 596a8e1175bSopenharmony_ci const unsigned char b[16]) 597a8e1175bSopenharmony_ci{ 598a8e1175bSopenharmony_ci uint8x16_t va, vb, vc; 599a8e1175bSopenharmony_ci va = vrbitq_u8(vld1q_u8(&a[0])); 600a8e1175bSopenharmony_ci vb = vrbitq_u8(vld1q_u8(&b[0])); 601a8e1175bSopenharmony_ci vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb))); 602a8e1175bSopenharmony_ci vst1q_u8(&c[0], vc); 603a8e1175bSopenharmony_ci} 604a8e1175bSopenharmony_ci 605a8e1175bSopenharmony_ci#endif /* MBEDTLS_GCM_C */ 606a8e1175bSopenharmony_ci 607a8e1175bSopenharmony_ci#if defined(MBEDTLS_POP_TARGET_PRAGMA) 608a8e1175bSopenharmony_ci#if defined(__clang__) 609a8e1175bSopenharmony_ci#pragma clang attribute pop 610a8e1175bSopenharmony_ci#elif defined(__GNUC__) 611a8e1175bSopenharmony_ci#pragma GCC pop_options 612a8e1175bSopenharmony_ci#endif 613a8e1175bSopenharmony_ci#undef MBEDTLS_POP_TARGET_PRAGMA 614a8e1175bSopenharmony_ci#endif 615a8e1175bSopenharmony_ci 616a8e1175bSopenharmony_ci#endif /* MBEDTLS_AESCE_HAVE_CODE */ 617a8e1175bSopenharmony_ci 618a8e1175bSopenharmony_ci#endif /* MBEDTLS_AESCE_C */ 619