#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# Reason for undertaken effort is that there is at least one popular
# SoC based on Cortex-A53 that doesn't have crypto extensions.
25e1051a39Sopenharmony_ci# 26e1051a39Sopenharmony_ci# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] 27e1051a39Sopenharmony_ci# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] 28e1051a39Sopenharmony_ci# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] 29e1051a39Sopenharmony_ci# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] 30e1051a39Sopenharmony_ci# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] 31e1051a39Sopenharmony_ci# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] 32e1051a39Sopenharmony_ci# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] 33e1051a39Sopenharmony_ci# ThunderX2(***) 39.4(**) 33.8/48.6(**) 34e1051a39Sopenharmony_ci# 35e1051a39Sopenharmony_ci# (*) ECB denotes approximate result for parallelizable modes 36e1051a39Sopenharmony_ci# such as CBC decrypt, CTR, etc.; 37e1051a39Sopenharmony_ci# (**) these results are worse than scalar compiler-generated 38e1051a39Sopenharmony_ci# code, but it's constant-time and therefore preferred; 39e1051a39Sopenharmony_ci# (***) presented for reference/comparison purposes; 40e1051a39Sopenharmony_ci 41e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension) 42e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file 43e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 44e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; 45e1051a39Sopenharmony_ci 46e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 47e1051a39Sopenharmony_ci( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 48e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 49e1051a39Sopenharmony_cidie "can't locate arm-xlate.pl"; 50e1051a39Sopenharmony_ci 51e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" $xlate $flavour \"$output\"" 52e1051a39Sopenharmony_ci or die "can't call $xlate: $!"; 53e1051a39Sopenharmony_ci*STDOUT=*OUT; 54e1051a39Sopenharmony_ci 55e1051a39Sopenharmony_ci$code.=<<___; 56e1051a39Sopenharmony_ci.rodata 57e1051a39Sopenharmony_ci 58e1051a39Sopenharmony_ci.type _vpaes_consts,%object 59e1051a39Sopenharmony_ci.align 7 // totally strategic alignment 60e1051a39Sopenharmony_ci_vpaes_consts: 61e1051a39Sopenharmony_ci.Lk_mc_forward: // mc_forward 62e1051a39Sopenharmony_ci .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 63e1051a39Sopenharmony_ci .quad 0x080B0A0904070605, 0x000302010C0F0E0D 64e1051a39Sopenharmony_ci .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 65e1051a39Sopenharmony_ci .quad 0x000302010C0F0E0D, 0x080B0A0904070605 66e1051a39Sopenharmony_ci.Lk_mc_backward:// mc_backward 67e1051a39Sopenharmony_ci .quad 0x0605040702010003, 0x0E0D0C0F0A09080B 68e1051a39Sopenharmony_ci .quad 0x020100030E0D0C0F, 0x0A09080B06050407 69e1051a39Sopenharmony_ci .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 70e1051a39Sopenharmony_ci .quad 0x0A09080B06050407, 0x020100030E0D0C0F 71e1051a39Sopenharmony_ci.Lk_sr: // sr 72e1051a39Sopenharmony_ci .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 73e1051a39Sopenharmony_ci .quad 0x030E09040F0A0500, 0x0B06010C07020D08 74e1051a39Sopenharmony_ci .quad 0x0F060D040B020900, 0x070E050C030A0108 75e1051a39Sopenharmony_ci .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 76e1051a39Sopenharmony_ci 77e1051a39Sopenharmony_ci// 78e1051a39Sopenharmony_ci// "Hot" constants 79e1051a39Sopenharmony_ci// 80e1051a39Sopenharmony_ci.Lk_inv: // inv, inva 
81e1051a39Sopenharmony_ci .quad 0x0E05060F0D080180, 0x040703090A0B0C02 82e1051a39Sopenharmony_ci .quad 0x01040A060F0B0780, 0x030D0E0C02050809 83e1051a39Sopenharmony_ci.Lk_ipt: // input transform (lo, hi) 84e1051a39Sopenharmony_ci .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 85e1051a39Sopenharmony_ci .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 86e1051a39Sopenharmony_ci.Lk_sbo: // sbou, sbot 87e1051a39Sopenharmony_ci .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 88e1051a39Sopenharmony_ci .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA 89e1051a39Sopenharmony_ci.Lk_sb1: // sb1u, sb1t 90e1051a39Sopenharmony_ci .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF 91e1051a39Sopenharmony_ci .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 92e1051a39Sopenharmony_ci.Lk_sb2: // sb2u, sb2t 93e1051a39Sopenharmony_ci .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A 94e1051a39Sopenharmony_ci .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD 95e1051a39Sopenharmony_ci 96e1051a39Sopenharmony_ci// 97e1051a39Sopenharmony_ci// Decryption stuff 98e1051a39Sopenharmony_ci// 99e1051a39Sopenharmony_ci.Lk_dipt: // decryption input transform 100e1051a39Sopenharmony_ci .quad 0x0F505B040B545F00, 0x154A411E114E451A 101e1051a39Sopenharmony_ci .quad 0x86E383E660056500, 0x12771772F491F194 102e1051a39Sopenharmony_ci.Lk_dsbo: // decryption sbox final output 103e1051a39Sopenharmony_ci .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D 104e1051a39Sopenharmony_ci .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C 105e1051a39Sopenharmony_ci.Lk_dsb9: // decryption sbox output *9*u, *9*t 106e1051a39Sopenharmony_ci .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 107e1051a39Sopenharmony_ci .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 108e1051a39Sopenharmony_ci.Lk_dsbd: // decryption sbox output *D*u, *D*t 109e1051a39Sopenharmony_ci .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 110e1051a39Sopenharmony_ci .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 111e1051a39Sopenharmony_ci.Lk_dsbb: // decryption sbox output *B*u, *B*t 112e1051a39Sopenharmony_ci 
.quad 0xD022649296B44200, 0x602646F6B0F2D404 113e1051a39Sopenharmony_ci .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B 114e1051a39Sopenharmony_ci.Lk_dsbe: // decryption sbox output *E*u, *E*t 115e1051a39Sopenharmony_ci .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 116e1051a39Sopenharmony_ci .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 117e1051a39Sopenharmony_ci 118e1051a39Sopenharmony_ci// 119e1051a39Sopenharmony_ci// Key schedule constants 120e1051a39Sopenharmony_ci// 121e1051a39Sopenharmony_ci.Lk_dksd: // decryption key schedule: invskew x*D 122e1051a39Sopenharmony_ci .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 123e1051a39Sopenharmony_ci .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 124e1051a39Sopenharmony_ci.Lk_dksb: // decryption key schedule: invskew x*B 125e1051a39Sopenharmony_ci .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 126e1051a39Sopenharmony_ci .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 127e1051a39Sopenharmony_ci.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 128e1051a39Sopenharmony_ci .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 129e1051a39Sopenharmony_ci .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 130e1051a39Sopenharmony_ci.Lk_dks9: // decryption key schedule: invskew x*9 131e1051a39Sopenharmony_ci .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC 132e1051a39Sopenharmony_ci .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE 133e1051a39Sopenharmony_ci 134e1051a39Sopenharmony_ci.Lk_rcon: // rcon 135e1051a39Sopenharmony_ci .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 136e1051a39Sopenharmony_ci 137e1051a39Sopenharmony_ci.Lk_opt: // output transform 138e1051a39Sopenharmony_ci .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 139e1051a39Sopenharmony_ci .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 140e1051a39Sopenharmony_ci.Lk_deskew: // deskew tables: inverts the sbox's "skew" 141e1051a39Sopenharmony_ci .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A 142e1051a39Sopenharmony_ci .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 143e1051a39Sopenharmony_ci 
144e1051a39Sopenharmony_ci.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" 145e1051a39Sopenharmony_ci.size _vpaes_consts,.-_vpaes_consts 146e1051a39Sopenharmony_ci.align 6 147e1051a39Sopenharmony_ci 148e1051a39Sopenharmony_ci.text 149e1051a39Sopenharmony_ci 150e1051a39Sopenharmony_ci___ 151e1051a39Sopenharmony_ci 152e1051a39Sopenharmony_ci{ 153e1051a39Sopenharmony_cimy ($inp,$out,$key) = map("x$_",(0..2)); 154e1051a39Sopenharmony_ci 155e1051a39Sopenharmony_cimy ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); 156e1051a39Sopenharmony_cimy ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); 157e1051a39Sopenharmony_cimy ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); 158e1051a39Sopenharmony_ci 159e1051a39Sopenharmony_ci$code.=<<___; 160e1051a39Sopenharmony_ci// 161e1051a39Sopenharmony_ci// _aes_preheat 162e1051a39Sopenharmony_ci// 163e1051a39Sopenharmony_ci// Fills register %r10 -> .aes_consts (so you can -fPIC) 164e1051a39Sopenharmony_ci// and %xmm9-%xmm15 as specified below. 165e1051a39Sopenharmony_ci// 166e1051a39Sopenharmony_ci.type _vpaes_encrypt_preheat,%function 167e1051a39Sopenharmony_ci.align 4 168e1051a39Sopenharmony_ci_vpaes_encrypt_preheat: 169e1051a39Sopenharmony_ci adrp x10, .Lk_inv 170e1051a39Sopenharmony_ci add x10, x10, :lo12:.Lk_inv 171e1051a39Sopenharmony_ci movi v17.16b, #0x0f 172e1051a39Sopenharmony_ci ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv 173e1051a39Sopenharmony_ci ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo 174e1051a39Sopenharmony_ci ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 175e1051a39Sopenharmony_ci ret 176e1051a39Sopenharmony_ci.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat 177e1051a39Sopenharmony_ci 178e1051a39Sopenharmony_ci// 179e1051a39Sopenharmony_ci// _aes_encrypt_core 180e1051a39Sopenharmony_ci// 181e1051a39Sopenharmony_ci// AES-encrypt %xmm0. 
182e1051a39Sopenharmony_ci// 183e1051a39Sopenharmony_ci// Inputs: 184e1051a39Sopenharmony_ci// %xmm0 = input 185e1051a39Sopenharmony_ci// %xmm9-%xmm15 as in _vpaes_preheat 186e1051a39Sopenharmony_ci// (%rdx) = scheduled keys 187e1051a39Sopenharmony_ci// 188e1051a39Sopenharmony_ci// Output in %xmm0 189e1051a39Sopenharmony_ci// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax 190e1051a39Sopenharmony_ci// Preserves %xmm6 - %xmm8 so you get some local vectors 191e1051a39Sopenharmony_ci// 192e1051a39Sopenharmony_ci// 193e1051a39Sopenharmony_ci.type _vpaes_encrypt_core,%function 194e1051a39Sopenharmony_ci.align 4 195e1051a39Sopenharmony_ci_vpaes_encrypt_core: 196e1051a39Sopenharmony_ci mov x9, $key 197e1051a39Sopenharmony_ci ldr w8, [$key,#240] // pull rounds 198e1051a39Sopenharmony_ci adrp x11, .Lk_mc_forward+16 199e1051a39Sopenharmony_ci add x11, x11, :lo12:.Lk_mc_forward+16 200e1051a39Sopenharmony_ci // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 201e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 202e1051a39Sopenharmony_ci and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 203e1051a39Sopenharmony_ci ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 204e1051a39Sopenharmony_ci tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 205e1051a39Sopenharmony_ci // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 206e1051a39Sopenharmony_ci tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 207e1051a39Sopenharmony_ci eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 208e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 209e1051a39Sopenharmony_ci b .Lenc_entry 210e1051a39Sopenharmony_ci 211e1051a39Sopenharmony_ci.align 4 212e1051a39Sopenharmony_ci.Lenc_loop: 213e1051a39Sopenharmony_ci // middle of middle round 214e1051a39Sopenharmony_ci add x10, x11, #0x40 215e1051a39Sopenharmony_ci tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 216e1051a39Sopenharmony_ci ld1 {v1.2d}, [x11], #16 
// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 217e1051a39Sopenharmony_ci tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 218e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 219e1051a39Sopenharmony_ci tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 220e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 221e1051a39Sopenharmony_ci tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 222e1051a39Sopenharmony_ci ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 223e1051a39Sopenharmony_ci tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 224e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 225e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 226e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 227e1051a39Sopenharmony_ci tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 228e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 229e1051a39Sopenharmony_ci and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 230e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 231e1051a39Sopenharmony_ci sub w8, w8, #1 // nr-- 232e1051a39Sopenharmony_ci 233e1051a39Sopenharmony_ci.Lenc_entry: 234e1051a39Sopenharmony_ci // top of round 235e1051a39Sopenharmony_ci and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 236e1051a39Sopenharmony_ci ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 237e1051a39Sopenharmony_ci tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 238e1051a39Sopenharmony_ci eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 239e1051a39Sopenharmony_ci tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 240e1051a39Sopenharmony_ci tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 241e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 242e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 243e1051a39Sopenharmony_ci tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 244e1051a39Sopenharmony_ci tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 245e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 246e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 247e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 248e1051a39Sopenharmony_ci cbnz w8, .Lenc_loop 249e1051a39Sopenharmony_ci 250e1051a39Sopenharmony_ci // middle of last round 251e1051a39Sopenharmony_ci add x10, x11, #0x80 252e1051a39Sopenharmony_ci // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 253e1051a39Sopenharmony_ci // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 254e1051a39Sopenharmony_ci tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 255e1051a39Sopenharmony_ci ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 
# .Lk_sr[] 256e1051a39Sopenharmony_ci tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 257e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 258e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 259e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 260e1051a39Sopenharmony_ci ret 261e1051a39Sopenharmony_ci.size _vpaes_encrypt_core,.-_vpaes_encrypt_core 262e1051a39Sopenharmony_ci 263e1051a39Sopenharmony_ci.globl vpaes_encrypt 264e1051a39Sopenharmony_ci.type vpaes_encrypt,%function 265e1051a39Sopenharmony_ci.align 4 266e1051a39Sopenharmony_civpaes_encrypt: 267e1051a39Sopenharmony_ci .inst 0xd503233f // paciasp 268e1051a39Sopenharmony_ci stp x29,x30,[sp,#-16]! 269e1051a39Sopenharmony_ci add x29,sp,#0 270e1051a39Sopenharmony_ci 271e1051a39Sopenharmony_ci ld1 {v7.16b}, [$inp] 272e1051a39Sopenharmony_ci bl _vpaes_encrypt_preheat 273e1051a39Sopenharmony_ci bl _vpaes_encrypt_core 274e1051a39Sopenharmony_ci st1 {v0.16b}, [$out] 275e1051a39Sopenharmony_ci 276e1051a39Sopenharmony_ci ldp x29,x30,[sp],#16 277e1051a39Sopenharmony_ci .inst 0xd50323bf // autiasp 278e1051a39Sopenharmony_ci ret 279e1051a39Sopenharmony_ci.size vpaes_encrypt,.-vpaes_encrypt 280e1051a39Sopenharmony_ci 281e1051a39Sopenharmony_ci.type _vpaes_encrypt_2x,%function 282e1051a39Sopenharmony_ci.align 4 283e1051a39Sopenharmony_ci_vpaes_encrypt_2x: 284e1051a39Sopenharmony_ci mov x9, $key 285e1051a39Sopenharmony_ci ldr w8, [$key,#240] // pull rounds 286e1051a39Sopenharmony_ci adrp x11, .Lk_mc_forward+16 287e1051a39Sopenharmony_ci add x11, x11, :lo12:.Lk_mc_forward+16 288e1051a39Sopenharmony_ci // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 289e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 290e1051a39Sopenharmony_ci and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 291e1051a39Sopenharmony_ci ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 
292e1051a39Sopenharmony_ci and v9.16b, v15.16b, v17.16b 293e1051a39Sopenharmony_ci ushr v8.16b, v15.16b, #4 294e1051a39Sopenharmony_ci tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 295e1051a39Sopenharmony_ci tbl v9.16b, {$iptlo}, v9.16b 296e1051a39Sopenharmony_ci // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 297e1051a39Sopenharmony_ci tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 298e1051a39Sopenharmony_ci tbl v10.16b, {$ipthi}, v8.16b 299e1051a39Sopenharmony_ci eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 300e1051a39Sopenharmony_ci eor v8.16b, v9.16b, v16.16b 301e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 302e1051a39Sopenharmony_ci eor v8.16b, v8.16b, v10.16b 303e1051a39Sopenharmony_ci b .Lenc_2x_entry 304e1051a39Sopenharmony_ci 305e1051a39Sopenharmony_ci.align 4 306e1051a39Sopenharmony_ci.Lenc_2x_loop: 307e1051a39Sopenharmony_ci // middle of middle round 308e1051a39Sopenharmony_ci add x10, x11, #0x40 309e1051a39Sopenharmony_ci tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 310e1051a39Sopenharmony_ci tbl v12.16b, {$sb1t}, v10.16b 311e1051a39Sopenharmony_ci ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 312e1051a39Sopenharmony_ci tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 313e1051a39Sopenharmony_ci tbl v8.16b, {$sb1u}, v11.16b 314e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 315e1051a39Sopenharmony_ci eor v12.16b, v12.16b, v16.16b 316e1051a39Sopenharmony_ci tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 317e1051a39Sopenharmony_ci tbl v13.16b, {$sb2t}, v10.16b 318e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 319e1051a39Sopenharmony_ci eor v8.16b, v8.16b, v12.16b 320e1051a39Sopenharmony_ci tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 321e1051a39Sopenharmony_ci tbl v10.16b, 
{$sb2u}, v11.16b 322e1051a39Sopenharmony_ci ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 323e1051a39Sopenharmony_ci tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 324e1051a39Sopenharmony_ci tbl v11.16b, {v8.16b}, v1.16b 325e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 326e1051a39Sopenharmony_ci eor v10.16b, v10.16b, v13.16b 327e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 328e1051a39Sopenharmony_ci tbl v8.16b, {v8.16b}, v4.16b 329e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 330e1051a39Sopenharmony_ci eor v11.16b, v11.16b, v10.16b 331e1051a39Sopenharmony_ci tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 332e1051a39Sopenharmony_ci tbl v12.16b, {v11.16b},v1.16b 333e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 334e1051a39Sopenharmony_ci eor v8.16b, v8.16b, v11.16b 335e1051a39Sopenharmony_ci and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 336e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 337e1051a39Sopenharmony_ci eor v8.16b, v8.16b, v12.16b 338e1051a39Sopenharmony_ci sub w8, w8, #1 // nr-- 339e1051a39Sopenharmony_ci 340e1051a39Sopenharmony_ci.Lenc_2x_entry: 341e1051a39Sopenharmony_ci // top of round 342e1051a39Sopenharmony_ci and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 343e1051a39Sopenharmony_ci ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 344e1051a39Sopenharmony_ci and v9.16b, v8.16b, v17.16b 345e1051a39Sopenharmony_ci ushr v8.16b, v8.16b, #4 346e1051a39Sopenharmony_ci tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 347e1051a39Sopenharmony_ci tbl v13.16b, {$invhi},v9.16b 348e1051a39Sopenharmony_ci eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 349e1051a39Sopenharmony_ci eor v9.16b, v9.16b, v8.16b 350e1051a39Sopenharmony_ci tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 351e1051a39Sopenharmony_ci tbl v11.16b, {$invlo},v8.16b 352e1051a39Sopenharmony_ci tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 353e1051a39Sopenharmony_ci tbl v12.16b, {$invlo},v9.16b 354e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 355e1051a39Sopenharmony_ci eor v11.16b, v11.16b, v13.16b 356e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 357e1051a39Sopenharmony_ci eor v12.16b, v12.16b, v13.16b 358e1051a39Sopenharmony_ci tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 359e1051a39Sopenharmony_ci tbl v10.16b, {$invlo},v11.16b 360e1051a39Sopenharmony_ci tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 361e1051a39Sopenharmony_ci tbl v11.16b, {$invlo},v12.16b 362e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 363e1051a39Sopenharmony_ci eor v10.16b, v10.16b, v9.16b 
364e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 365e1051a39Sopenharmony_ci eor v11.16b, v11.16b, v8.16b 366e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 367e1051a39Sopenharmony_ci cbnz w8, .Lenc_2x_loop 368e1051a39Sopenharmony_ci 369e1051a39Sopenharmony_ci // middle of last round 370e1051a39Sopenharmony_ci add x10, x11, #0x80 371e1051a39Sopenharmony_ci // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 372e1051a39Sopenharmony_ci // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 373e1051a39Sopenharmony_ci tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 374e1051a39Sopenharmony_ci tbl v12.16b, {$sbou}, v10.16b 375e1051a39Sopenharmony_ci ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 376e1051a39Sopenharmony_ci tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 377e1051a39Sopenharmony_ci tbl v8.16b, {$sbot}, v11.16b 378e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 379e1051a39Sopenharmony_ci eor v12.16b, v12.16b, v16.16b 380e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 381e1051a39Sopenharmony_ci eor v8.16b, v8.16b, v12.16b 382e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 383e1051a39Sopenharmony_ci tbl v1.16b, {v8.16b},v1.16b 384e1051a39Sopenharmony_ci ret 385e1051a39Sopenharmony_ci.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x 386e1051a39Sopenharmony_ci 387e1051a39Sopenharmony_ci.type _vpaes_decrypt_preheat,%function 388e1051a39Sopenharmony_ci.align 4 389e1051a39Sopenharmony_ci_vpaes_decrypt_preheat: 390e1051a39Sopenharmony_ci adrp x10, .Lk_inv 391e1051a39Sopenharmony_ci add x10, x10, :lo12:.Lk_inv 392e1051a39Sopenharmony_ci movi v17.16b, #0x0f 393e1051a39Sopenharmony_ci adrp x11, .Lk_dipt 394e1051a39Sopenharmony_ci add x11, x11, :lo12:.Lk_dipt 395e1051a39Sopenharmony_ci ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv 
396e1051a39Sopenharmony_ci ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo 397e1051a39Sopenharmony_ci ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd 398e1051a39Sopenharmony_ci ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe 399e1051a39Sopenharmony_ci ret 400e1051a39Sopenharmony_ci.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat 401e1051a39Sopenharmony_ci 402e1051a39Sopenharmony_ci// 403e1051a39Sopenharmony_ci// Decryption core 404e1051a39Sopenharmony_ci// 405e1051a39Sopenharmony_ci// Same API as encryption core. 406e1051a39Sopenharmony_ci// 407e1051a39Sopenharmony_ci.type _vpaes_decrypt_core,%function 408e1051a39Sopenharmony_ci.align 4 409e1051a39Sopenharmony_ci_vpaes_decrypt_core: 410e1051a39Sopenharmony_ci mov x9, $key 411e1051a39Sopenharmony_ci ldr w8, [$key,#240] // pull rounds 412e1051a39Sopenharmony_ci 413e1051a39Sopenharmony_ci // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 414e1051a39Sopenharmony_ci lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 415e1051a39Sopenharmony_ci eor x11, x11, #0x30 // xor \$0x30, %r11 416e1051a39Sopenharmony_ci adrp x10, .Lk_sr 417e1051a39Sopenharmony_ci add x10, x10, :lo12:.Lk_sr 418e1051a39Sopenharmony_ci and x11, x11, #0x30 // and \$0x30, %r11 419e1051a39Sopenharmony_ci add x11, x11, x10 420e1051a39Sopenharmony_ci adrp x10, .Lk_mc_forward+48 421e1051a39Sopenharmony_ci add x10, x10, :lo12:.Lk_mc_forward+48 422e1051a39Sopenharmony_ci 423e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 424e1051a39Sopenharmony_ci and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 425e1051a39Sopenharmony_ci ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 426e1051a39Sopenharmony_ci tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 427e1051a39Sopenharmony_ci ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 428e1051a39Sopenharmony_ci // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 429e1051a39Sopenharmony_ci tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 
430e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 431e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 432e1051a39Sopenharmony_ci b .Ldec_entry 433e1051a39Sopenharmony_ci 434e1051a39Sopenharmony_ci.align 4 435e1051a39Sopenharmony_ci.Ldec_loop: 436e1051a39Sopenharmony_ci// 437e1051a39Sopenharmony_ci// Inverse mix columns 438e1051a39Sopenharmony_ci// 439e1051a39Sopenharmony_ci // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 440e1051a39Sopenharmony_ci // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 441e1051a39Sopenharmony_ci tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 442e1051a39Sopenharmony_ci tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 443e1051a39Sopenharmony_ci eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 444e1051a39Sopenharmony_ci // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 445e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 446e1051a39Sopenharmony_ci // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 447e1051a39Sopenharmony_ci 448e1051a39Sopenharmony_ci tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 449e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 450e1051a39Sopenharmony_ci tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 451e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 452e1051a39Sopenharmony_ci // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 453e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 454e1051a39Sopenharmony_ci // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 455e1051a39Sopenharmony_ci 456e1051a39Sopenharmony_ci tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 457e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 458e1051a39Sopenharmony_ci tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = 
sbbt 459e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 460e1051a39Sopenharmony_ci // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 461e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 462e1051a39Sopenharmony_ci // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 463e1051a39Sopenharmony_ci 464e1051a39Sopenharmony_ci tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 465e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 466e1051a39Sopenharmony_ci tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 467e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 468e1051a39Sopenharmony_ci ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 469e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 470e1051a39Sopenharmony_ci sub w8, w8, #1 // sub \$1,%rax # nr-- 471e1051a39Sopenharmony_ci 472e1051a39Sopenharmony_ci.Ldec_entry: 473e1051a39Sopenharmony_ci // top of round 474e1051a39Sopenharmony_ci and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 475e1051a39Sopenharmony_ci ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 476e1051a39Sopenharmony_ci tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 477e1051a39Sopenharmony_ci eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 478e1051a39Sopenharmony_ci tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 479e1051a39Sopenharmony_ci tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 480e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 481e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 482e1051a39Sopenharmony_ci tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 483e1051a39Sopenharmony_ci tbl 
v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 484e1051a39Sopenharmony_ci eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 485e1051a39Sopenharmony_ci eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 486e1051a39Sopenharmony_ci ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 487e1051a39Sopenharmony_ci cbnz w8, .Ldec_loop 488e1051a39Sopenharmony_ci 489e1051a39Sopenharmony_ci // middle of last round 490e1051a39Sopenharmony_ci // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 491e1051a39Sopenharmony_ci tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 492e1051a39Sopenharmony_ci // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 493e1051a39Sopenharmony_ci ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 494e1051a39Sopenharmony_ci tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 495e1051a39Sopenharmony_ci eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 496e1051a39Sopenharmony_ci eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 497e1051a39Sopenharmony_ci tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 498e1051a39Sopenharmony_ci ret 499e1051a39Sopenharmony_ci.size _vpaes_decrypt_core,.-_vpaes_decrypt_core 500e1051a39Sopenharmony_ci 501e1051a39Sopenharmony_ci.globl vpaes_decrypt 502e1051a39Sopenharmony_ci.type vpaes_decrypt,%function 503e1051a39Sopenharmony_ci.align 4 504e1051a39Sopenharmony_civpaes_decrypt: 505e1051a39Sopenharmony_ci .inst 0xd503233f // paciasp 506e1051a39Sopenharmony_ci stp x29,x30,[sp,#-16]! 
// NOTE(review): the lines down to the first .size directive are the
// remainder of vpaes_decrypt, whose prologue precedes this point. They
// are reproduced unchanged.
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf	// autiasp
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

//
// _vpaes_decrypt_2x
//
// Two-block form of the decryption core. Every step applied to the
// first block in v0-v4 is repeated for the second block in v8-v12.
//
// Inputs:
//   v14, v15  the two input blocks
//   v17       nibble mask; not written here (presumably 0x0F in every
//             byte, set by the decrypt preheat - confirm)
//   key       pointer to the key schedule; the round count is read
//             from offset 240
//   The lookup tables (ipt, sb9, sbd, sbb, sbe, sbo, inv) are taken
//   from registers that this routine does not load.
//
// Outputs:
//   v0, v1    the two result blocks
//
// Writes v0-v5, v8-v12, v16, w8 and x9-x11.
//
// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]	// pull rounds

	// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	// x11 = .Lk_sr + (((rounds << 4) ^ 0x30) & 0x30); the 16 bytes
	// there are loaded after the last round as the final permutation.
	lsl	x11, x8, #4	// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30	// xor \$0x30, %r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30	// and \$0x30, %r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm4 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {$iptlo},v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]	// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi},v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	tbl	v8.16b, {$ipthi},v8.16b
	eor	v2.16b, v2.16b, v16.16b	// vpxor %xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
// Inverse mix columns
//
	// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
	// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b, {$sb9t}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	tbl	v9.16b, {$sb9t}, v11.16b
	eor	v0.16b, v4.16b, v16.16b	// vpxor %xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
	// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl	v12.16b, {$sbdu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbdt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	tbl	v9.16b, {$sbdt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbbt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	tbl	v9.16b, {$sbbt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbet}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	tbl	v9.16b, {$sbet}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// Rotate the permutation in v5 by 12 bytes for the next round.
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1	// sub \$1,%rax # nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {$invhi},v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {$invlo},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	// Fetch the next round key; x9 advances by 16 bytes per round.
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
	// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	tbl	v1.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	tbl	v9.16b, {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]	// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	// The second block is permuted out of v8 into v1, the second output.
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
___
}
{
# Register names interpolated into the key schedule assembly that follows.
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

$code.=<<___;
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
//
// _vpaes_key_preheat
//
// Loads the constants used by the key schedule routines:
//   v16      0x5b in every byte (.Lk_s63)
//   v17      0x0f in every byte (.Lk_s0F)
//   v18-v21  64 bytes starting at .Lk_inv (.Lk_inv, .Lk_ipt)
//   v22-v23  32 bytes starting at .Lk_sb1
//   v24-v31  128 bytes starting at .Lk_dksd
//            (.Lk_dksd, .Lk_dksb, .Lk_dkse, .Lk_dks9)
//   v8       the 16 bytes that follow those 128 (.Lk_rcon)
//   v9       the first 16 bytes at .Lk_mc_forward
// Clobbers x10 and x11.
//
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v16.16b, #0x5b	// .Lk_s63
	adrp	x11, .Lk_sb1
	add	x11, x11, :lo12:.Lk_sb1
	movi	v17.16b, #0x0f	// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adrp	x10, .Lk_dksd
	add	x10, x10, :lo12:.Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]	// .Lk_sb1
	adrp	x11, .Lk_mc_forward
	add	x11, x11, :lo12:.Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]	// .Lk_rcon
	ld1	{v9.2d}, [x11]	// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

//
// _vpaes_schedule_core
//
// Expands a user key into the round key schedule.
//
// Inputs, as set up by vpaes_set_encrypt_key / vpaes_set_decrypt_key:
//   x0  user key bytes; reused as the loop counter once the key has
//       been loaded
//   w1  key size in bits, compared against 192 to pick the schedule
//   x2  where the round keys are written
//   w3  zero when encrypting, non-zero when decrypting
//   x8  offset that is added to the address of .Lk_sr on entry
//
// Saves and restores x29/x30. v0-v7 are zeroed before returning.
//
.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	.inst	0xd503233f	// paciasp
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat	// load the tables

	ld1	{v0.16b}, [$inp],#16	// vmovdqu (%rdi), %xmm0 # load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b	// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b	// vmovdqa %xmm0, %xmm7

	adrp	x10, .Lk_sr	// lea .Lk_sr(%rip),%r10
	add	x10, x10, :lo12:.Lk_sr
	// From here on x8 is an address: .Lk_sr plus the caller's offset.
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]	// vmovdqu %xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]	// vmovdqa (%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [$out]	// vmovdqu %xmm3, (%rdx)
	eor	x8, x8, #0x30	// xor \$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192	// cmp \$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10	// mov \$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1	// dec %esi
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// write output
	b	.Loop_schedule_128

//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
.align	4
.Lschedule_192:
	// The first key load post-incremented the pointer by 16, so step
	// back 8 to read key bytes 8..23.
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]	// vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b	// vmovdqa %xmm0, %xmm6 # save short part
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4 # clear 4
	ins	v6.d[0], v4.d[0]	// vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov	$inp, #4	// mov \$4, %esi

.Loop_schedule_192:
	sub	$inp, $inp, #1	// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	// save key n+1
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [$inp]	// vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7	// mov \$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1	// dec %esi
	bl	_vpaes_schedule_mangle	// output low result
	mov	v6.16b, v0.16b	// vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]	// vpshufd \$0xFF, %xmm0, %xmm0
	// _vpaes_schedule_low_round uses v4 as its zero source.
	movi	v4.16b, #0
	mov	v5.16b, v7.16b	// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b	// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b	// vmovdqa %xmm5, %xmm7

	b	.Loop_schedule_256

//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, .Lk_deskew	// lea .Lk_deskew(%rip),%r11 # prepare to deskew
	add	x11, x11, :lo12:.Lk_deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]	// vmovdqa (%r8,%r10),%xmm1
	adrp	x11, .Lk_opt	// lea .Lk_opt(%rip), %r11 # prepare to output transform
	add	x11, x11, :lo12:.Lk_opt
	add	$out, $out, #32	// add \$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
	// v20/v21 are the table pair read by _vpaes_schedule_transform;
	// replace them with the 32 bytes at x11 for the output transform.
	ld1	{v20.2d-v21.2d}, [x11]	// reload constants
	sub	$out, $out, #16	// add \$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]	// vmovdqu %xmm0, (%rdx) # save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b	// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b	// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b	// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b	// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b	// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b	// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	.inst	0xd50323bf	// autiasp
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
//   %xmm7: high side, b a x y
//   %xmm6: low side, d c 0 0
//   %xmm13: 0
//
// Outputs:
//   %xmm6: b+c+d b+c 0 0
//   %xmm0: b+c+d b+c b a
//
// In this port the high side is v7, the low side is v6 and the result
// copy is v0. No zero register is passed in: v1 is cleared locally and
// is left zero on return.
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	mov	v0.16b, v6.16b	// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
// In this port %xmm0 is v0, %xmm7 is v7 and the rcon register is v8.
// The routine writes v0-v4, v7 and v8; no general-purpose register is
// touched. The inversion tables are read from v18/v19, the sbox output
// tables from v22/v23, the 0x0F mask from v17 and the s63 constant
// from v16.
//
// _vpaes_schedule_low_round is a second entry point that skips the
// rcon and rotate steps. It uses v4 as the zero source for its byte
// shifts, so a caller entering there must clear v4 first.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0	// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]	// vpshufd \$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	eor	v7.16b, v7.16b, v4.16b	// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b	// vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	eor	v2.16b, v2.16b, v0.16b	// vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b	// vpxor %xmm7, %xmm1, %xmm0
	// Same xor again instead of a register copy; v7 ends up equal to v0.
	eor	v7.16b, v1.16b, v7.16b	// vmovdqa %xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
// In this port the value is in v0, the mask in v17, and the low and
// high tables are read from registers v20 and v21 rather than from
// memory. v1 and v2 are overwritten.
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	// vmovdqa (%r11), %xmm2 # lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	// vmovdqa 16(%r11), %xmm1 # hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
//   xor with 0x63
//   multiply by circulant 0,1,1,1
//   apply shiftrows transform
//
// On decrypt,
//   xor with 0x63
//   multiply by "inverse mixcolumns" circulant E,B,D,9
//   deskew
//   apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
// In this port xmm0 is v0 and the scratch registers are v1-v4. The
// .Lk_mc_forward permutation that the x86 code keeps in xmm5 is read
// from v9 here, so v5 is not written. The decrypt path reads its
// tables from v24-v31 and the 0x0F mask from v17; the encrypt path
// reads the s63 constant from v16. x8 holds the address of the
// current shiftrows permutation.
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b	// vmovdqa %xmm0, %xmm4 # save xmm0 for later
	// vmovdqa .Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16	// add \$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b	// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]	// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b	// vpxor %xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
	// lea .Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4	// vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	and	v4.16b, v4.16b, v17.16b	// vpand %xmm9, %xmm4, %xmm4 # 4 = lo

	// vmovdqa 0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	// vmovdqa 0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

	// vmovdqa 0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa 0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

	// vmovdqa 0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa 0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3

	// vmovdqa 0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
	// vmovdqa 0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]	// vmovdqa (%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm3

	sub	$out, $out, #16	// add \$-16, %rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	// Step x8 by +48 and clear bit 6. The x86 comments describe this
	// as subtracting 16 and masking with 0x30.
	add	x8, x8, #64-16	// add \$-16, %r8
	and	x8, x8, #~(1<<6)	// and \$0x30, %r8
	st1	{v3.2d}, [$out]	// vmovdqu %xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

//
// vpaes_set_encrypt_key
//
// Register arguments: x0 = user key, w1 = key size in bits,
// x2 = key structure to fill.
//
// Stores the round count, bits/32 + 5, at offset 240 of the key
// structure, then runs _vpaes_schedule_core in the encrypting
// direction (w3 = 0) with x8 = 0x30. Returns 0 in x0.
//
// d8/d9 are saved and restored around the call because the key
// preheat loads v8 and v9.
//
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	.inst	0xd503233f	// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5	// shr \$5,%eax
	add	w9, w9, #5	// add \$5,%eax
	str	w9, [$out,#240]	// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	mov	$dir, #0	// mov \$0,%ecx
	mov	x8, #0x30	// mov \$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf	// autiasp
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

//
// vpaes_set_decrypt_key
//
// Register arguments: x0 = user key, w1 = key size in bits,
// x2 = key structure to fill.
//
// Stores the round count, bits/32 + 5, at offset 240 of the key
// structure, then advances the output pointer by 16 + 16*rounds. The
// decrypting mangle steps move the pointer down by 16 per key, so the
// schedule is filled from the far end backwards. Runs
// _vpaes_schedule_core in the decrypting direction (w3 = 1), with x8
// set to 0 for 192-bit keys and 32 otherwise.
//
// x0 is not written after the call. The schedule core only reaches its
// exit once its loop counter in x0 is zero, so that zero is returned.
//
// d8/d9 are saved and restored around the call because the key
// preheat loads v8 and v9.
//
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	.inst	0xd503233f	// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5	// shr \$5,%eax
	add	w9, w9, #5	// add \$5,%eax
	str	w9, [$out,#240]	// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4	// shl \$4,%eax
	add	$out, $out, #16	// lea 16(%rdx,%rax),%rdx
	add	$out, $out, x9

	mov	$dir, #1	// mov \$1,%ecx
	lsr	w8, $bits, #1	// shr \$1,%r8d
	and	x8, x8, #32	// and \$32,%r8d
	eor	x8, x8, #32	// xor \$32,%r8d # nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf	// autiasp
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
___
}
{
# Names for registers x0-x5, interpolated into the CBC code that follows.
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

$code.=<<___;
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	$len, .Lcbc_abort
	cmp	w5, #0	// check direction
	b.eq	vpaes_cbc_decrypt

	.inst	0xd503233f	// paciasp
	stp	x29,x30,[sp,#-16]!
1136e1051a39Sopenharmony_ci add x29,sp,#0 1137e1051a39Sopenharmony_ci 1138e1051a39Sopenharmony_ci mov x17, $len // reassign 1139e1051a39Sopenharmony_ci mov x2, $key // reassign 1140e1051a39Sopenharmony_ci 1141e1051a39Sopenharmony_ci ld1 {v0.16b}, [$ivec] // load ivec 1142e1051a39Sopenharmony_ci bl _vpaes_encrypt_preheat 1143e1051a39Sopenharmony_ci b .Lcbc_enc_loop 1144e1051a39Sopenharmony_ci 1145e1051a39Sopenharmony_ci.align 4 1146e1051a39Sopenharmony_ci.Lcbc_enc_loop: 1147e1051a39Sopenharmony_ci ld1 {v7.16b}, [$inp],#16 // load input 1148e1051a39Sopenharmony_ci eor v7.16b, v7.16b, v0.16b // xor with ivec 1149e1051a39Sopenharmony_ci bl _vpaes_encrypt_core 1150e1051a39Sopenharmony_ci st1 {v0.16b}, [$out],#16 // save output 1151e1051a39Sopenharmony_ci subs x17, x17, #16 1152e1051a39Sopenharmony_ci b.hi .Lcbc_enc_loop 1153e1051a39Sopenharmony_ci 1154e1051a39Sopenharmony_ci st1 {v0.16b}, [$ivec] // write ivec 1155e1051a39Sopenharmony_ci 1156e1051a39Sopenharmony_ci ldp x29,x30,[sp],#16 1157e1051a39Sopenharmony_ci .inst 0xd50323bf // autiasp 1158e1051a39Sopenharmony_ci.Lcbc_abort: 1159e1051a39Sopenharmony_ci ret 1160e1051a39Sopenharmony_ci.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt 1161e1051a39Sopenharmony_ci 1162e1051a39Sopenharmony_ci.type vpaes_cbc_decrypt,%function 1163e1051a39Sopenharmony_ci.align 4 1164e1051a39Sopenharmony_civpaes_cbc_decrypt: 1165e1051a39Sopenharmony_ci .inst 0xd503233f // paciasp 1166e1051a39Sopenharmony_ci stp x29,x30,[sp,#-16]! 1167e1051a39Sopenharmony_ci add x29,sp,#0 1168e1051a39Sopenharmony_ci stp d8,d9,[sp,#-16]! // ABI spec says so 1169e1051a39Sopenharmony_ci stp d10,d11,[sp,#-16]! 1170e1051a39Sopenharmony_ci stp d12,d13,[sp,#-16]! 1171e1051a39Sopenharmony_ci stp d14,d15,[sp,#-16]! 
1172e1051a39Sopenharmony_ci 1173e1051a39Sopenharmony_ci mov x17, $len // reassign 1174e1051a39Sopenharmony_ci mov x2, $key // reassign 1175e1051a39Sopenharmony_ci ld1 {v6.16b}, [$ivec] // load ivec 1176e1051a39Sopenharmony_ci bl _vpaes_decrypt_preheat 1177e1051a39Sopenharmony_ci tst x17, #16 1178e1051a39Sopenharmony_ci b.eq .Lcbc_dec_loop2x 1179e1051a39Sopenharmony_ci 1180e1051a39Sopenharmony_ci ld1 {v7.16b}, [$inp], #16 // load input 1181e1051a39Sopenharmony_ci bl _vpaes_decrypt_core 1182e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v6.16b // xor with ivec 1183e1051a39Sopenharmony_ci orr v6.16b, v7.16b, v7.16b // next ivec value 1184e1051a39Sopenharmony_ci st1 {v0.16b}, [$out], #16 1185e1051a39Sopenharmony_ci subs x17, x17, #16 1186e1051a39Sopenharmony_ci b.ls .Lcbc_dec_done 1187e1051a39Sopenharmony_ci 1188e1051a39Sopenharmony_ci.align 4 1189e1051a39Sopenharmony_ci.Lcbc_dec_loop2x: 1190e1051a39Sopenharmony_ci ld1 {v14.16b,v15.16b}, [$inp], #32 1191e1051a39Sopenharmony_ci bl _vpaes_decrypt_2x 1192e1051a39Sopenharmony_ci eor v0.16b, v0.16b, v6.16b // xor with ivec 1193e1051a39Sopenharmony_ci eor v1.16b, v1.16b, v14.16b 1194e1051a39Sopenharmony_ci orr v6.16b, v15.16b, v15.16b 1195e1051a39Sopenharmony_ci st1 {v0.16b,v1.16b}, [$out], #32 1196e1051a39Sopenharmony_ci subs x17, x17, #32 1197e1051a39Sopenharmony_ci b.hi .Lcbc_dec_loop2x 1198e1051a39Sopenharmony_ci 1199e1051a39Sopenharmony_ci.Lcbc_dec_done: 1200e1051a39Sopenharmony_ci st1 {v6.16b}, [$ivec] 1201e1051a39Sopenharmony_ci 1202e1051a39Sopenharmony_ci ldp d14,d15,[sp],#16 1203e1051a39Sopenharmony_ci ldp d12,d13,[sp],#16 1204e1051a39Sopenharmony_ci ldp d10,d11,[sp],#16 1205e1051a39Sopenharmony_ci ldp d8,d9,[sp],#16 1206e1051a39Sopenharmony_ci ldp x29,x30,[sp],#16 1207e1051a39Sopenharmony_ci .inst 0xd50323bf // autiasp 1208e1051a39Sopenharmony_ci ret 1209e1051a39Sopenharmony_ci.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt 1210e1051a39Sopenharmony_ci___ 1211e1051a39Sopenharmony_ciif (1) { 
$code.=<<___;
// vpaes_ecb_encrypt
//   Encrypts the input in 16-byte blocks with no chaining.  Returns
//   immediately when the length is zero.  If bit 4 of the length is set,
//   one block goes through _vpaes_encrypt_core first; the remainder is
//   processed two blocks per iteration through _vpaes_encrypt_2x.
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	cbz	$len, .Lecb_enc_abort	// zero length: touch nothing

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
.Lecb_enc_abort:
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

// vpaes_ecb_decrypt
//   Decrypts the input in 16-byte blocks with no chaining.  Returns
//   immediately when the length is zero.  If bit 4 of the length is set,
//   one block goes through _vpaes_decrypt_core first; the remainder is
//   processed two blocks per iteration through _vpaes_decrypt_2x.
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	cbz	$len, .Lecb_dec_abort	// zero length: touch nothing

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core	// decrypt core, matching the decrypt preheat above
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
.Lecb_dec_abort:
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
___
} }

# Write out all of the assembly text accumulated in $code.
print $code;

# Buffered write errors only surface at close, so treat a failed close
# as fatal.
close(STDOUT) or die "error closing STDOUT: $!";