#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
30e1051a39Sopenharmony_ci# 31e1051a39Sopenharmony_ci# November 2017 32e1051a39Sopenharmony_ci# 33e1051a39Sopenharmony_ci# AArch64 register bank to "accommodate" 4x aggregated reduction and 34e1051a39Sopenharmony_ci# improve performance by 20-70% depending on processor. 35e1051a39Sopenharmony_ci# 36e1051a39Sopenharmony_ci# Current performance in cycles per processed byte: 37e1051a39Sopenharmony_ci# 38e1051a39Sopenharmony_ci# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) 39e1051a39Sopenharmony_ci# Apple A7 0.58 0.92 5.62 40e1051a39Sopenharmony_ci# Cortex-A53 0.85 1.01 8.39 41e1051a39Sopenharmony_ci# Cortex-A57 0.73 1.17 7.61 42e1051a39Sopenharmony_ci# Denver 0.51 0.65 6.02 43e1051a39Sopenharmony_ci# Mongoose 0.65 1.10 8.06 44e1051a39Sopenharmony_ci# Kryo 0.76 1.16 8.00 45e1051a39Sopenharmony_ci# ThunderX2 1.05 46e1051a39Sopenharmony_ci# 47e1051a39Sopenharmony_ci# (*) presented for reference/comparison purposes; 48e1051a39Sopenharmony_ci 49e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension) 50e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file 51e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 52e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; 53e1051a39Sopenharmony_ci 54e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 55e1051a39Sopenharmony_ci( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 56e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 57e1051a39Sopenharmony_cidie "can't locate arm-xlate.pl"; 58e1051a39Sopenharmony_ci 59e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" $xlate $flavour \"$output\"" 60e1051a39Sopenharmony_ci or die "can't call $xlate: $!"; 61e1051a39Sopenharmony_ci*STDOUT=*OUT; 62e1051a39Sopenharmony_ci 63e1051a39Sopenharmony_ci$Xi="x0"; # argument block 64e1051a39Sopenharmony_ci$Htbl="x1"; 65e1051a39Sopenharmony_ci$inp="x2"; 66e1051a39Sopenharmony_ci$len="x3"; 67e1051a39Sopenharmony_ci 68e1051a39Sopenharmony_ci$inc="x12"; 69e1051a39Sopenharmony_ci 70e1051a39Sopenharmony_ci{ 71e1051a39Sopenharmony_cimy ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); 72e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); 73e1051a39Sopenharmony_cimy $_byte = ($flavour =~ /win/ ? 
"DCB" : ".byte"); 74e1051a39Sopenharmony_ci 75e1051a39Sopenharmony_ci$code=<<___; 76e1051a39Sopenharmony_ci#include "arm_arch.h" 77e1051a39Sopenharmony_ci 78e1051a39Sopenharmony_ci#if __ARM_MAX_ARCH__>=7 79e1051a39Sopenharmony_ci___ 80e1051a39Sopenharmony_ci$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 81e1051a39Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 82e1051a39Sopenharmony_ci.fpu neon 83e1051a39Sopenharmony_ci#ifdef __thumb2__ 84e1051a39Sopenharmony_ci.syntax unified 85e1051a39Sopenharmony_ci.thumb 86e1051a39Sopenharmony_ci# define INST(a,b,c,d) $_byte c,0xef,a,b 87e1051a39Sopenharmony_ci#else 88e1051a39Sopenharmony_ci.code 32 89e1051a39Sopenharmony_ci# define INST(a,b,c,d) $_byte a,b,c,0xf2 90e1051a39Sopenharmony_ci#endif 91e1051a39Sopenharmony_ci 92e1051a39Sopenharmony_ci.text 93e1051a39Sopenharmony_ci___ 94e1051a39Sopenharmony_ci 95e1051a39Sopenharmony_ci################################################################################ 96e1051a39Sopenharmony_ci# void gcm_init_v8(u128 Htable[16],const u64 H[2]); 97e1051a39Sopenharmony_ci# 98e1051a39Sopenharmony_ci# input: 128-bit H - secret parameter E(K,0^128) 99e1051a39Sopenharmony_ci# output: precomputed table filled with degrees of twisted H; 100e1051a39Sopenharmony_ci# H is twisted to handle reverse bitness of GHASH; 101e1051a39Sopenharmony_ci# only few of 16 slots of Htable[16] are used; 102e1051a39Sopenharmony_ci# data is opaque to outside world (which allows to 103e1051a39Sopenharmony_ci# optimize the code independently); 104e1051a39Sopenharmony_ci# 105e1051a39Sopenharmony_ci$code.=<<___; 106e1051a39Sopenharmony_ci.global gcm_init_v8 107e1051a39Sopenharmony_ci.type gcm_init_v8,%function 108e1051a39Sopenharmony_ci.align 4 109e1051a39Sopenharmony_cigcm_init_v8: 110e1051a39Sopenharmony_ci vld1.64 {$t1},[x1] @ load input H 111e1051a39Sopenharmony_ci vmov.i8 $xC2,#0xe1 112e1051a39Sopenharmony_ci vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 113e1051a39Sopenharmony_ci vext.8 $IN,$t1,$t1,#8 
114e1051a39Sopenharmony_ci vshr.u64 $t2,$xC2,#63 115e1051a39Sopenharmony_ci vdup.32 $t1,${t1}[1] 116e1051a39Sopenharmony_ci vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 117e1051a39Sopenharmony_ci vshr.u64 $t2,$IN,#63 118e1051a39Sopenharmony_ci vshr.s32 $t1,$t1,#31 @ broadcast carry bit 119e1051a39Sopenharmony_ci vand $t2,$t2,$t0 120e1051a39Sopenharmony_ci vshl.i64 $IN,$IN,#1 121e1051a39Sopenharmony_ci vext.8 $t2,$t2,$t2,#8 122e1051a39Sopenharmony_ci vand $t0,$t0,$t1 123e1051a39Sopenharmony_ci vorr $IN,$IN,$t2 @ H<<<=1 124e1051a39Sopenharmony_ci veor $H,$IN,$t0 @ twisted H 125e1051a39Sopenharmony_ci vst1.64 {$H},[x0],#16 @ store Htable[0] 126e1051a39Sopenharmony_ci 127e1051a39Sopenharmony_ci @ calculate H^2 128e1051a39Sopenharmony_ci vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing 129e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H,$H 130e1051a39Sopenharmony_ci veor $t0,$t0,$H 131e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H,$H 132e1051a39Sopenharmony_ci vpmull.p64 $Xm,$t0,$t0 133e1051a39Sopenharmony_ci 134e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 135e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 136e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 137e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 138e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 139e1051a39Sopenharmony_ci 140e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 141e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 142e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 143e1051a39Sopenharmony_ci 144e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 145e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 146e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 147e1051a39Sopenharmony_ci veor $H2,$Xl,$t2 148e1051a39Sopenharmony_ci 149e1051a39Sopenharmony_ci vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing 150e1051a39Sopenharmony_ci veor $t1,$t1,$H2 151e1051a39Sopenharmony_ci vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed 152e1051a39Sopenharmony_ci vst1.64 {$Hhl-$H2},[x0],#32 @ store 
Htable[1..2] 153e1051a39Sopenharmony_ci___ 154e1051a39Sopenharmony_ciif ($flavour =~ /64/) { 155e1051a39Sopenharmony_cimy ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); 156e1051a39Sopenharmony_ci 157e1051a39Sopenharmony_ci$code.=<<___; 158e1051a39Sopenharmony_ci @ calculate H^3 and H^4 159e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H, $H2 160e1051a39Sopenharmony_ci vpmull.p64 $Yl,$H2,$H2 161e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H, $H2 162e1051a39Sopenharmony_ci vpmull2.p64 $Yh,$H2,$H2 163e1051a39Sopenharmony_ci vpmull.p64 $Xm,$t0,$t1 164e1051a39Sopenharmony_ci vpmull.p64 $Ym,$t1,$t1 165e1051a39Sopenharmony_ci 166e1051a39Sopenharmony_ci vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing 167e1051a39Sopenharmony_ci vext.8 $t1,$Yl,$Yh,#8 168e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 169e1051a39Sopenharmony_ci veor $Xm,$Xm,$t0 170e1051a39Sopenharmony_ci veor $t3,$Yl,$Yh 171e1051a39Sopenharmony_ci veor $Ym,$Ym,$t1 172e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 173e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 174e1051a39Sopenharmony_ci veor $Ym,$Ym,$t3 175e1051a39Sopenharmony_ci vpmull.p64 $t3,$Yl,$xC2 176e1051a39Sopenharmony_ci 177e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 178e1051a39Sopenharmony_ci vmov $Yh#lo,$Ym#hi 179e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 180e1051a39Sopenharmony_ci vmov $Ym#hi,$Yl#lo 181e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 182e1051a39Sopenharmony_ci veor $Yl,$Ym,$t3 183e1051a39Sopenharmony_ci 184e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 185e1051a39Sopenharmony_ci vext.8 $t3,$Yl,$Yl,#8 186e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 187e1051a39Sopenharmony_ci vpmull.p64 $Yl,$Yl,$xC2 188e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 189e1051a39Sopenharmony_ci veor $t3,$t3,$Yh 190e1051a39Sopenharmony_ci veor $H, $Xl,$t2 @ H^3 191e1051a39Sopenharmony_ci veor $H2,$Yl,$t3 @ H^4 192e1051a39Sopenharmony_ci 193e1051a39Sopenharmony_ci vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing 
194e1051a39Sopenharmony_ci vext.8 $t1,$H2,$H2,#8 195e1051a39Sopenharmony_ci veor $t0,$t0,$H 196e1051a39Sopenharmony_ci veor $t1,$t1,$H2 197e1051a39Sopenharmony_ci vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed 198e1051a39Sopenharmony_ci vst1.64 {$H-$H2},[x0] @ store Htable[3..5] 199e1051a39Sopenharmony_ci___ 200e1051a39Sopenharmony_ci} 201e1051a39Sopenharmony_ci$code.=<<___; 202e1051a39Sopenharmony_ci ret 203e1051a39Sopenharmony_ci.size gcm_init_v8,.-gcm_init_v8 204e1051a39Sopenharmony_ci___ 205e1051a39Sopenharmony_ci################################################################################ 206e1051a39Sopenharmony_ci# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); 207e1051a39Sopenharmony_ci# 208e1051a39Sopenharmony_ci# input: Xi - current hash value; 209e1051a39Sopenharmony_ci# Htable - table precomputed in gcm_init_v8; 210e1051a39Sopenharmony_ci# output: Xi - next hash value Xi; 211e1051a39Sopenharmony_ci# 212e1051a39Sopenharmony_ci$code.=<<___; 213e1051a39Sopenharmony_ci.global gcm_gmult_v8 214e1051a39Sopenharmony_ci.type gcm_gmult_v8,%function 215e1051a39Sopenharmony_ci.align 4 216e1051a39Sopenharmony_cigcm_gmult_v8: 217e1051a39Sopenharmony_ci vld1.64 {$t1},[$Xi] @ load Xi 218e1051a39Sopenharmony_ci vmov.i8 $xC2,#0xe1 219e1051a39Sopenharmony_ci vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... 
220e1051a39Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 221e1051a39Sopenharmony_ci#ifndef __ARMEB__ 222e1051a39Sopenharmony_ci vrev64.8 $t1,$t1 223e1051a39Sopenharmony_ci#endif 224e1051a39Sopenharmony_ci vext.8 $IN,$t1,$t1,#8 225e1051a39Sopenharmony_ci 226e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 227e1051a39Sopenharmony_ci veor $t1,$t1,$IN @ Karatsuba pre-processing 228e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 229e1051a39Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 230e1051a39Sopenharmony_ci 231e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 232e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 233e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 234e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 235e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 236e1051a39Sopenharmony_ci 237e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 238e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 239e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 240e1051a39Sopenharmony_ci 241e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 242e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 243e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 244e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 245e1051a39Sopenharmony_ci 246e1051a39Sopenharmony_ci#ifndef __ARMEB__ 247e1051a39Sopenharmony_ci vrev64.8 $Xl,$Xl 248e1051a39Sopenharmony_ci#endif 249e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 250e1051a39Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 251e1051a39Sopenharmony_ci 252e1051a39Sopenharmony_ci ret 253e1051a39Sopenharmony_ci.size gcm_gmult_v8,.-gcm_gmult_v8 254e1051a39Sopenharmony_ci___ 255e1051a39Sopenharmony_ci################################################################################ 256e1051a39Sopenharmony_ci# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); 257e1051a39Sopenharmony_ci# 258e1051a39Sopenharmony_ci# input: table precomputed in gcm_init_v8; 
259e1051a39Sopenharmony_ci# current hash value Xi; 260e1051a39Sopenharmony_ci# pointer to input data; 261e1051a39Sopenharmony_ci# length of input data in bytes, but divisible by block size; 262e1051a39Sopenharmony_ci# output: next hash value Xi; 263e1051a39Sopenharmony_ci# 264e1051a39Sopenharmony_ci$code.=<<___; 265e1051a39Sopenharmony_ci.global gcm_ghash_v8 266e1051a39Sopenharmony_ci.type gcm_ghash_v8,%function 267e1051a39Sopenharmony_ci.align 4 268e1051a39Sopenharmony_cigcm_ghash_v8: 269e1051a39Sopenharmony_ci___ 270e1051a39Sopenharmony_ci$code.=<<___ if ($flavour =~ /64/); 271e1051a39Sopenharmony_ci cmp $len,#64 272e1051a39Sopenharmony_ci b.hs .Lgcm_ghash_v8_4x 273e1051a39Sopenharmony_ci___ 274e1051a39Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 275e1051a39Sopenharmony_ci vstmdb sp!,{d8-d15} @ 32-bit ABI says so 276e1051a39Sopenharmony_ci___ 277e1051a39Sopenharmony_ci$code.=<<___; 278e1051a39Sopenharmony_ci vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 279e1051a39Sopenharmony_ci @ "[rotated]" means that 280e1051a39Sopenharmony_ci @ loaded value would have 281e1051a39Sopenharmony_ci @ to be rotated in order to 282e1051a39Sopenharmony_ci @ make it appear as in 283e1051a39Sopenharmony_ci @ algorithm specification 284e1051a39Sopenharmony_ci subs $len,$len,#32 @ see if $len is 32 or larger 285e1051a39Sopenharmony_ci mov $inc,#16 @ $inc is used as post- 286e1051a39Sopenharmony_ci @ increment for input pointer; 287e1051a39Sopenharmony_ci @ as loop is modulo-scheduled 288e1051a39Sopenharmony_ci @ $inc is zeroed just in time 289e1051a39Sopenharmony_ci @ to preclude overstepping 290e1051a39Sopenharmony_ci @ inp[len], which means that 291e1051a39Sopenharmony_ci @ last block[s] are actually 292e1051a39Sopenharmony_ci @ loaded twice, but last 293e1051a39Sopenharmony_ci @ copy is not processed 294e1051a39Sopenharmony_ci vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 295e1051a39Sopenharmony_ci vmov.i8 $xC2,#0xe1 296e1051a39Sopenharmony_ci vld1.64 {$H2},[$Htbl] 
297e1051a39Sopenharmony_ci cclr $inc,eq @ is it time to zero $inc? 298e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi 299e1051a39Sopenharmony_ci vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] 300e1051a39Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 301e1051a39Sopenharmony_ci#ifndef __ARMEB__ 302e1051a39Sopenharmony_ci vrev64.8 $t0,$t0 303e1051a39Sopenharmony_ci vrev64.8 $Xl,$Xl 304e1051a39Sopenharmony_ci#endif 305e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 @ rotate I[0] 306e1051a39Sopenharmony_ci b.lo .Lodd_tail_v8 @ $len was less than 32 307e1051a39Sopenharmony_ci___ 308e1051a39Sopenharmony_ci{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); 309e1051a39Sopenharmony_ci ####### 310e1051a39Sopenharmony_ci # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 311e1051a39Sopenharmony_ci # [(H*Ii+1) + (H*Xi+1)] mod P = 312e1051a39Sopenharmony_ci # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 313e1051a39Sopenharmony_ci # 314e1051a39Sopenharmony_ci$code.=<<___; 315e1051a39Sopenharmony_ci vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] 316e1051a39Sopenharmony_ci#ifndef __ARMEB__ 317e1051a39Sopenharmony_ci vrev64.8 $t1,$t1 318e1051a39Sopenharmony_ci#endif 319e1051a39Sopenharmony_ci vext.8 $In,$t1,$t1,#8 320e1051a39Sopenharmony_ci veor $IN,$IN,$Xl @ I[i]^=Xi 321e1051a39Sopenharmony_ci vpmull.p64 $Xln,$H,$In @ H·Ii+1 322e1051a39Sopenharmony_ci veor $t1,$t1,$In @ Karatsuba pre-processing 323e1051a39Sopenharmony_ci vpmull2.p64 $Xhn,$H,$In 324e1051a39Sopenharmony_ci b .Loop_mod2x_v8 325e1051a39Sopenharmony_ci 326e1051a39Sopenharmony_ci.align 4 327e1051a39Sopenharmony_ci.Loop_mod2x_v8: 328e1051a39Sopenharmony_ci vext.8 $t2,$IN,$IN,#8 329e1051a39Sopenharmony_ci subs $len,$len,#32 @ is there more data? 330e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo 331e1051a39Sopenharmony_ci cclr $inc,lo @ is it time to zero $inc? 
332e1051a39Sopenharmony_ci 333e1051a39Sopenharmony_ci vpmull.p64 $Xmn,$Hhl,$t1 334e1051a39Sopenharmony_ci veor $t2,$t2,$IN @ Karatsuba pre-processing 335e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi 336e1051a39Sopenharmony_ci veor $Xl,$Xl,$Xln @ accumulate 337e1051a39Sopenharmony_ci vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) 338e1051a39Sopenharmony_ci vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2] 339e1051a39Sopenharmony_ci 340e1051a39Sopenharmony_ci veor $Xh,$Xh,$Xhn 341e1051a39Sopenharmony_ci cclr $inc,eq @ is it time to zero $inc? 342e1051a39Sopenharmony_ci veor $Xm,$Xm,$Xmn 343e1051a39Sopenharmony_ci 344e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 345e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 346e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 347e1051a39Sopenharmony_ci vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3] 348e1051a39Sopenharmony_ci#ifndef __ARMEB__ 349e1051a39Sopenharmony_ci vrev64.8 $t0,$t0 350e1051a39Sopenharmony_ci#endif 351e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 352e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 353e1051a39Sopenharmony_ci 354e1051a39Sopenharmony_ci#ifndef __ARMEB__ 355e1051a39Sopenharmony_ci vrev64.8 $t1,$t1 356e1051a39Sopenharmony_ci#endif 357e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 358e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 359e1051a39Sopenharmony_ci vext.8 $In,$t1,$t1,#8 360e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 361e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 362e1051a39Sopenharmony_ci vpmull.p64 $Xln,$H,$In @ H·Ii+1 363e1051a39Sopenharmony_ci veor $IN,$IN,$Xh @ accumulate $IN early 364e1051a39Sopenharmony_ci 365e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 366e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 367e1051a39Sopenharmony_ci veor $IN,$IN,$t2 368e1051a39Sopenharmony_ci veor $t1,$t1,$In @ Karatsuba pre-processing 369e1051a39Sopenharmony_ci veor $IN,$IN,$Xl 
370e1051a39Sopenharmony_ci vpmull2.p64 $Xhn,$H,$In 371e1051a39Sopenharmony_ci b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes 372e1051a39Sopenharmony_ci 373e1051a39Sopenharmony_ci veor $Xh,$Xh,$t2 374e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 @ re-construct $IN 375e1051a39Sopenharmony_ci adds $len,$len,#32 @ re-construct $len 376e1051a39Sopenharmony_ci veor $Xl,$Xl,$Xh @ re-construct $Xl 377e1051a39Sopenharmony_ci b.eq .Ldone_v8 @ is $len zero? 378e1051a39Sopenharmony_ci___ 379e1051a39Sopenharmony_ci} 380e1051a39Sopenharmony_ci$code.=<<___; 381e1051a39Sopenharmony_ci.Lodd_tail_v8: 382e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 383e1051a39Sopenharmony_ci veor $IN,$IN,$Xl @ inp^=Xi 384e1051a39Sopenharmony_ci veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi 385e1051a39Sopenharmony_ci 386e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 387e1051a39Sopenharmony_ci veor $t1,$t1,$IN @ Karatsuba pre-processing 388e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 389e1051a39Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 390e1051a39Sopenharmony_ci 391e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 392e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 393e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 394e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 395e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 396e1051a39Sopenharmony_ci 397e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 398e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 399e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 400e1051a39Sopenharmony_ci 401e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 402e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 403e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 404e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 405e1051a39Sopenharmony_ci 406e1051a39Sopenharmony_ci.Ldone_v8: 407e1051a39Sopenharmony_ci#ifndef __ARMEB__ 408e1051a39Sopenharmony_ci vrev64.8 $Xl,$Xl 
409e1051a39Sopenharmony_ci#endif 410e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 411e1051a39Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 412e1051a39Sopenharmony_ci 413e1051a39Sopenharmony_ci___ 414e1051a39Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 415e1051a39Sopenharmony_ci vldmia sp!,{d8-d15} @ 32-bit ABI says so 416e1051a39Sopenharmony_ci___ 417e1051a39Sopenharmony_ci$code.=<<___; 418e1051a39Sopenharmony_ci ret 419e1051a39Sopenharmony_ci.size gcm_ghash_v8,.-gcm_ghash_v8 420e1051a39Sopenharmony_ci___ 421e1051a39Sopenharmony_ci 422e1051a39Sopenharmony_ciif ($flavour =~ /64/) { # 4x subroutine 423e1051a39Sopenharmony_cimy ($I0,$j1,$j2,$j3, 424e1051a39Sopenharmony_ci $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); 425e1051a39Sopenharmony_ci 426e1051a39Sopenharmony_ci$code.=<<___; 427e1051a39Sopenharmony_ci.type gcm_ghash_v8_4x,%function 428e1051a39Sopenharmony_ci.align 4 429e1051a39Sopenharmony_cigcm_ghash_v8_4x: 430e1051a39Sopenharmony_ci.Lgcm_ghash_v8_4x: 431e1051a39Sopenharmony_ci vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 432e1051a39Sopenharmony_ci vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 433e1051a39Sopenharmony_ci vmov.i8 $xC2,#0xe1 434e1051a39Sopenharmony_ci vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 435e1051a39Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 436e1051a39Sopenharmony_ci 437e1051a39Sopenharmony_ci vld1.64 {$I0-$j3},[$inp],#64 438e1051a39Sopenharmony_ci#ifndef __ARMEB__ 439e1051a39Sopenharmony_ci vrev64.8 $Xl,$Xl 440e1051a39Sopenharmony_ci vrev64.8 $j1,$j1 441e1051a39Sopenharmony_ci vrev64.8 $j2,$j2 442e1051a39Sopenharmony_ci vrev64.8 $j3,$j3 443e1051a39Sopenharmony_ci vrev64.8 $I0,$I0 444e1051a39Sopenharmony_ci#endif 445e1051a39Sopenharmony_ci vext.8 $I3,$j3,$j3,#8 446e1051a39Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 447e1051a39Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 448e1051a39Sopenharmony_ci 449e1051a39Sopenharmony_ci vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 450e1051a39Sopenharmony_ci 
veor $j3,$j3,$I3 451e1051a39Sopenharmony_ci vpmull2.p64 $Yh,$H,$I3 452e1051a39Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j3 453e1051a39Sopenharmony_ci 454e1051a39Sopenharmony_ci vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 455e1051a39Sopenharmony_ci veor $j2,$j2,$I2 456e1051a39Sopenharmony_ci vpmull2.p64 $I2,$H2,$I2 457e1051a39Sopenharmony_ci vpmull2.p64 $j2,$Hhl,$j2 458e1051a39Sopenharmony_ci 459e1051a39Sopenharmony_ci veor $Yl,$Yl,$t0 460e1051a39Sopenharmony_ci veor $Yh,$Yh,$I2 461e1051a39Sopenharmony_ci veor $Ym,$Ym,$j2 462e1051a39Sopenharmony_ci 463e1051a39Sopenharmony_ci vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 464e1051a39Sopenharmony_ci veor $j1,$j1,$I1 465e1051a39Sopenharmony_ci vpmull2.p64 $I1,$H3,$I1 466e1051a39Sopenharmony_ci vpmull.p64 $j1,$H34,$j1 467e1051a39Sopenharmony_ci 468e1051a39Sopenharmony_ci veor $Yl,$Yl,$j3 469e1051a39Sopenharmony_ci veor $Yh,$Yh,$I1 470e1051a39Sopenharmony_ci veor $Ym,$Ym,$j1 471e1051a39Sopenharmony_ci 472e1051a39Sopenharmony_ci subs $len,$len,#128 473e1051a39Sopenharmony_ci b.lo .Ltail4x 474e1051a39Sopenharmony_ci 475e1051a39Sopenharmony_ci b .Loop4x 476e1051a39Sopenharmony_ci 477e1051a39Sopenharmony_ci.align 4 478e1051a39Sopenharmony_ci.Loop4x: 479e1051a39Sopenharmony_ci veor $t0,$I0,$Xl 480e1051a39Sopenharmony_ci vld1.64 {$I0-$j3},[$inp],#64 481e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 482e1051a39Sopenharmony_ci#ifndef __ARMEB__ 483e1051a39Sopenharmony_ci vrev64.8 $j1,$j1 484e1051a39Sopenharmony_ci vrev64.8 $j2,$j2 485e1051a39Sopenharmony_ci vrev64.8 $j3,$j3 486e1051a39Sopenharmony_ci vrev64.8 $I0,$I0 487e1051a39Sopenharmony_ci#endif 488e1051a39Sopenharmony_ci 489e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 490e1051a39Sopenharmony_ci veor $t0,$t0,$IN 491e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H4,$IN 492e1051a39Sopenharmony_ci vext.8 $I3,$j3,$j3,#8 493e1051a39Sopenharmony_ci vpmull2.p64 $Xm,$H34,$t0 494e1051a39Sopenharmony_ci 495e1051a39Sopenharmony_ci veor $Xl,$Xl,$Yl 496e1051a39Sopenharmony_ci veor $Xh,$Xh,$Yh 
497e1051a39Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 498e1051a39Sopenharmony_ci veor $Xm,$Xm,$Ym 499e1051a39Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 500e1051a39Sopenharmony_ci 501e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 502e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 503e1051a39Sopenharmony_ci vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 504e1051a39Sopenharmony_ci veor $j3,$j3,$I3 505e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 506e1051a39Sopenharmony_ci vpmull2.p64 $Yh,$H,$I3 507e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 508e1051a39Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j3 509e1051a39Sopenharmony_ci 510e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 511e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 512e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 513e1051a39Sopenharmony_ci vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 514e1051a39Sopenharmony_ci veor $j2,$j2,$I2 515e1051a39Sopenharmony_ci vpmull2.p64 $I2,$H2,$I2 516e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 517e1051a39Sopenharmony_ci vpmull2.p64 $j2,$Hhl,$j2 518e1051a39Sopenharmony_ci 519e1051a39Sopenharmony_ci veor $Yl,$Yl,$t0 520e1051a39Sopenharmony_ci veor $Yh,$Yh,$I2 521e1051a39Sopenharmony_ci veor $Ym,$Ym,$j2 522e1051a39Sopenharmony_ci 523e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 524e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 525e1051a39Sopenharmony_ci vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 526e1051a39Sopenharmony_ci veor $j1,$j1,$I1 527e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 528e1051a39Sopenharmony_ci vpmull2.p64 $I1,$H3,$I1 529e1051a39Sopenharmony_ci vpmull.p64 $j1,$H34,$j1 530e1051a39Sopenharmony_ci 531e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 532e1051a39Sopenharmony_ci veor $Yl,$Yl,$j3 533e1051a39Sopenharmony_ci veor $Yh,$Yh,$I1 534e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 535e1051a39Sopenharmony_ci veor $Ym,$Ym,$j1 536e1051a39Sopenharmony_ci 537e1051a39Sopenharmony_ci subs $len,$len,#64 538e1051a39Sopenharmony_ci b.hs .Loop4x 
539e1051a39Sopenharmony_ci 540e1051a39Sopenharmony_ci.Ltail4x: 541e1051a39Sopenharmony_ci veor $t0,$I0,$Xl 542e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 543e1051a39Sopenharmony_ci 544e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 545e1051a39Sopenharmony_ci veor $t0,$t0,$IN 546e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H4,$IN 547e1051a39Sopenharmony_ci vpmull2.p64 $Xm,$H34,$t0 548e1051a39Sopenharmony_ci 549e1051a39Sopenharmony_ci veor $Xl,$Xl,$Yl 550e1051a39Sopenharmony_ci veor $Xh,$Xh,$Yh 551e1051a39Sopenharmony_ci veor $Xm,$Xm,$Ym 552e1051a39Sopenharmony_ci 553e1051a39Sopenharmony_ci adds $len,$len,#64 554e1051a39Sopenharmony_ci b.eq .Ldone4x 555e1051a39Sopenharmony_ci 556e1051a39Sopenharmony_ci cmp $len,#32 557e1051a39Sopenharmony_ci b.lo .Lone 558e1051a39Sopenharmony_ci b.eq .Ltwo 559e1051a39Sopenharmony_ci.Lthree: 560e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 561e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 562e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 563e1051a39Sopenharmony_ci vld1.64 {$I0-$j2},[$inp] 564e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 565e1051a39Sopenharmony_ci#ifndef __ARMEB__ 566e1051a39Sopenharmony_ci vrev64.8 $j1,$j1 567e1051a39Sopenharmony_ci vrev64.8 $j2,$j2 568e1051a39Sopenharmony_ci vrev64.8 $I0,$I0 569e1051a39Sopenharmony_ci#endif 570e1051a39Sopenharmony_ci 571e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 572e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 573e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 574e1051a39Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 575e1051a39Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 576e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 577e1051a39Sopenharmony_ci 578e1051a39Sopenharmony_ci vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 579e1051a39Sopenharmony_ci veor $j2,$j2,$I2 580e1051a39Sopenharmony_ci 581e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 582e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 
583e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 584e1051a39Sopenharmony_ci vpmull2.p64 $Yh,$H,$I2 585e1051a39Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j2 586e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 587e1051a39Sopenharmony_ci vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 588e1051a39Sopenharmony_ci veor $j1,$j1,$I1 589e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 590e1051a39Sopenharmony_ci 591e1051a39Sopenharmony_ci vpmull2.p64 $I1,$H2,$I1 592e1051a39Sopenharmony_ci veor $t0,$I0,$Xl 593e1051a39Sopenharmony_ci vpmull2.p64 $j1,$Hhl,$j1 594e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 595e1051a39Sopenharmony_ci 596e1051a39Sopenharmony_ci veor $Yl,$Yl,$j3 597e1051a39Sopenharmony_ci veor $Yh,$Yh,$I1 598e1051a39Sopenharmony_ci veor $Ym,$Ym,$j1 599e1051a39Sopenharmony_ci 600e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) 601e1051a39Sopenharmony_ci veor $t0,$t0,$IN 602e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H3,$IN 603e1051a39Sopenharmony_ci vpmull.p64 $Xm,$H34,$t0 604e1051a39Sopenharmony_ci 605e1051a39Sopenharmony_ci veor $Xl,$Xl,$Yl 606e1051a39Sopenharmony_ci veor $Xh,$Xh,$Yh 607e1051a39Sopenharmony_ci veor $Xm,$Xm,$Ym 608e1051a39Sopenharmony_ci b .Ldone4x 609e1051a39Sopenharmony_ci 610e1051a39Sopenharmony_ci.align 4 611e1051a39Sopenharmony_ci.Ltwo: 612e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 613e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 614e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 615e1051a39Sopenharmony_ci vld1.64 {$I0-$j1},[$inp] 616e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 617e1051a39Sopenharmony_ci#ifndef __ARMEB__ 618e1051a39Sopenharmony_ci vrev64.8 $j1,$j1 619e1051a39Sopenharmony_ci vrev64.8 $I0,$I0 620e1051a39Sopenharmony_ci#endif 621e1051a39Sopenharmony_ci 622e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 623e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 624e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 625e1051a39Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 626e1051a39Sopenharmony_ci veor 
$Xl,$Xm,$t2 627e1051a39Sopenharmony_ci 628e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 629e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 630e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 631e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 632e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 633e1051a39Sopenharmony_ci 634e1051a39Sopenharmony_ci vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 635e1051a39Sopenharmony_ci veor $j1,$j1,$I1 636e1051a39Sopenharmony_ci 637e1051a39Sopenharmony_ci veor $t0,$I0,$Xl 638e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 639e1051a39Sopenharmony_ci 640e1051a39Sopenharmony_ci vpmull2.p64 $Yh,$H,$I1 641e1051a39Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j1 642e1051a39Sopenharmony_ci 643e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) 644e1051a39Sopenharmony_ci veor $t0,$t0,$IN 645e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H2,$IN 646e1051a39Sopenharmony_ci vpmull2.p64 $Xm,$Hhl,$t0 647e1051a39Sopenharmony_ci 648e1051a39Sopenharmony_ci veor $Xl,$Xl,$Yl 649e1051a39Sopenharmony_ci veor $Xh,$Xh,$Yh 650e1051a39Sopenharmony_ci veor $Xm,$Xm,$Ym 651e1051a39Sopenharmony_ci b .Ldone4x 652e1051a39Sopenharmony_ci 653e1051a39Sopenharmony_ci.align 4 654e1051a39Sopenharmony_ci.Lone: 655e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 656e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 657e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 658e1051a39Sopenharmony_ci vld1.64 {$I0},[$inp] 659e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 660e1051a39Sopenharmony_ci#ifndef __ARMEB__ 661e1051a39Sopenharmony_ci vrev64.8 $I0,$I0 662e1051a39Sopenharmony_ci#endif 663e1051a39Sopenharmony_ci 664e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 665e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 666e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 667e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 668e1051a39Sopenharmony_ci 669e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 670e1051a39Sopenharmony_ci 
vpmull.p64 $Xl,$Xl,$xC2 671e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 672e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 673e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 674e1051a39Sopenharmony_ci 675e1051a39Sopenharmony_ci veor $t0,$I0,$Xl 676e1051a39Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 677e1051a39Sopenharmony_ci 678e1051a39Sopenharmony_ci vpmull.p64 $Xl,$H,$IN 679e1051a39Sopenharmony_ci veor $t0,$t0,$IN 680e1051a39Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN 681e1051a39Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t0 682e1051a39Sopenharmony_ci 683e1051a39Sopenharmony_ci.Ldone4x: 684e1051a39Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 685e1051a39Sopenharmony_ci veor $t2,$Xl,$Xh 686e1051a39Sopenharmony_ci veor $Xm,$Xm,$t1 687e1051a39Sopenharmony_ci veor $Xm,$Xm,$t2 688e1051a39Sopenharmony_ci 689e1051a39Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 690e1051a39Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 691e1051a39Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 692e1051a39Sopenharmony_ci veor $Xl,$Xm,$t2 693e1051a39Sopenharmony_ci 694e1051a39Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 695e1051a39Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 696e1051a39Sopenharmony_ci veor $t2,$t2,$Xh 697e1051a39Sopenharmony_ci veor $Xl,$Xl,$t2 698e1051a39Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 699e1051a39Sopenharmony_ci 700e1051a39Sopenharmony_ci#ifndef __ARMEB__ 701e1051a39Sopenharmony_ci vrev64.8 $Xl,$Xl 702e1051a39Sopenharmony_ci#endif 703e1051a39Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 704e1051a39Sopenharmony_ci 705e1051a39Sopenharmony_ci ret 706e1051a39Sopenharmony_ci.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x 707e1051a39Sopenharmony_ci___ 708e1051a39Sopenharmony_ci 709e1051a39Sopenharmony_ci} 710e1051a39Sopenharmony_ci} 711e1051a39Sopenharmony_ci 712e1051a39Sopenharmony_ci$code.=<<___; 713e1051a39Sopenharmony_ci.rodata 714e1051a39Sopenharmony_ci.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 
.align 2
#endif
___

# Translate the assembly accumulated in $code, one line at a time, into the
# syntax selected by $flavour and print the result to STDOUT.  Named subs
# are compiled no matter which branch runs; each branch only calls its own.
if ($flavour =~ /64/) {			######## 64-bit code
    # Rewrite the operands of "vmov qN#lo|hi,qM#lo|hi" as
    # "ins vN.d[i],vM.d[j]" (lo selects d[0], hi selects d[1]).  Register
    # numbers of 8 and above are moved up by 8.  Evaluates to a false
    # value when the operands do not have that shape.
    sub unvmov {
        my ($operands) = @_;
        my %lane = (lo => 0, hi => 1);

        return $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/
            && sprintf("ins v%d.d[%d],v%d.d[%d]",
                       ($1 < 8 ? $1 : $1 + 8), $lane{$2},
                       ($3 < 8 ? $3 : $3 + 8), $lane{$4});
    }

    for my $line (split /\n/, $code) {
        # Mnemonic rules are tried in order; the chain stops at the first
        # substitution that succeeds.
        $line =~ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/
            or $line =~ s/vmov\.i8/movi/		# fix up legacy mnemonics
            or $line =~ s/vmov\s+(.*)/unvmov($1)/ge
            or $line =~ s/vext\.8/ext/
            or $line =~ s/vshr\.s/sshr\.s/
            or $line =~ s/vshr/ushr/
            or $line =~ s/^(\s+)v/$1/		# strip off v prefix
            or $line =~ s/\bbx\s+lr\b/ret/;

        # old->new registers: every qN becomes vN.16b (N+8 for N >= 8)
        $line =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;
        # old->new style commentary: first "@ " becomes "//"
        $line =~ s{@\s}{//};

        # fix up remaining legacy suffixes
        $line =~ s/\.[ui]?8(\s)/$1/;
        if ($line =~ s/\.[uis]?32//) {
            $line =~ s/\.16b/\.4s/g;
        }
        if ($line =~ m/\.p64/) {		# 1st pmull argument
            $line =~ s/\.16b/\.1q/;
        }
        if ($line =~ m/l\.p64/) {		# 2nd and 3rd pmull arguments
            $line =~ s/\.16b/\.1d/g;
        }
        if ($line =~ s/\.[uisp]?64//) {
            $line =~ s/\.16b/\.2d/g;
        }
        $line =~ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/;

        print $line, "\n";
    }
} else {				######## 32-bit code
    # Rewrite the operands of "vdup.32 qD,qN[i]" so the source is the
    # d register holding that lane: d(2N + i>>1) indexed by i&1.
    # Evaluates to a false value when the operands do not match.
    sub unvdup32 {
        my ($operands) = @_;

        return $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/
            && sprintf("vdup.32 q%d,d%d[%d]", $1, 2*$2 + ($3 >> 1), $3 & 1);
    }

    # Hand-assemble "pmull"/"pmull2" with three q operands into a 32-bit
    # opcode and return it as an INST(b0,b1,b2,b3) line, least significant
    # byte first, followed by the original instruction as a comment.
    # Nothing useful is produced when the operands are not three q registers.
    sub unvpmullp64 {
        my ($mnemonic, $operands) = @_;

        if ($operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/) {
            my ($qd, $qn, $qm) = ($1, $2, $3);
            my $opcode = 0xf2a00e00
                       | (($qd & 7) << 13) | (($qd & 8) << 19)
                       | (($qn & 7) << 17) | (($qn & 8) << 4)
                       | (($qm & 7) << 1)  | (($qm & 8) << 2);
            # any "2" in the mnemonic (pmull2) sets two extra opcode bits
            $opcode |= 0x00010001 if $mnemonic =~ /2/;

            # ARMv7 instructions are always encoded little-endian, so the
            # bytes are listed low to high.  The .inst directive would be
            # the proper tool, but older assemblers don't implement it.
            return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                           (map { ($opcode >> $_) & 0xff } (0, 8, 16, 24)),
                           $mnemonic, $operands;
        }
    }

    for my $line (split /\n/, $code) {
        $line =~ s/\b[wx]([0-9]+)\b/r$1/g;		# new->old registers
        $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;	# new->old registers
        $line =~ s{//\s?}{@ };			# new->old style commentary

        # fix up remaining new-style suffixes
        $line =~ s/\],#[0-9]+/]!/;

        # Instruction rules are tried in order; the chain stops at the
        # first substitution that succeeds.
        $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/
            or $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/ge
            or $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/ge
            or $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge
            or $line =~ s/^(\s+)b\./$1b/
            or $line =~ s/^(\s+)ret/$1bx\tlr/;

        # "mov.<cond>" becomes "mov<cond>", preceded by an "it <cond>" line
        if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }

        print $line, "\n";
    }
}

close(STDOUT) or die "error closing STDOUT: $!";	# enforce flush