11cb0ef41Sopenharmony_ci#! /usr/bin/env perl 21cb0ef41Sopenharmony_ci# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved. 31cb0ef41Sopenharmony_ci# 41cb0ef41Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License"). You may not use 51cb0ef41Sopenharmony_ci# this file except in compliance with the License. You can obtain a copy 61cb0ef41Sopenharmony_ci# in the file LICENSE in the source distribution or at 71cb0ef41Sopenharmony_ci# https://www.openssl.org/source/license.html 81cb0ef41Sopenharmony_ci 91cb0ef41Sopenharmony_ci# 101cb0ef41Sopenharmony_ci# ==================================================================== 111cb0ef41Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 121cb0ef41Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and 131cb0ef41Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further 141cb0ef41Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/. 151cb0ef41Sopenharmony_ci# ==================================================================== 161cb0ef41Sopenharmony_ci# 171cb0ef41Sopenharmony_ci# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. 181cb0ef41Sopenharmony_ci# 191cb0ef41Sopenharmony_ci# June 2014 201cb0ef41Sopenharmony_ci# 211cb0ef41Sopenharmony_ci# Initial version was developed in tight cooperation with Ard 221cb0ef41Sopenharmony_ci# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. 231cb0ef41Sopenharmony_ci# Just like aesv8-armx.pl this module supports both AArch32 and 241cb0ef41Sopenharmony_ci# AArch64 execution modes. 251cb0ef41Sopenharmony_ci# 261cb0ef41Sopenharmony_ci# July 2014 271cb0ef41Sopenharmony_ci# 281cb0ef41Sopenharmony_ci# Implement 2x aggregated reduction [see ghash-x86.pl for background 291cb0ef41Sopenharmony_ci# information]. 
301cb0ef41Sopenharmony_ci# 311cb0ef41Sopenharmony_ci# November 2017 321cb0ef41Sopenharmony_ci# 331cb0ef41Sopenharmony_ci# AArch64 register bank to "accommodate" 4x aggregated reduction and 341cb0ef41Sopenharmony_ci# improve performance by 20-70% depending on processor. 351cb0ef41Sopenharmony_ci# 361cb0ef41Sopenharmony_ci# Current performance in cycles per processed byte: 371cb0ef41Sopenharmony_ci# 381cb0ef41Sopenharmony_ci# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) 391cb0ef41Sopenharmony_ci# Apple A7 0.58 0.92 5.62 401cb0ef41Sopenharmony_ci# Cortex-A53 0.85 1.01 8.39 411cb0ef41Sopenharmony_ci# Cortex-A57 0.73 1.17 7.61 421cb0ef41Sopenharmony_ci# Denver 0.51 0.65 6.02 431cb0ef41Sopenharmony_ci# Mongoose 0.65 1.10 8.06 441cb0ef41Sopenharmony_ci# Kryo 0.76 1.16 8.00 451cb0ef41Sopenharmony_ci# ThunderX2 1.05 461cb0ef41Sopenharmony_ci# 471cb0ef41Sopenharmony_ci# (*) presented for reference/comparison purposes; 481cb0ef41Sopenharmony_ci 491cb0ef41Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension) 501cb0ef41Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file 511cb0ef41Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 521cb0ef41Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; 531cb0ef41Sopenharmony_ci 541cb0ef41Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 551cb0ef41Sopenharmony_ci( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 561cb0ef41Sopenharmony_ci( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 571cb0ef41Sopenharmony_cidie "can't locate arm-xlate.pl"; 581cb0ef41Sopenharmony_ci 591cb0ef41Sopenharmony_ciopen OUT,"| \"$^X\" $xlate $flavour \"$output\"" 601cb0ef41Sopenharmony_ci or die "can't call $xlate: $!"; 611cb0ef41Sopenharmony_ci*STDOUT=*OUT; 621cb0ef41Sopenharmony_ci 631cb0ef41Sopenharmony_ci$Xi="x0"; # argument block 641cb0ef41Sopenharmony_ci$Htbl="x1"; 651cb0ef41Sopenharmony_ci$inp="x2"; 661cb0ef41Sopenharmony_ci$len="x3"; 671cb0ef41Sopenharmony_ci 681cb0ef41Sopenharmony_ci$inc="x12"; 691cb0ef41Sopenharmony_ci 701cb0ef41Sopenharmony_ci{ 711cb0ef41Sopenharmony_cimy ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); 721cb0ef41Sopenharmony_cimy ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); 731cb0ef41Sopenharmony_cimy $_byte = ($flavour =~ /win/ ? 
"DCB" : ".byte"); 741cb0ef41Sopenharmony_ci 751cb0ef41Sopenharmony_ci$code=<<___; 761cb0ef41Sopenharmony_ci#include "arm_arch.h" 771cb0ef41Sopenharmony_ci 781cb0ef41Sopenharmony_ci#if __ARM_MAX_ARCH__>=7 791cb0ef41Sopenharmony_ci___ 801cb0ef41Sopenharmony_ci$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 811cb0ef41Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 821cb0ef41Sopenharmony_ci.fpu neon 831cb0ef41Sopenharmony_ci#ifdef __thumb2__ 841cb0ef41Sopenharmony_ci.syntax unified 851cb0ef41Sopenharmony_ci.thumb 861cb0ef41Sopenharmony_ci# define INST(a,b,c,d) $_byte c,0xef,a,b 871cb0ef41Sopenharmony_ci#else 881cb0ef41Sopenharmony_ci.code 32 891cb0ef41Sopenharmony_ci# define INST(a,b,c,d) $_byte a,b,c,0xf2 901cb0ef41Sopenharmony_ci#endif 911cb0ef41Sopenharmony_ci 921cb0ef41Sopenharmony_ci.text 931cb0ef41Sopenharmony_ci___ 941cb0ef41Sopenharmony_ci 951cb0ef41Sopenharmony_ci################################################################################ 961cb0ef41Sopenharmony_ci# void gcm_init_v8(u128 Htable[16],const u64 H[2]); 971cb0ef41Sopenharmony_ci# 981cb0ef41Sopenharmony_ci# input: 128-bit H - secret parameter E(K,0^128) 991cb0ef41Sopenharmony_ci# output: precomputed table filled with degrees of twisted H; 1001cb0ef41Sopenharmony_ci# H is twisted to handle reverse bitness of GHASH; 1011cb0ef41Sopenharmony_ci# only few of 16 slots of Htable[16] are used; 1021cb0ef41Sopenharmony_ci# data is opaque to outside world (which allows to 1031cb0ef41Sopenharmony_ci# optimize the code independently); 1041cb0ef41Sopenharmony_ci# 1051cb0ef41Sopenharmony_ci$code.=<<___; 1061cb0ef41Sopenharmony_ci.global gcm_init_v8 1071cb0ef41Sopenharmony_ci.type gcm_init_v8,%function 1081cb0ef41Sopenharmony_ci.align 4 1091cb0ef41Sopenharmony_cigcm_init_v8: 1101cb0ef41Sopenharmony_ci vld1.64 {$t1},[x1] @ load input H 1111cb0ef41Sopenharmony_ci vmov.i8 $xC2,#0xe1 1121cb0ef41Sopenharmony_ci vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 1131cb0ef41Sopenharmony_ci vext.8 $IN,$t1,$t1,#8 
1141cb0ef41Sopenharmony_ci vshr.u64 $t2,$xC2,#63 1151cb0ef41Sopenharmony_ci vdup.32 $t1,${t1}[1] 1161cb0ef41Sopenharmony_ci vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 1171cb0ef41Sopenharmony_ci vshr.u64 $t2,$IN,#63 1181cb0ef41Sopenharmony_ci vshr.s32 $t1,$t1,#31 @ broadcast carry bit 1191cb0ef41Sopenharmony_ci vand $t2,$t2,$t0 1201cb0ef41Sopenharmony_ci vshl.i64 $IN,$IN,#1 1211cb0ef41Sopenharmony_ci vext.8 $t2,$t2,$t2,#8 1221cb0ef41Sopenharmony_ci vand $t0,$t0,$t1 1231cb0ef41Sopenharmony_ci vorr $IN,$IN,$t2 @ H<<<=1 1241cb0ef41Sopenharmony_ci veor $H,$IN,$t0 @ twisted H 1251cb0ef41Sopenharmony_ci vst1.64 {$H},[x0],#16 @ store Htable[0] 1261cb0ef41Sopenharmony_ci 1271cb0ef41Sopenharmony_ci @ calculate H^2 1281cb0ef41Sopenharmony_ci vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing 1291cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H,$H 1301cb0ef41Sopenharmony_ci veor $t0,$t0,$H 1311cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H,$H 1321cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$t0,$t0 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 1351cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 1361cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 1371cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 1381cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 1391cb0ef41Sopenharmony_ci 1401cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 1411cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 1421cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 1431cb0ef41Sopenharmony_ci 1441cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 1451cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 1461cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 1471cb0ef41Sopenharmony_ci veor $H2,$Xl,$t2 1481cb0ef41Sopenharmony_ci 1491cb0ef41Sopenharmony_ci vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing 1501cb0ef41Sopenharmony_ci veor $t1,$t1,$H2 1511cb0ef41Sopenharmony_ci vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed 1521cb0ef41Sopenharmony_ci vst1.64 {$Hhl-$H2},[x0],#32 @ store 
Htable[1..2] 1531cb0ef41Sopenharmony_ci___ 1541cb0ef41Sopenharmony_ciif ($flavour =~ /64/) { 1551cb0ef41Sopenharmony_cimy ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); 1561cb0ef41Sopenharmony_ci 1571cb0ef41Sopenharmony_ci$code.=<<___; 1581cb0ef41Sopenharmony_ci @ calculate H^3 and H^4 1591cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H, $H2 1601cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$H2,$H2 1611cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H, $H2 1621cb0ef41Sopenharmony_ci vpmull2.p64 $Yh,$H2,$H2 1631cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$t0,$t1 1641cb0ef41Sopenharmony_ci vpmull.p64 $Ym,$t1,$t1 1651cb0ef41Sopenharmony_ci 1661cb0ef41Sopenharmony_ci vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing 1671cb0ef41Sopenharmony_ci vext.8 $t1,$Yl,$Yh,#8 1681cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 1691cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t0 1701cb0ef41Sopenharmony_ci veor $t3,$Yl,$Yh 1711cb0ef41Sopenharmony_ci veor $Ym,$Ym,$t1 1721cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 1731cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 1741cb0ef41Sopenharmony_ci veor $Ym,$Ym,$t3 1751cb0ef41Sopenharmony_ci vpmull.p64 $t3,$Yl,$xC2 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 1781cb0ef41Sopenharmony_ci vmov $Yh#lo,$Ym#hi 1791cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 1801cb0ef41Sopenharmony_ci vmov $Ym#hi,$Yl#lo 1811cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 1821cb0ef41Sopenharmony_ci veor $Yl,$Ym,$t3 1831cb0ef41Sopenharmony_ci 1841cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 1851cb0ef41Sopenharmony_ci vext.8 $t3,$Yl,$Yl,#8 1861cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 1871cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$Yl,$xC2 1881cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 1891cb0ef41Sopenharmony_ci veor $t3,$t3,$Yh 1901cb0ef41Sopenharmony_ci veor $H, $Xl,$t2 @ H^3 1911cb0ef41Sopenharmony_ci veor $H2,$Yl,$t3 @ H^4 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ci vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing 
1941cb0ef41Sopenharmony_ci vext.8 $t1,$H2,$H2,#8 1951cb0ef41Sopenharmony_ci veor $t0,$t0,$H 1961cb0ef41Sopenharmony_ci veor $t1,$t1,$H2 1971cb0ef41Sopenharmony_ci vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed 1981cb0ef41Sopenharmony_ci vst1.64 {$H-$H2},[x0] @ store Htable[3..5] 1991cb0ef41Sopenharmony_ci___ 2001cb0ef41Sopenharmony_ci} 2011cb0ef41Sopenharmony_ci$code.=<<___; 2021cb0ef41Sopenharmony_ci ret 2031cb0ef41Sopenharmony_ci.size gcm_init_v8,.-gcm_init_v8 2041cb0ef41Sopenharmony_ci___ 2051cb0ef41Sopenharmony_ci################################################################################ 2061cb0ef41Sopenharmony_ci# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); 2071cb0ef41Sopenharmony_ci# 2081cb0ef41Sopenharmony_ci# input: Xi - current hash value; 2091cb0ef41Sopenharmony_ci# Htable - table precomputed in gcm_init_v8; 2101cb0ef41Sopenharmony_ci# output: Xi - next hash value Xi; 2111cb0ef41Sopenharmony_ci# 2121cb0ef41Sopenharmony_ci$code.=<<___; 2131cb0ef41Sopenharmony_ci.global gcm_gmult_v8 2141cb0ef41Sopenharmony_ci.type gcm_gmult_v8,%function 2151cb0ef41Sopenharmony_ci.align 4 2161cb0ef41Sopenharmony_cigcm_gmult_v8: 2171cb0ef41Sopenharmony_ci vld1.64 {$t1},[$Xi] @ load Xi 2181cb0ef41Sopenharmony_ci vmov.i8 $xC2,#0xe1 2191cb0ef41Sopenharmony_ci vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... 
2201cb0ef41Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 2211cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 2221cb0ef41Sopenharmony_ci vrev64.8 $t1,$t1 2231cb0ef41Sopenharmony_ci#endif 2241cb0ef41Sopenharmony_ci vext.8 $IN,$t1,$t1,#8 2251cb0ef41Sopenharmony_ci 2261cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 2271cb0ef41Sopenharmony_ci veor $t1,$t1,$IN @ Karatsuba pre-processing 2281cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 2291cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 2301cb0ef41Sopenharmony_ci 2311cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 2321cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 2331cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 2341cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 2351cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 2361cb0ef41Sopenharmony_ci 2371cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 2381cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 2391cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 2401cb0ef41Sopenharmony_ci 2411cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 2421cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 2431cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 2441cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 2451cb0ef41Sopenharmony_ci 2461cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 2471cb0ef41Sopenharmony_ci vrev64.8 $Xl,$Xl 2481cb0ef41Sopenharmony_ci#endif 2491cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 2501cb0ef41Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 2511cb0ef41Sopenharmony_ci 2521cb0ef41Sopenharmony_ci ret 2531cb0ef41Sopenharmony_ci.size gcm_gmult_v8,.-gcm_gmult_v8 2541cb0ef41Sopenharmony_ci___ 2551cb0ef41Sopenharmony_ci################################################################################ 2561cb0ef41Sopenharmony_ci# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); 2571cb0ef41Sopenharmony_ci# 2581cb0ef41Sopenharmony_ci# input: table precomputed in gcm_init_v8; 
2591cb0ef41Sopenharmony_ci# current hash value Xi; 2601cb0ef41Sopenharmony_ci# pointer to input data; 2611cb0ef41Sopenharmony_ci# length of input data in bytes, but divisible by block size; 2621cb0ef41Sopenharmony_ci# output: next hash value Xi; 2631cb0ef41Sopenharmony_ci# 2641cb0ef41Sopenharmony_ci$code.=<<___; 2651cb0ef41Sopenharmony_ci.global gcm_ghash_v8 2661cb0ef41Sopenharmony_ci.type gcm_ghash_v8,%function 2671cb0ef41Sopenharmony_ci.align 4 2681cb0ef41Sopenharmony_cigcm_ghash_v8: 2691cb0ef41Sopenharmony_ci___ 2701cb0ef41Sopenharmony_ci$code.=<<___ if ($flavour =~ /64/); 2711cb0ef41Sopenharmony_ci cmp $len,#64 2721cb0ef41Sopenharmony_ci b.hs .Lgcm_ghash_v8_4x 2731cb0ef41Sopenharmony_ci___ 2741cb0ef41Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 2751cb0ef41Sopenharmony_ci vstmdb sp!,{d8-d15} @ 32-bit ABI says so 2761cb0ef41Sopenharmony_ci___ 2771cb0ef41Sopenharmony_ci$code.=<<___; 2781cb0ef41Sopenharmony_ci vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 2791cb0ef41Sopenharmony_ci @ "[rotated]" means that 2801cb0ef41Sopenharmony_ci @ loaded value would have 2811cb0ef41Sopenharmony_ci @ to be rotated in order to 2821cb0ef41Sopenharmony_ci @ make it appear as in 2831cb0ef41Sopenharmony_ci @ algorithm specification 2841cb0ef41Sopenharmony_ci subs $len,$len,#32 @ see if $len is 32 or larger 2851cb0ef41Sopenharmony_ci mov $inc,#16 @ $inc is used as post- 2861cb0ef41Sopenharmony_ci @ increment for input pointer; 2871cb0ef41Sopenharmony_ci @ as loop is modulo-scheduled 2881cb0ef41Sopenharmony_ci @ $inc is zeroed just in time 2891cb0ef41Sopenharmony_ci @ to preclude overstepping 2901cb0ef41Sopenharmony_ci @ inp[len], which means that 2911cb0ef41Sopenharmony_ci @ last block[s] are actually 2921cb0ef41Sopenharmony_ci @ loaded twice, but last 2931cb0ef41Sopenharmony_ci @ copy is not processed 2941cb0ef41Sopenharmony_ci vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 2951cb0ef41Sopenharmony_ci vmov.i8 $xC2,#0xe1 2961cb0ef41Sopenharmony_ci vld1.64 {$H2},[$Htbl] 
2971cb0ef41Sopenharmony_ci cclr $inc,eq @ is it time to zero $inc? 2981cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi 2991cb0ef41Sopenharmony_ci vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] 3001cb0ef41Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 3011cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 3021cb0ef41Sopenharmony_ci vrev64.8 $t0,$t0 3031cb0ef41Sopenharmony_ci vrev64.8 $Xl,$Xl 3041cb0ef41Sopenharmony_ci#endif 3051cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 @ rotate I[0] 3061cb0ef41Sopenharmony_ci b.lo .Lodd_tail_v8 @ $len was less than 32 3071cb0ef41Sopenharmony_ci___ 3081cb0ef41Sopenharmony_ci{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); 3091cb0ef41Sopenharmony_ci ####### 3101cb0ef41Sopenharmony_ci # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 3111cb0ef41Sopenharmony_ci # [(H*Ii+1) + (H*Xi+1)] mod P = 3121cb0ef41Sopenharmony_ci # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 3131cb0ef41Sopenharmony_ci # 3141cb0ef41Sopenharmony_ci$code.=<<___; 3151cb0ef41Sopenharmony_ci vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] 3161cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 3171cb0ef41Sopenharmony_ci vrev64.8 $t1,$t1 3181cb0ef41Sopenharmony_ci#endif 3191cb0ef41Sopenharmony_ci vext.8 $In,$t1,$t1,#8 3201cb0ef41Sopenharmony_ci veor $IN,$IN,$Xl @ I[i]^=Xi 3211cb0ef41Sopenharmony_ci vpmull.p64 $Xln,$H,$In @ H·Ii+1 3221cb0ef41Sopenharmony_ci veor $t1,$t1,$In @ Karatsuba pre-processing 3231cb0ef41Sopenharmony_ci vpmull2.p64 $Xhn,$H,$In 3241cb0ef41Sopenharmony_ci b .Loop_mod2x_v8 3251cb0ef41Sopenharmony_ci 3261cb0ef41Sopenharmony_ci.align 4 3271cb0ef41Sopenharmony_ci.Loop_mod2x_v8: 3281cb0ef41Sopenharmony_ci vext.8 $t2,$IN,$IN,#8 3291cb0ef41Sopenharmony_ci subs $len,$len,#32 @ is there more data? 3301cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo 3311cb0ef41Sopenharmony_ci cclr $inc,lo @ is it time to zero $inc? 
3321cb0ef41Sopenharmony_ci 3331cb0ef41Sopenharmony_ci vpmull.p64 $Xmn,$Hhl,$t1 3341cb0ef41Sopenharmony_ci veor $t2,$t2,$IN @ Karatsuba pre-processing 3351cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi 3361cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Xln @ accumulate 3371cb0ef41Sopenharmony_ci vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) 3381cb0ef41Sopenharmony_ci vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2] 3391cb0ef41Sopenharmony_ci 3401cb0ef41Sopenharmony_ci veor $Xh,$Xh,$Xhn 3411cb0ef41Sopenharmony_ci cclr $inc,eq @ is it time to zero $inc? 3421cb0ef41Sopenharmony_ci veor $Xm,$Xm,$Xmn 3431cb0ef41Sopenharmony_ci 3441cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 3451cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 3461cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 3471cb0ef41Sopenharmony_ci vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3] 3481cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 3491cb0ef41Sopenharmony_ci vrev64.8 $t0,$t0 3501cb0ef41Sopenharmony_ci#endif 3511cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 3521cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 3531cb0ef41Sopenharmony_ci 3541cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 3551cb0ef41Sopenharmony_ci vrev64.8 $t1,$t1 3561cb0ef41Sopenharmony_ci#endif 3571cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 3581cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 3591cb0ef41Sopenharmony_ci vext.8 $In,$t1,$t1,#8 3601cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 3611cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 3621cb0ef41Sopenharmony_ci vpmull.p64 $Xln,$H,$In @ H·Ii+1 3631cb0ef41Sopenharmony_ci veor $IN,$IN,$Xh @ accumulate $IN early 3641cb0ef41Sopenharmony_ci 3651cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 3661cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 3671cb0ef41Sopenharmony_ci veor $IN,$IN,$t2 3681cb0ef41Sopenharmony_ci veor $t1,$t1,$In @ Karatsuba pre-processing 3691cb0ef41Sopenharmony_ci veor $IN,$IN,$Xl 
3701cb0ef41Sopenharmony_ci vpmull2.p64 $Xhn,$H,$In 3711cb0ef41Sopenharmony_ci b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes 3721cb0ef41Sopenharmony_ci 3731cb0ef41Sopenharmony_ci veor $Xh,$Xh,$t2 3741cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 @ re-construct $IN 3751cb0ef41Sopenharmony_ci adds $len,$len,#32 @ re-construct $len 3761cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Xh @ re-construct $Xl 3771cb0ef41Sopenharmony_ci b.eq .Ldone_v8 @ is $len zero? 3781cb0ef41Sopenharmony_ci___ 3791cb0ef41Sopenharmony_ci} 3801cb0ef41Sopenharmony_ci$code.=<<___; 3811cb0ef41Sopenharmony_ci.Lodd_tail_v8: 3821cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 3831cb0ef41Sopenharmony_ci veor $IN,$IN,$Xl @ inp^=Xi 3841cb0ef41Sopenharmony_ci veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi 3851cb0ef41Sopenharmony_ci 3861cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 3871cb0ef41Sopenharmony_ci veor $t1,$t1,$IN @ Karatsuba pre-processing 3881cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 3891cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 3901cb0ef41Sopenharmony_ci 3911cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 3921cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 3931cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 3941cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 3951cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 3961cb0ef41Sopenharmony_ci 3971cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 3981cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 3991cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 4001cb0ef41Sopenharmony_ci 4011cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 4021cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 4031cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 4041cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 4051cb0ef41Sopenharmony_ci 4061cb0ef41Sopenharmony_ci.Ldone_v8: 4071cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 4081cb0ef41Sopenharmony_ci vrev64.8 $Xl,$Xl 
4091cb0ef41Sopenharmony_ci#endif 4101cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 4111cb0ef41Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 4121cb0ef41Sopenharmony_ci 4131cb0ef41Sopenharmony_ci___ 4141cb0ef41Sopenharmony_ci$code.=<<___ if ($flavour !~ /64/); 4151cb0ef41Sopenharmony_ci vldmia sp!,{d8-d15} @ 32-bit ABI says so 4161cb0ef41Sopenharmony_ci___ 4171cb0ef41Sopenharmony_ci$code.=<<___; 4181cb0ef41Sopenharmony_ci ret 4191cb0ef41Sopenharmony_ci.size gcm_ghash_v8,.-gcm_ghash_v8 4201cb0ef41Sopenharmony_ci___ 4211cb0ef41Sopenharmony_ci 4221cb0ef41Sopenharmony_ciif ($flavour =~ /64/) { # 4x subroutine 4231cb0ef41Sopenharmony_cimy ($I0,$j1,$j2,$j3, 4241cb0ef41Sopenharmony_ci $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); 4251cb0ef41Sopenharmony_ci 4261cb0ef41Sopenharmony_ci$code.=<<___; 4271cb0ef41Sopenharmony_ci.type gcm_ghash_v8_4x,%function 4281cb0ef41Sopenharmony_ci.align 4 4291cb0ef41Sopenharmony_cigcm_ghash_v8_4x: 4301cb0ef41Sopenharmony_ci.Lgcm_ghash_v8_4x: 4311cb0ef41Sopenharmony_ci vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 4321cb0ef41Sopenharmony_ci vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 4331cb0ef41Sopenharmony_ci vmov.i8 $xC2,#0xe1 4341cb0ef41Sopenharmony_ci vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 4351cb0ef41Sopenharmony_ci vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 4361cb0ef41Sopenharmony_ci 4371cb0ef41Sopenharmony_ci vld1.64 {$I0-$j3},[$inp],#64 4381cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 4391cb0ef41Sopenharmony_ci vrev64.8 $Xl,$Xl 4401cb0ef41Sopenharmony_ci vrev64.8 $j1,$j1 4411cb0ef41Sopenharmony_ci vrev64.8 $j2,$j2 4421cb0ef41Sopenharmony_ci vrev64.8 $j3,$j3 4431cb0ef41Sopenharmony_ci vrev64.8 $I0,$I0 4441cb0ef41Sopenharmony_ci#endif 4451cb0ef41Sopenharmony_ci vext.8 $I3,$j3,$j3,#8 4461cb0ef41Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 4471cb0ef41Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 4481cb0ef41Sopenharmony_ci 4491cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 4501cb0ef41Sopenharmony_ci 
veor $j3,$j3,$I3 4511cb0ef41Sopenharmony_ci vpmull2.p64 $Yh,$H,$I3 4521cb0ef41Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j3 4531cb0ef41Sopenharmony_ci 4541cb0ef41Sopenharmony_ci vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 4551cb0ef41Sopenharmony_ci veor $j2,$j2,$I2 4561cb0ef41Sopenharmony_ci vpmull2.p64 $I2,$H2,$I2 4571cb0ef41Sopenharmony_ci vpmull2.p64 $j2,$Hhl,$j2 4581cb0ef41Sopenharmony_ci 4591cb0ef41Sopenharmony_ci veor $Yl,$Yl,$t0 4601cb0ef41Sopenharmony_ci veor $Yh,$Yh,$I2 4611cb0ef41Sopenharmony_ci veor $Ym,$Ym,$j2 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 4641cb0ef41Sopenharmony_ci veor $j1,$j1,$I1 4651cb0ef41Sopenharmony_ci vpmull2.p64 $I1,$H3,$I1 4661cb0ef41Sopenharmony_ci vpmull.p64 $j1,$H34,$j1 4671cb0ef41Sopenharmony_ci 4681cb0ef41Sopenharmony_ci veor $Yl,$Yl,$j3 4691cb0ef41Sopenharmony_ci veor $Yh,$Yh,$I1 4701cb0ef41Sopenharmony_ci veor $Ym,$Ym,$j1 4711cb0ef41Sopenharmony_ci 4721cb0ef41Sopenharmony_ci subs $len,$len,#128 4731cb0ef41Sopenharmony_ci b.lo .Ltail4x 4741cb0ef41Sopenharmony_ci 4751cb0ef41Sopenharmony_ci b .Loop4x 4761cb0ef41Sopenharmony_ci 4771cb0ef41Sopenharmony_ci.align 4 4781cb0ef41Sopenharmony_ci.Loop4x: 4791cb0ef41Sopenharmony_ci veor $t0,$I0,$Xl 4801cb0ef41Sopenharmony_ci vld1.64 {$I0-$j3},[$inp],#64 4811cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 4821cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 4831cb0ef41Sopenharmony_ci vrev64.8 $j1,$j1 4841cb0ef41Sopenharmony_ci vrev64.8 $j2,$j2 4851cb0ef41Sopenharmony_ci vrev64.8 $j3,$j3 4861cb0ef41Sopenharmony_ci vrev64.8 $I0,$I0 4871cb0ef41Sopenharmony_ci#endif 4881cb0ef41Sopenharmony_ci 4891cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 4901cb0ef41Sopenharmony_ci veor $t0,$t0,$IN 4911cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H4,$IN 4921cb0ef41Sopenharmony_ci vext.8 $I3,$j3,$j3,#8 4931cb0ef41Sopenharmony_ci vpmull2.p64 $Xm,$H34,$t0 4941cb0ef41Sopenharmony_ci 4951cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Yl 4961cb0ef41Sopenharmony_ci veor $Xh,$Xh,$Yh 
4971cb0ef41Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 4981cb0ef41Sopenharmony_ci veor $Xm,$Xm,$Ym 4991cb0ef41Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 5001cb0ef41Sopenharmony_ci 5011cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 5021cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 5031cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 5041cb0ef41Sopenharmony_ci veor $j3,$j3,$I3 5051cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 5061cb0ef41Sopenharmony_ci vpmull2.p64 $Yh,$H,$I3 5071cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 5081cb0ef41Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j3 5091cb0ef41Sopenharmony_ci 5101cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 5111cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 5121cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 5131cb0ef41Sopenharmony_ci vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 5141cb0ef41Sopenharmony_ci veor $j2,$j2,$I2 5151cb0ef41Sopenharmony_ci vpmull2.p64 $I2,$H2,$I2 5161cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 5171cb0ef41Sopenharmony_ci vpmull2.p64 $j2,$Hhl,$j2 5181cb0ef41Sopenharmony_ci 5191cb0ef41Sopenharmony_ci veor $Yl,$Yl,$t0 5201cb0ef41Sopenharmony_ci veor $Yh,$Yh,$I2 5211cb0ef41Sopenharmony_ci veor $Ym,$Ym,$j2 5221cb0ef41Sopenharmony_ci 5231cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 5241cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 5251cb0ef41Sopenharmony_ci vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 5261cb0ef41Sopenharmony_ci veor $j1,$j1,$I1 5271cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 5281cb0ef41Sopenharmony_ci vpmull2.p64 $I1,$H3,$I1 5291cb0ef41Sopenharmony_ci vpmull.p64 $j1,$H34,$j1 5301cb0ef41Sopenharmony_ci 5311cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 5321cb0ef41Sopenharmony_ci veor $Yl,$Yl,$j3 5331cb0ef41Sopenharmony_ci veor $Yh,$Yh,$I1 5341cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 5351cb0ef41Sopenharmony_ci veor $Ym,$Ym,$j1 5361cb0ef41Sopenharmony_ci 5371cb0ef41Sopenharmony_ci subs $len,$len,#64 5381cb0ef41Sopenharmony_ci b.hs .Loop4x 
5391cb0ef41Sopenharmony_ci 5401cb0ef41Sopenharmony_ci.Ltail4x: 5411cb0ef41Sopenharmony_ci veor $t0,$I0,$Xl 5421cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 5431cb0ef41Sopenharmony_ci 5441cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 5451cb0ef41Sopenharmony_ci veor $t0,$t0,$IN 5461cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H4,$IN 5471cb0ef41Sopenharmony_ci vpmull2.p64 $Xm,$H34,$t0 5481cb0ef41Sopenharmony_ci 5491cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Yl 5501cb0ef41Sopenharmony_ci veor $Xh,$Xh,$Yh 5511cb0ef41Sopenharmony_ci veor $Xm,$Xm,$Ym 5521cb0ef41Sopenharmony_ci 5531cb0ef41Sopenharmony_ci adds $len,$len,#64 5541cb0ef41Sopenharmony_ci b.eq .Ldone4x 5551cb0ef41Sopenharmony_ci 5561cb0ef41Sopenharmony_ci cmp $len,#32 5571cb0ef41Sopenharmony_ci b.lo .Lone 5581cb0ef41Sopenharmony_ci b.eq .Ltwo 5591cb0ef41Sopenharmony_ci.Lthree: 5601cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 5611cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 5621cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 5631cb0ef41Sopenharmony_ci vld1.64 {$I0-$j2},[$inp] 5641cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 5651cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 5661cb0ef41Sopenharmony_ci vrev64.8 $j1,$j1 5671cb0ef41Sopenharmony_ci vrev64.8 $j2,$j2 5681cb0ef41Sopenharmony_ci vrev64.8 $I0,$I0 5691cb0ef41Sopenharmony_ci#endif 5701cb0ef41Sopenharmony_ci 5711cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 5721cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 5731cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 5741cb0ef41Sopenharmony_ci vext.8 $I2,$j2,$j2,#8 5751cb0ef41Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 5761cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 5771cb0ef41Sopenharmony_ci 5781cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 5791cb0ef41Sopenharmony_ci veor $j2,$j2,$I2 5801cb0ef41Sopenharmony_ci 5811cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 5821cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 
5831cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 5841cb0ef41Sopenharmony_ci vpmull2.p64 $Yh,$H,$I2 5851cb0ef41Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j2 5861cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 5871cb0ef41Sopenharmony_ci vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 5881cb0ef41Sopenharmony_ci veor $j1,$j1,$I1 5891cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 5901cb0ef41Sopenharmony_ci 5911cb0ef41Sopenharmony_ci vpmull2.p64 $I1,$H2,$I1 5921cb0ef41Sopenharmony_ci veor $t0,$I0,$Xl 5931cb0ef41Sopenharmony_ci vpmull2.p64 $j1,$Hhl,$j1 5941cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 5951cb0ef41Sopenharmony_ci 5961cb0ef41Sopenharmony_ci veor $Yl,$Yl,$j3 5971cb0ef41Sopenharmony_ci veor $Yh,$Yh,$I1 5981cb0ef41Sopenharmony_ci veor $Ym,$Ym,$j1 5991cb0ef41Sopenharmony_ci 6001cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) 6011cb0ef41Sopenharmony_ci veor $t0,$t0,$IN 6021cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H3,$IN 6031cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$H34,$t0 6041cb0ef41Sopenharmony_ci 6051cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Yl 6061cb0ef41Sopenharmony_ci veor $Xh,$Xh,$Yh 6071cb0ef41Sopenharmony_ci veor $Xm,$Xm,$Ym 6081cb0ef41Sopenharmony_ci b .Ldone4x 6091cb0ef41Sopenharmony_ci 6101cb0ef41Sopenharmony_ci.align 4 6111cb0ef41Sopenharmony_ci.Ltwo: 6121cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 6131cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 6141cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 6151cb0ef41Sopenharmony_ci vld1.64 {$I0-$j1},[$inp] 6161cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 6171cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 6181cb0ef41Sopenharmony_ci vrev64.8 $j1,$j1 6191cb0ef41Sopenharmony_ci vrev64.8 $I0,$I0 6201cb0ef41Sopenharmony_ci#endif 6211cb0ef41Sopenharmony_ci 6221cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 6231cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 6241cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 6251cb0ef41Sopenharmony_ci vext.8 $I1,$j1,$j1,#8 6261cb0ef41Sopenharmony_ci veor 
$Xl,$Xm,$t2 6271cb0ef41Sopenharmony_ci 6281cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 6291cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 6301cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 6311cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 6321cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 6331cb0ef41Sopenharmony_ci 6341cb0ef41Sopenharmony_ci vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 6351cb0ef41Sopenharmony_ci veor $j1,$j1,$I1 6361cb0ef41Sopenharmony_ci 6371cb0ef41Sopenharmony_ci veor $t0,$I0,$Xl 6381cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 6391cb0ef41Sopenharmony_ci 6401cb0ef41Sopenharmony_ci vpmull2.p64 $Yh,$H,$I1 6411cb0ef41Sopenharmony_ci vpmull.p64 $Ym,$Hhl,$j1 6421cb0ef41Sopenharmony_ci 6431cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) 6441cb0ef41Sopenharmony_ci veor $t0,$t0,$IN 6451cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H2,$IN 6461cb0ef41Sopenharmony_ci vpmull2.p64 $Xm,$Hhl,$t0 6471cb0ef41Sopenharmony_ci 6481cb0ef41Sopenharmony_ci veor $Xl,$Xl,$Yl 6491cb0ef41Sopenharmony_ci veor $Xh,$Xh,$Yh 6501cb0ef41Sopenharmony_ci veor $Xm,$Xm,$Ym 6511cb0ef41Sopenharmony_ci b .Ldone4x 6521cb0ef41Sopenharmony_ci 6531cb0ef41Sopenharmony_ci.align 4 6541cb0ef41Sopenharmony_ci.Lone: 6551cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 6561cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 6571cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 6581cb0ef41Sopenharmony_ci vld1.64 {$I0},[$inp] 6591cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 6601cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 6611cb0ef41Sopenharmony_ci vrev64.8 $I0,$I0 6621cb0ef41Sopenharmony_ci#endif 6631cb0ef41Sopenharmony_ci 6641cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 6651cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 6661cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 6671cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 6681cb0ef41Sopenharmony_ci 6691cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 6701cb0ef41Sopenharmony_ci 
vpmull.p64 $Xl,$Xl,$xC2 6711cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 6721cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 6731cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 6741cb0ef41Sopenharmony_ci 6751cb0ef41Sopenharmony_ci veor $t0,$I0,$Xl 6761cb0ef41Sopenharmony_ci vext.8 $IN,$t0,$t0,#8 6771cb0ef41Sopenharmony_ci 6781cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$H,$IN 6791cb0ef41Sopenharmony_ci veor $t0,$t0,$IN 6801cb0ef41Sopenharmony_ci vpmull2.p64 $Xh,$H,$IN 6811cb0ef41Sopenharmony_ci vpmull.p64 $Xm,$Hhl,$t0 6821cb0ef41Sopenharmony_ci 6831cb0ef41Sopenharmony_ci.Ldone4x: 6841cb0ef41Sopenharmony_ci vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 6851cb0ef41Sopenharmony_ci veor $t2,$Xl,$Xh 6861cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t1 6871cb0ef41Sopenharmony_ci veor $Xm,$Xm,$t2 6881cb0ef41Sopenharmony_ci 6891cb0ef41Sopenharmony_ci vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 6901cb0ef41Sopenharmony_ci vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 6911cb0ef41Sopenharmony_ci vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 6921cb0ef41Sopenharmony_ci veor $Xl,$Xm,$t2 6931cb0ef41Sopenharmony_ci 6941cb0ef41Sopenharmony_ci vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 6951cb0ef41Sopenharmony_ci vpmull.p64 $Xl,$Xl,$xC2 6961cb0ef41Sopenharmony_ci veor $t2,$t2,$Xh 6971cb0ef41Sopenharmony_ci veor $Xl,$Xl,$t2 6981cb0ef41Sopenharmony_ci vext.8 $Xl,$Xl,$Xl,#8 6991cb0ef41Sopenharmony_ci 7001cb0ef41Sopenharmony_ci#ifndef __ARMEB__ 7011cb0ef41Sopenharmony_ci vrev64.8 $Xl,$Xl 7021cb0ef41Sopenharmony_ci#endif 7031cb0ef41Sopenharmony_ci vst1.64 {$Xl},[$Xi] @ write out Xi 7041cb0ef41Sopenharmony_ci 7051cb0ef41Sopenharmony_ci ret 7061cb0ef41Sopenharmony_ci.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x 7071cb0ef41Sopenharmony_ci___ 7081cb0ef41Sopenharmony_ci 7091cb0ef41Sopenharmony_ci} 7101cb0ef41Sopenharmony_ci} 7111cb0ef41Sopenharmony_ci 7121cb0ef41Sopenharmony_ci$code.=<<___; 7131cb0ef41Sopenharmony_ci.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 7141cb0ef41Sopenharmony_ci.align 2 
#endif
___

# Everything accumulated in $code is now rewritten one line at a time into
# the assembler dialect selected by $flavour and printed to STDOUT.
if ($flavour =~ /64/) {                 ######## 64-bit code
    # Turn the operand list of "vmov qD#lo|hi,qS#lo|hi" into an "ins"
    # between 64-bit lanes.  Register numbers of 8 and above are moved up
    # by eight; "lo" selects lane 0 and "hi" lane 1.  Any other operand
    # form yields an empty string.
    sub unvmov {
        my ($operands) = @_;

        if ($operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/) {
            my ($dst, $dst_half, $src, $src_half) = ($1, $2, $3, $4);
            my $dst_reg  = $dst < 8 ? $dst : $dst + 8;
            my $src_reg  = $src < 8 ? $src : $src + 8;
            my $dst_lane = ($dst_half eq "lo") ? 0 : 1;
            my $src_lane = ($src_half eq "lo") ? 0 : 1;

            return sprintf "ins v%d.d[%d],v%d.d[%d]",
                           $dst_reg, $dst_lane, $src_reg, $src_lane;
        }
        return "";
    }

    for my $line (split /\n/, $code) {
        # Fix up legacy mnemonics.  At most one of these rewrites is
        # applied per line: the first one that matches ends the chain.
        $line =~ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/
            or $line =~ s/vmov\.i8/movi/
            or $line =~ s/vmov\s+(.*)/unvmov($1)/ge
            or $line =~ s/vext\.8/ext/
            or $line =~ s/vshr\.s/sshr\.s/
            or $line =~ s/vshr/ushr/
            or $line =~ s/^(\s+)v/$1/           # strip off v prefix
            or $line =~ s/\bbx\s+lr\b/ret/;

        # old->new registers: qN becomes vN.16b (v(N+8).16b for N >= 8)
        $line =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;
        # old->new style commentary
        $line =~ s/@\s/\/\//;

        # Fix up remaining legacy suffixes.  The arrangement specifier
        # inserted above is narrowed according to the element-size suffix
        # that is found (and removed) on the mnemonic.
        $line =~ s/\.[ui]?8(\s)/$1/;
        $line =~ s/\.16b/\.4s/g if $line =~ s/\.[uis]?32//;
        $line =~ s/\.16b/\.1q/  if $line =~ m/\.p64/;   # 1st pmull argument
        $line =~ s/\.16b/\.1d/g if $line =~ m/l\.p64/;  # 2nd and 3rd pmull arguments
        $line =~ s/\.16b/\.2d/g if $line =~ s/\.[uisp]?64//;
        $line =~ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/;

        # Switch preprocessor checks to aarch64 versions.
        $line =~ s/__ARME([BL])__/__AARCH64E$1__/g;

        print $line, "\n";
    }
}
else {                                  ######## 32-bit code
    # Turn the operand list "qD,qS[lane]" of a vdup.32 into the form that
    # names the source as a d register and a lane within it: d(2*S) for
    # lanes 0-1, d(2*S+1) for lanes 2-3.  Any other operand form yields
    # an empty string.
    sub unvdup32 {
        my ($operands) = @_;

        if ($operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/) {
            my ($dst, $src, $lane) = ($1, $2, $3);

            return sprintf "vdup.32 q%d,d%d[%d]",
                           $dst, 2 * $src + ($lane >> 1), $lane & 1;
        }
        return "";
    }

    # Hand-assemble "pmull"/"pmull2" with three q-register operands into
    # an INST(...) line, keeping the original instruction as a trailing
    # comment.  Any other operand form yields an empty string.
    sub unvpmullp64 {
        my ($mnemonic, $operands) = @_;

        return ""
            unless $operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/;
        my ($dst, $src1, $src2) = ($1, $2, $3);

        # Low three bits and the fourth bit of each register number go
        # into separate positions of the opcode word.
        my $opcode = 0xf2a00e00;
        $opcode |= (($dst  & 7) << 13) | (($dst  & 8) << 19);
        $opcode |= (($src1 & 7) << 17) | (($src1 & 8) << 4);
        $opcode |= (($src2 & 7) << 1)  | (($src2 & 8) << 2);
        # The "2" form of the mnemonic additionally sets bits 0 and 16.
        $opcode |= 0x00010001 if $mnemonic =~ /2/;

        # The word is emitted byte by byte, least significant byte first,
        # since ARMv7 instructions are always encoded little-endian.
        # The correct solution is to use the .inst directive, but older
        # assemblers don't implement it:-(
        my @bytes = map { ($opcode >> $_) & 0xff } (0, 8, 16, 24);

        return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                       @bytes, $mnemonic, $operands;
    }

    for my $line (split /\n/, $code) {
        $line =~ s/\b[wx]([0-9]+)\b/r$1/g;                 # new->old registers
        $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;      # new->old registers
        $line =~ s/\/\/\s?/@ /;                            # new->old style commentary

        # fix up remaining new-style suffixes
        $line =~ s/\],#[0-9]+/]!/;

        # At most one of these rewrites is applied per line: the first
        # one that matches ends the chain.
        $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/
            or $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/ge
            or $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/ge
            or $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge
            or $line =~ s/^(\s+)b\./$1b/
            or $line =~ s/^(\s+)ret/$1bx\tlr/;

        # A conditional "mov.<cond>" becomes "mov<cond>" and is preceded
        # by an "it <cond>" line.
        if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }

        print $line, "\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush