1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# June 2014
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# Initial version was developed in tight cooperation with Ard
22e1051a39Sopenharmony_ci# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
23e1051a39Sopenharmony_ci# Just like aesv8-armx.pl this module supports both AArch32 and
24e1051a39Sopenharmony_ci# AArch64 execution modes.
25e1051a39Sopenharmony_ci#
26e1051a39Sopenharmony_ci# July 2014
27e1051a39Sopenharmony_ci#
28e1051a39Sopenharmony_ci# Implement 2x aggregated reduction [see ghash-x86.pl for background
29e1051a39Sopenharmony_ci# information].
30e1051a39Sopenharmony_ci#
31e1051a39Sopenharmony_ci# November 2017
32e1051a39Sopenharmony_ci#
33e1051a39Sopenharmony_ci# AArch64 register bank to "accommodate" 4x aggregated reduction and
34e1051a39Sopenharmony_ci# improve performance by 20-70% depending on processor.
35e1051a39Sopenharmony_ci#
36e1051a39Sopenharmony_ci# Current performance in cycles per processed byte:
37e1051a39Sopenharmony_ci#
38e1051a39Sopenharmony_ci#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
39e1051a39Sopenharmony_ci# Apple A7	0.58		0.92		5.62
40e1051a39Sopenharmony_ci# Cortex-A53	0.85		1.01		8.39
41e1051a39Sopenharmony_ci# Cortex-A57	0.73		1.17		7.61
42e1051a39Sopenharmony_ci# Denver	0.51		0.65		6.02
43e1051a39Sopenharmony_ci# Mongoose	0.65		1.10		8.06
44e1051a39Sopenharmony_ci# Kryo		0.76		1.16		8.00
45e1051a39Sopenharmony_ci# ThunderX2	1.05
46e1051a39Sopenharmony_ci#
47e1051a39Sopenharmony_ci# (*)	presented for reference/comparison purposes;
48e1051a39Sopenharmony_ci
49e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension)
50e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file
51e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
52e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
53e1051a39Sopenharmony_ci
# Locate the arm-xlate.pl translator: first in the directory this script was
# invoked from, then in ../../perlasm/ relative to that directory.  $dir is
# the directory part of $0 including its trailing separator (either / or \).
54e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55e1051a39Sopenharmony_ci( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
56e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
57e1051a39Sopenharmony_cidie "can't locate arm-xlate.pl";
58e1051a39Sopenharmony_ci
# Run the translator with the same perl binary ($^X) and make it the target of
# everything subsequently printed to STDOUT.  The translator path is quoted,
# just like the interpreter and output paths, so that a source tree located
# in a directory whose name contains spaces does not get split by the shell.
59e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
60e1051a39Sopenharmony_ci    or die "can't call $xlate: $!";
61e1051a39Sopenharmony_ci*STDOUT=*OUT;
62e1051a39Sopenharmony_ci
# Integer registers used by the generated routines.  x0..x3 carry the
# arguments in prototype order: Xi, Htable, input pointer and length
# (gcm_ghash_v8 takes all four, gcm_gmult_v8 only the first two).
63e1051a39Sopenharmony_ci$Xi="x0";	# argument block
64e1051a39Sopenharmony_ci$Htbl="x1";
65e1051a39Sopenharmony_ci$inp="x2";
66e1051a39Sopenharmony_ci$len="x3";
67e1051a39Sopenharmony_ci
# x12 holds the post-increment applied to the input pointer in gcm_ghash_v8.
68e1051a39Sopenharmony_ci$inc="x12";
69e1051a39Sopenharmony_ci
70e1051a39Sopenharmony_ci{
# Vector register aliases shared by all routines below:
#   $Xl,$Xm,$Xh,$IN                  -> q0..q3
#   $t0,$t1,$t2,$xC2,$H,$Hhl,$H2     -> q8..q14
# $xC2 receives the 0xc2.0 constant used by the reduction phases; $H, $Hhl
# and $H2 receive the values kept in Htable.
71e1051a39Sopenharmony_cimy ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
72e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
# Data directive substituted into the INST() macro below: DCB when the
# flavour string contains "win", .byte otherwise.
73e1051a39Sopenharmony_cimy $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
74e1051a39Sopenharmony_ci
# Start the output: pull in arm_arch.h and open the __ARM_MAX_ARCH__>=7
# conditional that guards the generated code.
75e1051a39Sopenharmony_ci$code=<<___;
76e1051a39Sopenharmony_ci#include "arm_arch.h"
77e1051a39Sopenharmony_ci
78e1051a39Sopenharmony_ci#if __ARM_MAX_ARCH__>=7
79e1051a39Sopenharmony_ci___
# Flavours containing "64": select the armv8-a+crypto architecture.
80e1051a39Sopenharmony_ci$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
# All other flavours: enable NEON and define INST(a,b,c,d), which emits
# "c,0xef,a,b" when __thumb2__ is defined and "a,b,c,0xf2" otherwise.
81e1051a39Sopenharmony_ci$code.=<<___					if ($flavour !~ /64/);
82e1051a39Sopenharmony_ci.fpu	neon
83e1051a39Sopenharmony_ci#ifdef __thumb2__
84e1051a39Sopenharmony_ci.syntax        unified
85e1051a39Sopenharmony_ci.thumb
86e1051a39Sopenharmony_ci# define INST(a,b,c,d) $_byte  c,0xef,a,b
87e1051a39Sopenharmony_ci#else
88e1051a39Sopenharmony_ci.code  32
89e1051a39Sopenharmony_ci# define INST(a,b,c,d) $_byte  a,b,c,0xf2
90e1051a39Sopenharmony_ci#endif
91e1051a39Sopenharmony_ci
92e1051a39Sopenharmony_ci.text
93e1051a39Sopenharmony_ci___
94e1051a39Sopenharmony_ci
95e1051a39Sopenharmony_ci################################################################################
96e1051a39Sopenharmony_ci# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
97e1051a39Sopenharmony_ci#
98e1051a39Sopenharmony_ci# input:	128-bit H - secret parameter E(K,0^128)
99e1051a39Sopenharmony_ci# output:	precomputed table filled with powers of twisted H;
100e1051a39Sopenharmony_ci#		H is twisted to handle reverse bitness of GHASH;
101e1051a39Sopenharmony_ci#		only a few of the 16 slots of Htable[16] are used;
102e1051a39Sopenharmony_ci#		data is opaque to the outside world (which allows the
103e1051a39Sopenharmony_ci#		code to be optimized independently);
104e1051a39Sopenharmony_ci#
# Table layout produced by the code below:
#   Htable[0]    twisted H
#   Htable[1]    packed Karatsuba pre-processed halves for H and H^2
#   Htable[2]    H^2
# Flavours containing "64" additionally store:
#   Htable[3]    H^3
#   Htable[4]    packed Karatsuba pre-processed halves for H^3 and H^4
#   Htable[5]    H^4
105e1051a39Sopenharmony_ci$code.=<<___;
106e1051a39Sopenharmony_ci.global	gcm_init_v8
107e1051a39Sopenharmony_ci.type	gcm_init_v8,%function
108e1051a39Sopenharmony_ci.align	4
109e1051a39Sopenharmony_cigcm_init_v8:
110e1051a39Sopenharmony_ci	vld1.64		{$t1},[x1]		@ load input H
111e1051a39Sopenharmony_ci	vmov.i8		$xC2,#0xe1
112e1051a39Sopenharmony_ci	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
113e1051a39Sopenharmony_ci	vext.8		$IN,$t1,$t1,#8
114e1051a39Sopenharmony_ci	vshr.u64	$t2,$xC2,#63
115e1051a39Sopenharmony_ci	vdup.32		$t1,${t1}[1]
116e1051a39Sopenharmony_ci	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
117e1051a39Sopenharmony_ci	vshr.u64	$t2,$IN,#63
118e1051a39Sopenharmony_ci	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
119e1051a39Sopenharmony_ci	vand		$t2,$t2,$t0
120e1051a39Sopenharmony_ci	vshl.i64	$IN,$IN,#1
121e1051a39Sopenharmony_ci	vext.8		$t2,$t2,$t2,#8
122e1051a39Sopenharmony_ci	vand		$t0,$t0,$t1
123e1051a39Sopenharmony_ci	vorr		$IN,$IN,$t2		@ H<<<=1
124e1051a39Sopenharmony_ci	veor		$H,$IN,$t0		@ twisted H
125e1051a39Sopenharmony_ci	vst1.64		{$H},[x0],#16		@ store Htable[0]
126e1051a39Sopenharmony_ci
127e1051a39Sopenharmony_ci	@ calculate H^2
128e1051a39Sopenharmony_ci	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
129e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H,$H
130e1051a39Sopenharmony_ci	veor		$t0,$t0,$H
131e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H,$H
132e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$t0,$t0
133e1051a39Sopenharmony_ci
134e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
135e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
136e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
137e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
138e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
139e1051a39Sopenharmony_ci
140e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
141e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
142e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
143e1051a39Sopenharmony_ci
144e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
145e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
146e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
147e1051a39Sopenharmony_ci	veor		$H2,$Xl,$t2
148e1051a39Sopenharmony_ci
149e1051a39Sopenharmony_ci	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
150e1051a39Sopenharmony_ci	veor		$t1,$t1,$H2
151e1051a39Sopenharmony_ci	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
152e1051a39Sopenharmony_ci	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
153e1051a39Sopenharmony_ci___
# Flavours containing "64" only: compute H^3 = H*H^2 and H^4 = H^2*H^2 side
# by side (the two instruction streams are interleaved; the second one is
# indented by one extra space) and append them to the table.  q4..q7 serve
# as the additional temporaries $t3,$Yl,$Ym,$Yh.
154e1051a39Sopenharmony_ciif ($flavour =~ /64/) {
155e1051a39Sopenharmony_cimy ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
156e1051a39Sopenharmony_ci
157e1051a39Sopenharmony_ci$code.=<<___;
158e1051a39Sopenharmony_ci	@ calculate H^3 and H^4
159e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H, $H2
160e1051a39Sopenharmony_ci	 vpmull.p64	$Yl,$H2,$H2
161e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H, $H2
162e1051a39Sopenharmony_ci	 vpmull2.p64	$Yh,$H2,$H2
163e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$t0,$t1
164e1051a39Sopenharmony_ci	 vpmull.p64	$Ym,$t1,$t1
165e1051a39Sopenharmony_ci
166e1051a39Sopenharmony_ci	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
167e1051a39Sopenharmony_ci	 vext.8		$t1,$Yl,$Yh,#8
168e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
169e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t0
170e1051a39Sopenharmony_ci	 veor		$t3,$Yl,$Yh
171e1051a39Sopenharmony_ci	 veor		$Ym,$Ym,$t1
172e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
173e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
174e1051a39Sopenharmony_ci	 veor		$Ym,$Ym,$t3
175e1051a39Sopenharmony_ci	 vpmull.p64	$t3,$Yl,$xC2
176e1051a39Sopenharmony_ci
177e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
178e1051a39Sopenharmony_ci	 vmov		$Yh#lo,$Ym#hi
179e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
180e1051a39Sopenharmony_ci	 vmov		$Ym#hi,$Yl#lo
181e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
182e1051a39Sopenharmony_ci	 veor		$Yl,$Ym,$t3
183e1051a39Sopenharmony_ci
184e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
185e1051a39Sopenharmony_ci	 vext.8		$t3,$Yl,$Yl,#8
186e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
187e1051a39Sopenharmony_ci	 vpmull.p64	$Yl,$Yl,$xC2
188e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
189e1051a39Sopenharmony_ci	 veor		$t3,$t3,$Yh
190e1051a39Sopenharmony_ci	veor		$H, $Xl,$t2		@ H^3
191e1051a39Sopenharmony_ci	 veor		$H2,$Yl,$t3		@ H^4
192e1051a39Sopenharmony_ci
193e1051a39Sopenharmony_ci	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
194e1051a39Sopenharmony_ci	 vext.8		$t1,$H2,$H2,#8
195e1051a39Sopenharmony_ci	veor		$t0,$t0,$H
196e1051a39Sopenharmony_ci	 veor		$t1,$t1,$H2
197e1051a39Sopenharmony_ci	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
198e1051a39Sopenharmony_ci	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
199e1051a39Sopenharmony_ci___
200e1051a39Sopenharmony_ci}
# Return and symbol size, emitted for every flavour.
201e1051a39Sopenharmony_ci$code.=<<___;
202e1051a39Sopenharmony_ci	ret
203e1051a39Sopenharmony_ci.size	gcm_init_v8,.-gcm_init_v8
204e1051a39Sopenharmony_ci___
205e1051a39Sopenharmony_ci################################################################################
206e1051a39Sopenharmony_ci# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
207e1051a39Sopenharmony_ci#
208e1051a39Sopenharmony_ci# input:	Xi - current hash value;
209e1051a39Sopenharmony_ci#		Htable - table precomputed in gcm_init_v8;
210e1051a39Sopenharmony_ci# output:	Xi - next hash value, written back in place;
211e1051a39Sopenharmony_ci#
# The emitted routine multiplies Xi by twisted H once.  It loads Xi and the
# first two table entries (twisted H and the packed Karatsuba halves), forms
# the product with three vpmull operations, reduces it in two phases using
# the 0xc2.0 constant and stores the result back to Xi.  When __ARMEB__ is
# not defined, Xi is passed through vrev64.8 after the load and again before
# the store.
212e1051a39Sopenharmony_ci$code.=<<___;
213e1051a39Sopenharmony_ci.global	gcm_gmult_v8
214e1051a39Sopenharmony_ci.type	gcm_gmult_v8,%function
215e1051a39Sopenharmony_ci.align	4
216e1051a39Sopenharmony_cigcm_gmult_v8:
217e1051a39Sopenharmony_ci	vld1.64		{$t1},[$Xi]		@ load Xi
218e1051a39Sopenharmony_ci	vmov.i8		$xC2,#0xe1
219e1051a39Sopenharmony_ci	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
220e1051a39Sopenharmony_ci	vshl.u64	$xC2,$xC2,#57
221e1051a39Sopenharmony_ci#ifndef __ARMEB__
222e1051a39Sopenharmony_ci	vrev64.8	$t1,$t1
223e1051a39Sopenharmony_ci#endif
224e1051a39Sopenharmony_ci	vext.8		$IN,$t1,$t1,#8
225e1051a39Sopenharmony_ci
226e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
227e1051a39Sopenharmony_ci	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
228e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
229e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
230e1051a39Sopenharmony_ci
231e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
232e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
233e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
234e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
235e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
236e1051a39Sopenharmony_ci
237e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
238e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
239e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
240e1051a39Sopenharmony_ci
241e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
242e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
243e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
244e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
245e1051a39Sopenharmony_ci
246e1051a39Sopenharmony_ci#ifndef __ARMEB__
247e1051a39Sopenharmony_ci	vrev64.8	$Xl,$Xl
248e1051a39Sopenharmony_ci#endif
249e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
250e1051a39Sopenharmony_ci	vst1.64		{$Xl},[$Xi]		@ write out Xi
251e1051a39Sopenharmony_ci
252e1051a39Sopenharmony_ci	ret
253e1051a39Sopenharmony_ci.size	gcm_gmult_v8,.-gcm_gmult_v8
254e1051a39Sopenharmony_ci___
255e1051a39Sopenharmony_ci################################################################################
256e1051a39Sopenharmony_ci# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
257e1051a39Sopenharmony_ci#
258e1051a39Sopenharmony_ci# input:	table precomputed in gcm_init_v8;
259e1051a39Sopenharmony_ci#		current hash value Xi;
260e1051a39Sopenharmony_ci#		pointer to input data;
261e1051a39Sopenharmony_ci#		length of input data in bytes, which has to be divisible by block size;
262e1051a39Sopenharmony_ci# output:	next hash value Xi;
263e1051a39Sopenharmony_ci#
# Structure of the emitted routine:
#   - flavours containing "64" branch to .Lgcm_ghash_v8_4x when len >= 64;
#   - otherwise .Loop_mod2x_v8 consumes 32 bytes per iteration using H and
#     H^2, and .Lodd_tail_v8 handles a single remaining 16-byte block;
#   - .Ldone_v8 writes Xi back.
264e1051a39Sopenharmony_ci$code.=<<___;
265e1051a39Sopenharmony_ci.global	gcm_ghash_v8
266e1051a39Sopenharmony_ci.type	gcm_ghash_v8,%function
267e1051a39Sopenharmony_ci.align	4
268e1051a39Sopenharmony_cigcm_ghash_v8:
269e1051a39Sopenharmony_ci___
# Flavours containing "64": hand inputs of 64 bytes or more to the 4x code.
270e1051a39Sopenharmony_ci$code.=<<___	if ($flavour =~ /64/);
271e1051a39Sopenharmony_ci	cmp		$len,#64
272e1051a39Sopenharmony_ci	b.hs		.Lgcm_ghash_v8_4x
273e1051a39Sopenharmony_ci___
# All other flavours: save d8-d15 on the stack; they are restored before the
# final ret further down.
274e1051a39Sopenharmony_ci$code.=<<___		if ($flavour !~ /64/);
275e1051a39Sopenharmony_ci	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
276e1051a39Sopenharmony_ci___
# Common setup: load Xi, twisted H, the packed Karatsuba halves and H^2,
# compose the 0xc2.0 constant, load the first input block and branch to
# .Lodd_tail_v8 if len was below 32.
277e1051a39Sopenharmony_ci$code.=<<___;
278e1051a39Sopenharmony_ci	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
279e1051a39Sopenharmony_ci						@ "[rotated]" means that
280e1051a39Sopenharmony_ci						@ loaded value would have
281e1051a39Sopenharmony_ci						@ to be rotated in order to
282e1051a39Sopenharmony_ci						@ make it appear as in
283e1051a39Sopenharmony_ci						@ algorithm specification
284e1051a39Sopenharmony_ci	subs		$len,$len,#32		@ see if $len is 32 or larger
285e1051a39Sopenharmony_ci	mov		$inc,#16		@ $inc is used as post-
286e1051a39Sopenharmony_ci						@ increment for input pointer;
287e1051a39Sopenharmony_ci						@ as loop is modulo-scheduled
288e1051a39Sopenharmony_ci						@ $inc is zeroed just in time
289e1051a39Sopenharmony_ci						@ to preclude overstepping
290e1051a39Sopenharmony_ci						@ inp[len], which means that
291e1051a39Sopenharmony_ci						@ last block[s] are actually
292e1051a39Sopenharmony_ci						@ loaded twice, but last
293e1051a39Sopenharmony_ci						@ copy is not processed
294e1051a39Sopenharmony_ci	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
295e1051a39Sopenharmony_ci	vmov.i8		$xC2,#0xe1
296e1051a39Sopenharmony_ci	vld1.64		{$H2},[$Htbl]
297e1051a39Sopenharmony_ci	cclr		$inc,eq			@ is it time to zero $inc?
298e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
299e1051a39Sopenharmony_ci	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
300e1051a39Sopenharmony_ci	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
301e1051a39Sopenharmony_ci#ifndef __ARMEB__
302e1051a39Sopenharmony_ci	vrev64.8	$t0,$t0
303e1051a39Sopenharmony_ci	vrev64.8	$Xl,$Xl
304e1051a39Sopenharmony_ci#endif
305e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
306e1051a39Sopenharmony_ci	b.lo		.Lodd_tail_v8		@ $len was less than 32
307e1051a39Sopenharmony_ci___
# 2x aggregated loop.  q4..q7 ($Xln,$Xmn,$Xhn,$In) hold the second block of
# each pair and its partial products; the identity it relies on follows.
308e1051a39Sopenharmony_ci{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
309e1051a39Sopenharmony_ci	#######
310e1051a39Sopenharmony_ci	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
311e1051a39Sopenharmony_ci	#	[(H*Ii+1) + (H*Xi+1)] mod P =
312e1051a39Sopenharmony_ci	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
313e1051a39Sopenharmony_ci	#
314e1051a39Sopenharmony_ci$code.=<<___;
315e1051a39Sopenharmony_ci	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
316e1051a39Sopenharmony_ci#ifndef __ARMEB__
317e1051a39Sopenharmony_ci	vrev64.8	$t1,$t1
318e1051a39Sopenharmony_ci#endif
319e1051a39Sopenharmony_ci	vext.8		$In,$t1,$t1,#8
320e1051a39Sopenharmony_ci	veor		$IN,$IN,$Xl		@ I[i]^=Xi
321e1051a39Sopenharmony_ci	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
322e1051a39Sopenharmony_ci	veor		$t1,$t1,$In		@ Karatsuba pre-processing
323e1051a39Sopenharmony_ci	vpmull2.p64	$Xhn,$H,$In
324e1051a39Sopenharmony_ci	b		.Loop_mod2x_v8
325e1051a39Sopenharmony_ci
326e1051a39Sopenharmony_ci.align	4
327e1051a39Sopenharmony_ci.Loop_mod2x_v8:
328e1051a39Sopenharmony_ci	vext.8		$t2,$IN,$IN,#8
329e1051a39Sopenharmony_ci	subs		$len,$len,#32		@ is there more data?
330e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
331e1051a39Sopenharmony_ci	cclr		$inc,lo			@ is it time to zero $inc?
332e1051a39Sopenharmony_ci
333e1051a39Sopenharmony_ci	 vpmull.p64	$Xmn,$Hhl,$t1
334e1051a39Sopenharmony_ci	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
335e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
336e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Xln		@ accumulate
337e1051a39Sopenharmony_ci	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
338e1051a39Sopenharmony_ci	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i+2]
339e1051a39Sopenharmony_ci
340e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$Xhn
341e1051a39Sopenharmony_ci	 cclr		$inc,eq			@ is it time to zero $inc?
342e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$Xmn
343e1051a39Sopenharmony_ci
344e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
345e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
346e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
347e1051a39Sopenharmony_ci	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+3]
348e1051a39Sopenharmony_ci#ifndef __ARMEB__
349e1051a39Sopenharmony_ci	 vrev64.8	$t0,$t0
350e1051a39Sopenharmony_ci#endif
351e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
352e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
353e1051a39Sopenharmony_ci
354e1051a39Sopenharmony_ci#ifndef __ARMEB__
355e1051a39Sopenharmony_ci	 vrev64.8	$t1,$t1
356e1051a39Sopenharmony_ci#endif
357e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
358e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
359e1051a39Sopenharmony_ci	 vext.8		$In,$t1,$t1,#8
360e1051a39Sopenharmony_ci	 vext.8		$IN,$t0,$t0,#8
361e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
362e1051a39Sopenharmony_ci	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
363e1051a39Sopenharmony_ci	veor		$IN,$IN,$Xh		@ accumulate $IN early
364e1051a39Sopenharmony_ci
365e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
366e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
367e1051a39Sopenharmony_ci	veor		$IN,$IN,$t2
368e1051a39Sopenharmony_ci	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
369e1051a39Sopenharmony_ci	veor		$IN,$IN,$Xl
370e1051a39Sopenharmony_ci	 vpmull2.p64	$Xhn,$H,$In
371e1051a39Sopenharmony_ci	b.hs		.Loop_mod2x_v8		@ there was at least 32 more bytes
372e1051a39Sopenharmony_ci
373e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$t2
374e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
375e1051a39Sopenharmony_ci	adds		$len,$len,#32		@ re-construct $len
376e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
377e1051a39Sopenharmony_ci	b.eq		.Ldone_v8		@ is $len zero?
378e1051a39Sopenharmony_ci___
379e1051a39Sopenharmony_ci}
# Tail: fold one remaining block into Xi with a single multiplication by H
# followed by the two-phase reduction, then (at .Ldone_v8) write Xi back.
380e1051a39Sopenharmony_ci$code.=<<___;
381e1051a39Sopenharmony_ci.Lodd_tail_v8:
382e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8
383e1051a39Sopenharmony_ci	veor		$IN,$IN,$Xl		@ inp^=Xi
384e1051a39Sopenharmony_ci	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi
385e1051a39Sopenharmony_ci
386e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
387e1051a39Sopenharmony_ci	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
388e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
389e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
390e1051a39Sopenharmony_ci
391e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
392e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
393e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
394e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
395e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
396e1051a39Sopenharmony_ci
397e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
398e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
399e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
400e1051a39Sopenharmony_ci
401e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
402e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
403e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
404e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
405e1051a39Sopenharmony_ci
406e1051a39Sopenharmony_ci.Ldone_v8:
407e1051a39Sopenharmony_ci#ifndef __ARMEB__
408e1051a39Sopenharmony_ci	vrev64.8	$Xl,$Xl
409e1051a39Sopenharmony_ci#endif
410e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
411e1051a39Sopenharmony_ci	vst1.64		{$Xl},[$Xi]		@ write out Xi
412e1051a39Sopenharmony_ci
413e1051a39Sopenharmony_ci___
# Flavours not containing "64": restore d8-d15 saved on entry.
414e1051a39Sopenharmony_ci$code.=<<___		if ($flavour !~ /64/);
415e1051a39Sopenharmony_ci	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
416e1051a39Sopenharmony_ci___
# Return and symbol size, emitted for every flavour.
417e1051a39Sopenharmony_ci$code.=<<___;
418e1051a39Sopenharmony_ci	ret
419e1051a39Sopenharmony_ci.size	gcm_ghash_v8,.-gcm_ghash_v8
420e1051a39Sopenharmony_ci___
421e1051a39Sopenharmony_ci
422e1051a39Sopenharmony_ciif ($flavour =~ /64/) {				# 4x subroutine
423e1051a39Sopenharmony_cimy ($I0,$j1,$j2,$j3,
424e1051a39Sopenharmony_ci    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
425e1051a39Sopenharmony_ci
426e1051a39Sopenharmony_ci$code.=<<___;
427e1051a39Sopenharmony_ci.type	gcm_ghash_v8_4x,%function
428e1051a39Sopenharmony_ci.align	4
429e1051a39Sopenharmony_cigcm_ghash_v8_4x:
430e1051a39Sopenharmony_ci.Lgcm_ghash_v8_4x:
431e1051a39Sopenharmony_ci	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
432e1051a39Sopenharmony_ci	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
433e1051a39Sopenharmony_ci	vmov.i8		$xC2,#0xe1
434e1051a39Sopenharmony_ci	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
435e1051a39Sopenharmony_ci	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
436e1051a39Sopenharmony_ci
437e1051a39Sopenharmony_ci	vld1.64		{$I0-$j3},[$inp],#64
438e1051a39Sopenharmony_ci#ifndef __ARMEB__
439e1051a39Sopenharmony_ci	vrev64.8	$Xl,$Xl
440e1051a39Sopenharmony_ci	vrev64.8	$j1,$j1
441e1051a39Sopenharmony_ci	vrev64.8	$j2,$j2
442e1051a39Sopenharmony_ci	vrev64.8	$j3,$j3
443e1051a39Sopenharmony_ci	vrev64.8	$I0,$I0
444e1051a39Sopenharmony_ci#endif
445e1051a39Sopenharmony_ci	vext.8		$I3,$j3,$j3,#8
446e1051a39Sopenharmony_ci	vext.8		$I2,$j2,$j2,#8
447e1051a39Sopenharmony_ci	vext.8		$I1,$j1,$j1,#8
448e1051a39Sopenharmony_ci
449e1051a39Sopenharmony_ci	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
450e1051a39Sopenharmony_ci	veor		$j3,$j3,$I3
451e1051a39Sopenharmony_ci	vpmull2.p64	$Yh,$H,$I3
452e1051a39Sopenharmony_ci	vpmull.p64	$Ym,$Hhl,$j3
453e1051a39Sopenharmony_ci
454e1051a39Sopenharmony_ci	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
455e1051a39Sopenharmony_ci	veor		$j2,$j2,$I2
456e1051a39Sopenharmony_ci	vpmull2.p64	$I2,$H2,$I2
457e1051a39Sopenharmony_ci	vpmull2.p64	$j2,$Hhl,$j2
458e1051a39Sopenharmony_ci
459e1051a39Sopenharmony_ci	veor		$Yl,$Yl,$t0
460e1051a39Sopenharmony_ci	veor		$Yh,$Yh,$I2
461e1051a39Sopenharmony_ci	veor		$Ym,$Ym,$j2
462e1051a39Sopenharmony_ci
463e1051a39Sopenharmony_ci	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
464e1051a39Sopenharmony_ci	veor		$j1,$j1,$I1
465e1051a39Sopenharmony_ci	vpmull2.p64	$I1,$H3,$I1
466e1051a39Sopenharmony_ci	vpmull.p64	$j1,$H34,$j1
467e1051a39Sopenharmony_ci
468e1051a39Sopenharmony_ci	veor		$Yl,$Yl,$j3
469e1051a39Sopenharmony_ci	veor		$Yh,$Yh,$I1
470e1051a39Sopenharmony_ci	veor		$Ym,$Ym,$j1
471e1051a39Sopenharmony_ci
472e1051a39Sopenharmony_ci	subs		$len,$len,#128
473e1051a39Sopenharmony_ci	b.lo		.Ltail4x
474e1051a39Sopenharmony_ci
475e1051a39Sopenharmony_ci	b		.Loop4x
476e1051a39Sopenharmony_ci
477e1051a39Sopenharmony_ci.align	4
478e1051a39Sopenharmony_ci.Loop4x:
479e1051a39Sopenharmony_ci	veor		$t0,$I0,$Xl
480e1051a39Sopenharmony_ci	 vld1.64	{$I0-$j3},[$inp],#64
481e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8
482e1051a39Sopenharmony_ci#ifndef __ARMEB__
483e1051a39Sopenharmony_ci	 vrev64.8	$j1,$j1
484e1051a39Sopenharmony_ci	 vrev64.8	$j2,$j2
485e1051a39Sopenharmony_ci	 vrev64.8	$j3,$j3
486e1051a39Sopenharmony_ci	 vrev64.8	$I0,$I0
487e1051a39Sopenharmony_ci#endif
488e1051a39Sopenharmony_ci
489e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
490e1051a39Sopenharmony_ci	veor		$t0,$t0,$IN
491e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H4,$IN
492e1051a39Sopenharmony_ci	 vext.8		$I3,$j3,$j3,#8
493e1051a39Sopenharmony_ci	vpmull2.p64	$Xm,$H34,$t0
494e1051a39Sopenharmony_ci
495e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Yl
496e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$Yh
497e1051a39Sopenharmony_ci	 vext.8		$I2,$j2,$j2,#8
498e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$Ym
499e1051a39Sopenharmony_ci	 vext.8		$I1,$j1,$j1,#8
500e1051a39Sopenharmony_ci
501e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
502e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
503e1051a39Sopenharmony_ci	 vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
504e1051a39Sopenharmony_ci	 veor		$j3,$j3,$I3
505e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
506e1051a39Sopenharmony_ci	 vpmull2.p64	$Yh,$H,$I3
507e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
508e1051a39Sopenharmony_ci	 vpmull.p64	$Ym,$Hhl,$j3
509e1051a39Sopenharmony_ci
510e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
511e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
512e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
513e1051a39Sopenharmony_ci	 vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
514e1051a39Sopenharmony_ci	 veor		$j2,$j2,$I2
515e1051a39Sopenharmony_ci	 vpmull2.p64	$I2,$H2,$I2
516e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
517e1051a39Sopenharmony_ci	 vpmull2.p64	$j2,$Hhl,$j2
518e1051a39Sopenharmony_ci
519e1051a39Sopenharmony_ci	 veor		$Yl,$Yl,$t0
520e1051a39Sopenharmony_ci	 veor		$Yh,$Yh,$I2
521e1051a39Sopenharmony_ci	 veor		$Ym,$Ym,$j2
522e1051a39Sopenharmony_ci
523e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
524e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
525e1051a39Sopenharmony_ci	 vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
526e1051a39Sopenharmony_ci	 veor		$j1,$j1,$I1
527e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
528e1051a39Sopenharmony_ci	 vpmull2.p64	$I1,$H3,$I1
529e1051a39Sopenharmony_ci	 vpmull.p64	$j1,$H34,$j1
530e1051a39Sopenharmony_ci
531e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
532e1051a39Sopenharmony_ci	 veor		$Yl,$Yl,$j3
533e1051a39Sopenharmony_ci	 veor		$Yh,$Yh,$I1
534e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
535e1051a39Sopenharmony_ci	 veor		$Ym,$Ym,$j1
536e1051a39Sopenharmony_ci
537e1051a39Sopenharmony_ci	subs		$len,$len,#64
538e1051a39Sopenharmony_ci	b.hs		.Loop4x
539e1051a39Sopenharmony_ci
540e1051a39Sopenharmony_ci.Ltail4x:
541e1051a39Sopenharmony_ci	veor		$t0,$I0,$Xl
542e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8
543e1051a39Sopenharmony_ci
544e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
545e1051a39Sopenharmony_ci	veor		$t0,$t0,$IN
546e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H4,$IN
547e1051a39Sopenharmony_ci	vpmull2.p64	$Xm,$H34,$t0
548e1051a39Sopenharmony_ci
549e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Yl
550e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$Yh
551e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$Ym
552e1051a39Sopenharmony_ci
553e1051a39Sopenharmony_ci	adds		$len,$len,#64
554e1051a39Sopenharmony_ci	b.eq		.Ldone4x
555e1051a39Sopenharmony_ci
556e1051a39Sopenharmony_ci	cmp		$len,#32
557e1051a39Sopenharmony_ci	b.lo		.Lone
558e1051a39Sopenharmony_ci	b.eq		.Ltwo
559e1051a39Sopenharmony_ci.Lthree:
560e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
561e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
562e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
563e1051a39Sopenharmony_ci	 vld1.64	{$I0-$j2},[$inp]
564e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
565e1051a39Sopenharmony_ci#ifndef	__ARMEB__
566e1051a39Sopenharmony_ci	 vrev64.8	$j1,$j1
567e1051a39Sopenharmony_ci	 vrev64.8	$j2,$j2
568e1051a39Sopenharmony_ci	 vrev64.8	$I0,$I0
569e1051a39Sopenharmony_ci#endif
570e1051a39Sopenharmony_ci
571e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
572e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
573e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
574e1051a39Sopenharmony_ci	 vext.8		$I2,$j2,$j2,#8
575e1051a39Sopenharmony_ci	 vext.8		$I1,$j1,$j1,#8
576e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
577e1051a39Sopenharmony_ci
578e1051a39Sopenharmony_ci	 vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
579e1051a39Sopenharmony_ci	 veor		$j2,$j2,$I2
580e1051a39Sopenharmony_ci
581e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
582e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
583e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
584e1051a39Sopenharmony_ci	 vpmull2.p64	$Yh,$H,$I2
585e1051a39Sopenharmony_ci	 vpmull.p64	$Ym,$Hhl,$j2
586e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
587e1051a39Sopenharmony_ci	 vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
588e1051a39Sopenharmony_ci	 veor		$j1,$j1,$I1
589e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
590e1051a39Sopenharmony_ci
591e1051a39Sopenharmony_ci	 vpmull2.p64	$I1,$H2,$I1
592e1051a39Sopenharmony_ci	veor		$t0,$I0,$Xl
593e1051a39Sopenharmony_ci	 vpmull2.p64	$j1,$Hhl,$j1
594e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8
595e1051a39Sopenharmony_ci
596e1051a39Sopenharmony_ci	 veor		$Yl,$Yl,$j3
597e1051a39Sopenharmony_ci	 veor		$Yh,$Yh,$I1
598e1051a39Sopenharmony_ci	 veor		$Ym,$Ym,$j1
599e1051a39Sopenharmony_ci
600e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
601e1051a39Sopenharmony_ci	veor		$t0,$t0,$IN
602e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H3,$IN
603e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$H34,$t0
604e1051a39Sopenharmony_ci
605e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Yl
606e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$Yh
607e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$Ym
608e1051a39Sopenharmony_ci	b		.Ldone4x
609e1051a39Sopenharmony_ci
610e1051a39Sopenharmony_ci.align	4
611e1051a39Sopenharmony_ci.Ltwo:
612e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
613e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
614e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
615e1051a39Sopenharmony_ci	 vld1.64	{$I0-$j1},[$inp]
616e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
617e1051a39Sopenharmony_ci#ifndef	__ARMEB__
618e1051a39Sopenharmony_ci	 vrev64.8	$j1,$j1
619e1051a39Sopenharmony_ci	 vrev64.8	$I0,$I0
620e1051a39Sopenharmony_ci#endif
621e1051a39Sopenharmony_ci
622e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
623e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
624e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
625e1051a39Sopenharmony_ci	 vext.8		$I1,$j1,$j1,#8
626e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
627e1051a39Sopenharmony_ci
628e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
629e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
630e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
631e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
632e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
633e1051a39Sopenharmony_ci
634e1051a39Sopenharmony_ci	 vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
635e1051a39Sopenharmony_ci	 veor		$j1,$j1,$I1
636e1051a39Sopenharmony_ci
637e1051a39Sopenharmony_ci	veor		$t0,$I0,$Xl
638e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8
639e1051a39Sopenharmony_ci
640e1051a39Sopenharmony_ci	 vpmull2.p64	$Yh,$H,$I1
641e1051a39Sopenharmony_ci	 vpmull.p64	$Ym,$Hhl,$j1
642e1051a39Sopenharmony_ci
643e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
644e1051a39Sopenharmony_ci	veor		$t0,$t0,$IN
645e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H2,$IN
646e1051a39Sopenharmony_ci	vpmull2.p64	$Xm,$Hhl,$t0
647e1051a39Sopenharmony_ci
648e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$Yl
649e1051a39Sopenharmony_ci	veor		$Xh,$Xh,$Yh
650e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$Ym
651e1051a39Sopenharmony_ci	b		.Ldone4x
652e1051a39Sopenharmony_ci
653e1051a39Sopenharmony_ci.align	4
654e1051a39Sopenharmony_ci.Lone:
655e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
656e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
657e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
658e1051a39Sopenharmony_ci	 vld1.64	{$I0},[$inp]
659e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
660e1051a39Sopenharmony_ci#ifndef	__ARMEB__
661e1051a39Sopenharmony_ci	 vrev64.8	$I0,$I0
662e1051a39Sopenharmony_ci#endif
663e1051a39Sopenharmony_ci
664e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
665e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
666e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
667e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
668e1051a39Sopenharmony_ci
669e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
670e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
671e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
672e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
673e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
674e1051a39Sopenharmony_ci
675e1051a39Sopenharmony_ci	veor		$t0,$I0,$Xl
676e1051a39Sopenharmony_ci	vext.8		$IN,$t0,$t0,#8
677e1051a39Sopenharmony_ci
678e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$H,$IN
679e1051a39Sopenharmony_ci	veor		$t0,$t0,$IN
680e1051a39Sopenharmony_ci	vpmull2.p64	$Xh,$H,$IN
681e1051a39Sopenharmony_ci	vpmull.p64	$Xm,$Hhl,$t0
682e1051a39Sopenharmony_ci
683e1051a39Sopenharmony_ci.Ldone4x:
684e1051a39Sopenharmony_ci	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
685e1051a39Sopenharmony_ci	veor		$t2,$Xl,$Xh
686e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t1
687e1051a39Sopenharmony_ci	veor		$Xm,$Xm,$t2
688e1051a39Sopenharmony_ci
689e1051a39Sopenharmony_ci	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
690e1051a39Sopenharmony_ci	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
691e1051a39Sopenharmony_ci	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
692e1051a39Sopenharmony_ci	veor		$Xl,$Xm,$t2
693e1051a39Sopenharmony_ci
694e1051a39Sopenharmony_ci	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
695e1051a39Sopenharmony_ci	vpmull.p64	$Xl,$Xl,$xC2
696e1051a39Sopenharmony_ci	veor		$t2,$t2,$Xh
697e1051a39Sopenharmony_ci	veor		$Xl,$Xl,$t2
698e1051a39Sopenharmony_ci	vext.8		$Xl,$Xl,$Xl,#8
699e1051a39Sopenharmony_ci
700e1051a39Sopenharmony_ci#ifndef __ARMEB__
701e1051a39Sopenharmony_ci	vrev64.8	$Xl,$Xl
702e1051a39Sopenharmony_ci#endif
703e1051a39Sopenharmony_ci	vst1.64		{$Xl},[$Xi]		@ write out Xi
704e1051a39Sopenharmony_ci
705e1051a39Sopenharmony_ci	ret
706e1051a39Sopenharmony_ci.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
707e1051a39Sopenharmony_ci___
708e1051a39Sopenharmony_ci
709e1051a39Sopenharmony_ci}
710e1051a39Sopenharmony_ci}
711e1051a39Sopenharmony_ci
# Append the trailer of the generated listing: a .rodata directive, the
# CRYPTOGAMS identification string, an alignment directive and an #endif
# (presumably closing a conditional emitted earlier in the listing).
$code .= <<"___";
.rodata
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___
718e1051a39Sopenharmony_ci
719e1051a39Sopenharmony_ciif ($flavour =~ /64/) {			######## 64-bit code
720e1051a39Sopenharmony_ci    sub unvmov {
721e1051a39Sopenharmony_ci	my $arg=shift;
722e1051a39Sopenharmony_ci
723e1051a39Sopenharmony_ci	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
724e1051a39Sopenharmony_ci	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
725e1051a39Sopenharmony_ci					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
726e1051a39Sopenharmony_ci    }
727e1051a39Sopenharmony_ci    foreach(split("\n",$code)) {
728e1051a39Sopenharmony_ci	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
729e1051a39Sopenharmony_ci	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
730e1051a39Sopenharmony_ci	s/vmov\s+(.*)/unvmov($1)/geo	or
731e1051a39Sopenharmony_ci	s/vext\.8/ext/o			or
732e1051a39Sopenharmony_ci	s/vshr\.s/sshr\.s/o		or
733e1051a39Sopenharmony_ci	s/vshr/ushr/o			or
734e1051a39Sopenharmony_ci	s/^(\s+)v/$1/o			or	# strip off v prefix
735e1051a39Sopenharmony_ci	s/\bbx\s+lr\b/ret/o;
736e1051a39Sopenharmony_ci
737e1051a39Sopenharmony_ci	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
738e1051a39Sopenharmony_ci	s/@\s/\/\//o;				# old->new style commentary
739e1051a39Sopenharmony_ci
740e1051a39Sopenharmony_ci	# fix up remaining legacy suffixes
741e1051a39Sopenharmony_ci	s/\.[ui]?8(\s)/$1/o;
742e1051a39Sopenharmony_ci	s/\.[uis]?32//o and s/\.16b/\.4s/go;
743e1051a39Sopenharmony_ci	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
744e1051a39Sopenharmony_ci	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
745e1051a39Sopenharmony_ci	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
746e1051a39Sopenharmony_ci	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
747e1051a39Sopenharmony_ci
748e1051a39Sopenharmony_ci	print $_,"\n";
749e1051a39Sopenharmony_ci    }
750e1051a39Sopenharmony_ci} else {				######## 32-bit code
751e1051a39Sopenharmony_ci    sub unvdup32 {
752e1051a39Sopenharmony_ci	my $arg=shift;
753e1051a39Sopenharmony_ci
754e1051a39Sopenharmony_ci	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
755e1051a39Sopenharmony_ci	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
756e1051a39Sopenharmony_ci    }
757e1051a39Sopenharmony_ci    sub unvpmullp64 {
758e1051a39Sopenharmony_ci	my ($mnemonic,$arg)=@_;
759e1051a39Sopenharmony_ci
760e1051a39Sopenharmony_ci	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
761e1051a39Sopenharmony_ci	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
762e1051a39Sopenharmony_ci				 |(($2&7)<<17)|(($2&8)<<4)
763e1051a39Sopenharmony_ci				 |(($3&7)<<1) |(($3&8)<<2);
764e1051a39Sopenharmony_ci	    $word |= 0x00010001	 if ($mnemonic =~ "2");
765e1051a39Sopenharmony_ci	    # since ARMv7 instructions are always encoded little-endian.
766e1051a39Sopenharmony_ci	    # correct solution is to use .inst directive, but older
767e1051a39Sopenharmony_ci	    # assemblers don't implement it:-(
768e1051a39Sopenharmony_ci	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
769e1051a39Sopenharmony_ci			$word&0xff,($word>>8)&0xff,
770e1051a39Sopenharmony_ci			($word>>16)&0xff,($word>>24)&0xff,
771e1051a39Sopenharmony_ci			$mnemonic,$arg;
772e1051a39Sopenharmony_ci	}
773e1051a39Sopenharmony_ci    }
774e1051a39Sopenharmony_ci
775e1051a39Sopenharmony_ci    foreach(split("\n",$code)) {
776e1051a39Sopenharmony_ci	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
777e1051a39Sopenharmony_ci	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
778e1051a39Sopenharmony_ci	s/\/\/\s?/@ /o;				# new->old style commentary
779e1051a39Sopenharmony_ci
780e1051a39Sopenharmony_ci	# fix up remaining new-style suffixes
781e1051a39Sopenharmony_ci	s/\],#[0-9]+/]!/o;
782e1051a39Sopenharmony_ci
783e1051a39Sopenharmony_ci	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
784e1051a39Sopenharmony_ci	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
785e1051a39Sopenharmony_ci	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
786e1051a39Sopenharmony_ci	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
787e1051a39Sopenharmony_ci	s/^(\s+)b\./$1b/o						or
788e1051a39Sopenharmony_ci	s/^(\s+)ret/$1bx\tlr/o;
789e1051a39Sopenharmony_ci
790e1051a39Sopenharmony_ci	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
791e1051a39Sopenharmony_ci	    print "     it      $2\n";
792e1051a39Sopenharmony_ci	}
793e1051a39Sopenharmony_ci
794e1051a39Sopenharmony_ci	print $_,"\n";
795e1051a39Sopenharmony_ci    }
796e1051a39Sopenharmony_ci}
797e1051a39Sopenharmony_ci
# Close STDOUT explicitly and abort if that fails, so that an error while
# flushing the generated output is reported instead of being lost.
close(STDOUT)
    or die "error closing STDOUT: $!";	# enforce flush
799