#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about "528B" variant. In ARM case it makes less sense to
# implement it for the following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever the
# *native* byte order is on the current platform. See gcm128.c for a
# working example...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Command line: [flavour] [output-file]. Both are optional; they are
# recognised purely by shape (see the two tests below) and removed from @ARGV.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path including the trailing separator,
    # or "" when $0 carries no directory component. The explicit default
    # avoids depending on $1 being undefined after a failed match.
    $dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

    # arm-xlate.pl is looked up next to this script first, then in the
    # shared perlasm directory.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is piped through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to $output when one was given,
    # otherwise leave STDOUT untouched. A failed open is fatal rather than
    # silently sending the generated code to the original STDOUT.
    if ($output) {
        open STDOUT,">",$output
            or die "can't open $output: $!";
    }
}

# Symbolic register names used by the integer-only ("4-bit") code below.
# They are interpolated into the assembly heredocs, so each value must be a
# valid ARM register name.
$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

# Aliases: gcm_gmult_4bit has no input pointer or length argument, so the
# registers that hold them in gcm_ghash_4bit are reused there.
$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

# Zsmash(@insns) - append to $code the instructions that write the 128-bit
# value held in $Zll,$Zlh,$Zhl,$Zhh back to memory at [$Xi].
#
# The four words go to offsets 12, 8, 4 and 0 respectively. Three variants
# are emitted under preprocessor control:
#   - ARMv7+ little-endian: byte-reverse with "rev", then a word store;
#   - big-endian:           plain word store;
#   - otherwise:            four byte stores, most significant byte at the
#                           lowest address (uses $Tlh,$Thl,$Thh as scratch).
#
# @insns holds up to four assembly strings. One is emitted, prefixed with a
# tab, after each word's store sequence, which lets the caller interleave
# unrelated instructions with the stores. Missing entries produce a line
# containing only the tab, exactly as before.
#
# The sub is declared without a prototype: it takes arguments, and existing
# callers invoke it as &Zsmash(...), which is unaffected by this.
sub Zsmash {
  my @insns=@_;
  my $off=12;
  for my $reg ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$reg,$reg
	str	$reg,[$Xi,#$off]
#elif defined(__ARMEB__)
	str	$reg,[$Xi,#$off]
#else
	mov	$Tlh,$reg,lsr#8
	strb	$reg,[$Xi,#$off+3]
	mov	$Thl,$reg,lsr#16
	strb	$Tlh,[$Xi,#$off+2]
	mov	$Thh,$reg,lsr#24
	strb	$Thl,[$Xi,#$off+1]
	strb	$Thh,[$Xi,#$off]
#endif
___
    # Interleaved caller-supplied instruction (may be absent).
    my $insn=shift(@insns);
    $insn="" unless defined $insn;
    $code.="\t".$insn."\n";
    $off-=4;
  }
}

# Start of the generated assembly: file preamble, the shared rem_4bit table,
# the rem_4bit_get helper used by gcm_gmult_4bit, and the body of
# gcm_ghash_4bit up to the point where Xi is written back.
#
# Notes for maintainers:
#  - Under Thumb-2 or clang the unified syntax is selected and the pre-UAL
#    mnemonics ldrplb/ldrneb used below are mapped to ldrbpl/ldrbne.
#  - In ARM mode &rem_4bit is formed as a pc-relative constant (#8+32 in
#    rem_4bit_get, #8+48 in gcm_ghash_4bit). Those constants encode the size
#    and order of rem_4bit (16 halfwords) and rem_4bit_get (4 instructions);
#    changing either requires updating them.
#  - gcm_ghash_4bit copies rem_4bit onto the stack and indexes it through sp;
#    the saved end-of-input pointer therefore sits at [sp,#32].
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	$rem_4bit,rem_4bit
#else
	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
	# Write Z back to Xi. The end-of-input compare is slotted in after the
	# first word's store, and the conditional load of the next block's last
	# byte after the second. The second string starts with "\n" because
	# Zsmash prefixes each entry with a tab, and the entry has to begin
	# with a preprocessor line.
	&Zsmash("cmp\t$inp,$len","\n".
				 "#ifdef __thumb2__\n".
				 "	it	ne\n".
				 "#endif\n".
				 "	ldrneb	$nlo,[$inp,#15]");
# Tail of gcm_ghash_4bit (loop back while input remains, then drop the stack
# copy of rem_4bit plus the saved end pointer and return), followed by
# gcm_gmult_4bit up to the point where Xi is written back.
#
# gcm_gmult_4bit branches to rem_4bit_get to obtain &rem_4bit in $rem_4bit and
# resumes at .Lrem_4bit_got; unlike the ghash routine it reads the table in
# place rather than from a stack copy.
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
	# Write Z back to Xi; nothing needs to be interleaved here.
	&Zsmash();
# Return sequence of gcm_gmult_4bit: restore the registers pushed on entry.
# On ARMv5+ the return address is popped straight into pc; older cores pop
# into lr and pick "mov pc,lr" or "bx lr" depending on bit 0 of lr.
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
# NEON code path: gcm_init_neon, gcm_gmult_neon and gcm_ghash_neon, emitted
# only when __ARM_MAX_ARCH__>=7. The bare block keeps the NEON register names
# lexical so they cannot clash with the integer-path globals.
#
# Register names are written as "qN" with "#lo"/"#hi" suffixes for the two
# 64-bit halves; the post-processing loop at the end of the file rewrites
# those to the corresponding "dM" names.
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
# Four scratch registers: q8..q11 (the range previously ran to 12 and
# produced a fifth, unused name).
my ($t0,$t1,$t2,$t3)=map("q$_",(8..11));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

# clmul64x64($r,$a,$b) - append code computing the carry-less product of the
# 64-bit operands $a and $b into the 128-bit register $r, built from 8-bit
# polynomial multiplies (vmull.p8) of byte-rotated operands.
#   $r      destination q register name
#   $a, $b  source d register names (or "qN#lo"/"qN#hi")
# Uses $t0..$t3 as scratch and expects the masks $k48, $k32 and $k16 to have
# been loaded by the caller.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
}

# gcm_init_neon stores the "twisted" H at [r0]; gcm_gmult_neon loads Xi and
# jumps into the middle of gcm_ghash_neon's loop with a length of 16 so that
# exactly one multiplication is performed.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		$IN#hi,[r1]!		@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi]!		@ load Xi
	vld1.64		$IN#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
	vld1.64		$Xl#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
	# Three 64x64 multiplications (Karatsuba) produce the 256-bit product.
	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
$code.=<<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
$code.=<<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
 	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub		$Xi,#16
	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
	vst1.64		$Xl#lo,[$Xi]

	ret					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
# Identification string embedded in the object file.
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Post-process the generated text line by line and print it.
foreach (split("\n",$code)) {
	# Evaluate any `...` expression embedded in the template.
	s/\`([^\`]*)\`/eval $1/geo;

	# At most one of the following applies to a given line:
	#  - qN#lo / qN#hi  -> d(2N) / d(2N+1), the 64-bit halves of qN;
	#  - "ret"          -> "bx lr" (left as a mnemonic);
	#  - explicit "bx lr" -> its raw encoding.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx\tlr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush