#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that the minimum ISA requirement is ARMv4, yet it can
# be assembled even as Thumb-2. The NEON code path is KECCAK_1X_ALT with
# register layout taken from Keccak Code Package. It's also as fast,
# in fact faster by 10-15% on some processors, and endian-neutral.
#
# August 2017.
#
# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
# of rotate instructions with logical ones. This resulted in ~10%
# improvement on most processors. Switching to KECCAK_2X effectively
# minimizes re-loads from temporary storage, and merged rotates simply
# eliminate the corresponding instructions. As for the latter: when
# examining the code you'll notice commented-out ror instructions.
# These are the eliminated ones, and you should trace the destination
# register below to see what's going on. In case you wonder why not all
# rotates are eliminated: the trouble is that some operations require
# both inputs to be rotated, e.g. 'eor a,b>>>x,c>>>y'. This conundrum
# is resolved by using 'eor a,b,c>>>(x-y)' and then merge-rotating 'a'
# in the next operation that takes 'a' as input. The catch is that this
# next operation can be in the next round. It's totally possible to
# "carry" rotate "factors" to the next round, but it makes the code more
# complex. And the last word is the keyword, i.e. "almost 1/2" is a
# kind of complexity cap [for the time being]...
#
# Reduce per-round instruction count in the Thumb-2 case by 16%. This is
# achieved by folding ldr/str pairs to their double-word counterparts.
# Theoretically this should have improved performance on single-issue
# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
# usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving.
#
#		r=1088(*)   Thumb-2(**) NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,   86,         36
# Cortex-A7	78/+160%,   68,         34
# Cortex-A8	51/+230%,   57,         30
# Cortex-A9	53/+210%,   51,         26
# Cortex-A15	42/+160%,   38,         18
# Snapdragon S4	43/+210%,   38,         24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.

# Command-line handling and output redirection.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path, including the trailing
    # separator; empty when the script was invoked without one.  $1 is
    # only consulted after a successful match so that a stale capture
    # can never be picked up.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # The translator is looked up next to this script first, then in
    # the perlasm directory two levels up.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # From here on everything printed to STDOUT is piped into
    # arm-xlate.pl, which receives $flavour and $output as arguments.
    # The translator path is quoted so that a source tree located in a
    # directory containing spaces does not split the command line.
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the output file, and
    # fail loudly instead of silently continuing on the inherited STDOUT.
    open STDOUT, ">", $output
        or die "can't open $output: $!";
}

# Register names substituted into the assembly templates:
#   @C - r0..r9, the ten general working registers
#   @E - r10, r11, r12 and r14; r13 is left out of the list (on ARM it
#        is the stack pointer, which the templates address as "sp")
my @C = map { "r$_" } 0 .. 9;
my @E = map { "r$_" } 10 .. 12, 14;

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

# Byte offsets from sp of every 64-bit slot in the frame above.  Slots
# are 8 bytes wide and numbered consecutively: A occupies slots 0..24,
# D slots 25..29 and T slots 30..54, so e.g. $A[4][2] == 8*22 and
# $T[0][0] == 240.  Each row of @A and @T is a reference to an array of
# five consecutive offsets.
my @A = map { my $row = $_; [ map { 8 * ($row + $_) } 0 .. 4 ] }
            (0, 5, 10, 15, 20);
my @D = map { 8 * $_ } 25 .. 29;
my @T = map { my $row = $_; [ map { 8 * ($row + $_) } 0 .. 4 ] }
            (30, 35, 40, 45, 50);

# Static prologue of the generated assembly, appended verbatim to $code
# (only the @C/@E register names and $A[..] stack offsets are
# interpolated):
#
#  - iotas32: a read-only table of 24 entries, two 32-bit words each.
#    The round code further down loads one entry per round as
#    "iotas[i]"; the values are the Keccak round constants in the
#    bit-interleaved form mentioned in the header comment.
#  - KeccakF1600_int: points @C[9], @E[2] and @E[0] at A[4][2], A[0][0]
#    and A[1][0] on the stack and loads A[4][2..4] into @C[4]..@C[9].
#  - KeccakF1600_enter: saves lr at sp+440, zeroes the counter kept at
#    sp+444 and branches to .Lround2x, where the round bodies follow.
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
172e1051a39Sopenharmony_cisub Round {
173e1051a39Sopenharmony_cimy (@A,@R); (@A[0..4],@R) = @_;
174e1051a39Sopenharmony_ci
175e1051a39Sopenharmony_ci$code.=<<___;
176e1051a39Sopenharmony_ci	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
177e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
178e1051a39Sopenharmony_ci#ifdef	__thumb2__
179e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
180e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
181e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
182e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
183e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
184e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
185e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[0]
186e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[1]
187e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[2]
188e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
189e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[3]
190e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
191e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[0]
192e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[1]
193e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[2]
194e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
195e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[3]
196e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
197e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[0]
198e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[1]
199e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[2]
200e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
201e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[3]
202e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
203e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[0]
204e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[1]
205e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[2]
206e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
207e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[3]
208e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
209e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
210e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
211e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
212e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
213e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
214e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
215e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[0]
216e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[1]
217e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[2]
218e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
219e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[3]
220e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
221e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[0]
222e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[1]
223e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[2]
224e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
225e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[3]
226e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
227e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[0]
228e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[1]
229e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[2]
230e1051a39Sopenharmony_ci	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
231e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[3]
232e1051a39Sopenharmony_ci	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
233e1051a39Sopenharmony_ci#else
234e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
235e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[1][2]
236e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
237e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
238e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
239e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
240e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[0]
241e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[1][4]
242e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[1]
243e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[2]
244e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[3]
245e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
246e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[0]
247e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[2][1]
248e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[1]
249e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[2]
250e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[3]
251e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
252e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[0]
253e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[2][3]
254e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[1]
255e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[2]
256e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[3]
257e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
258e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[0]
259e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[3][0]
260e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[1]
261e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[2]
262e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[3]
263e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
264e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
265e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[3][2]
266e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
267e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
268e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
269e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
270e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[0]
271e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[3][4]
272e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[1]
273e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[2]
274e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[3]
275e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
276e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[0]
277e1051a39Sopenharmony_ci	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
278e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[1]
279e1051a39Sopenharmony_ci	ldr	@E[1],[sp,#$A[4][1]+4]
280e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[2]
281e1051a39Sopenharmony_ci	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
282e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[3]
283e1051a39Sopenharmony_ci	ldr	@E[3],[sp,#$A[0][2]+4]
284e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[0]
285e1051a39Sopenharmony_ci	 add	@E[0],sp,#$A[0][3]
286e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[1]
287e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[2]
288e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[3]
289e1051a39Sopenharmony_ci	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
290e1051a39Sopenharmony_ci#endif
291e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[0]
292e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[1]
293e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[2]
294e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[3]
295e1051a39Sopenharmony_ci
296e1051a39Sopenharmony_ci	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
297e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
298e1051a39Sopenharmony_ci	eor	@E[1],@C[1],@C[4]
299e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$D[1]+4]
300e1051a39Sopenharmony_ci	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
301e1051a39Sopenharmony_ci	eor	@E[3],@C[7],@C[0]
302e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
303e1051a39Sopenharmony_ci	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
304e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$D[4]+4]
305e1051a39Sopenharmony_ci	eor	@C[1],@C[9],@C[2]
306e1051a39Sopenharmony_ci	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
307e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
308e1051a39Sopenharmony_ci	 ldr.l	@C[7],[sp,#$A[3][3]]
309e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@C[6]
310e1051a39Sopenharmony_ci	str.h	@C[1],[sp,#$D[0]+4]
311e1051a39Sopenharmony_ci	 ldr.h	@C[6],[sp,#$A[3][3]+4]
312e1051a39Sopenharmony_ci	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
313e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
314e1051a39Sopenharmony_ci	str.h	@C[3],[sp,#$D[2]+4]
315e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@C[8]
316e1051a39Sopenharmony_ci
317e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$A[4][4]]
318e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$A[4][4]+4]
319e1051a39Sopenharmony_ci	 str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
320e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@C[4]
321e1051a39Sopenharmony_ci	 str.h	@C[5],[sp,#$D[3]+4]
322e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@C[5]
323e1051a39Sopenharmony_ci	ldr.l	@C[4],[sp,#$A[0][0]]
324e1051a39Sopenharmony_ci	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
325e1051a39Sopenharmony_ci	@ ror	@C[6],@C[6],#32-11
326e1051a39Sopenharmony_ci	ldr.h	@C[5],[sp,#$A[0][0]+4]
327e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[2]
328e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[3]
329e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$A[2][2]]
330e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@C[4]
331e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$A[2][2]+4]
332e1051a39Sopenharmony_ci	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
333e1051a39Sopenharmony_ci	@ ror	@C[9],@C[9],#32-7
334e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@C[5]		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
335e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2]
336e1051a39Sopenharmony_ci	ldr.l	@C[2],[sp,#$A[1][1]]
337e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3]
338e1051a39Sopenharmony_ci	ldr.h	@C[3],[sp,#$A[1][1]+4]
339e1051a39Sopenharmony_ci	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
340e1051a39Sopenharmony_ci	 ldr	@E[2],[sp,#444]			@ load counter
341e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[0]
342e1051a39Sopenharmony_ci	 adr	@E[0],iotas32
343e1051a39Sopenharmony_ci	ror	@C[4],@E[3],#32-22
344e1051a39Sopenharmony_ci	 add	@E[3],@E[0],@E[2]
345e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[1]
346e1051a39Sopenharmony_ci___
347e1051a39Sopenharmony_ci$code.=<<___	if ($A[0][0] != $T[0][0]);
348e1051a39Sopenharmony_ci	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
349e1051a39Sopenharmony_ci___
350e1051a39Sopenharmony_ci$code.=<<___	if ($A[0][0] == $T[0][0]);
351e1051a39Sopenharmony_ci	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
352e1051a39Sopenharmony_ci	add	@E[2],@E[2],#16
353e1051a39Sopenharmony_ci	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
354e1051a39Sopenharmony_ci	cmp	@E[2],#192
355e1051a39Sopenharmony_ci	str	@E[2],[sp,#444]			@ store counter
356e1051a39Sopenharmony_ci___
357e1051a39Sopenharmony_ci$code.=<<___;
358e1051a39Sopenharmony_ci	bic	@E[2],@C[4],@C[2],ror#32-22
359e1051a39Sopenharmony_ci	bic	@E[3],@C[5],@C[3],ror#32-22
360e1051a39Sopenharmony_ci	 ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
361e1051a39Sopenharmony_ci	 ror	@C[3],@C[3],#32-22
362e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[0]
363e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[1]
364e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@E[2]
365e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@E[3]
366e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
367e1051a39Sopenharmony_ci	bic	@E[2],@C[6],@C[4],ror#11
368e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[0][0]+4]
369e1051a39Sopenharmony_ci	bic	@E[3],@C[7],@C[5],ror#10
370e1051a39Sopenharmony_ci	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
371e1051a39Sopenharmony_ci	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
372e1051a39Sopenharmony_ci	eor	@E[2],@C[2],@E[2],ror#32-11
373e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
374e1051a39Sopenharmony_ci	eor	@E[3],@C[3],@E[3],ror#32-10
375e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[0][1]+4]
376e1051a39Sopenharmony_ci	eor	@E[0],@C[4],@E[0],ror#32-7
377e1051a39Sopenharmony_ci	eor	@E[1],@C[5],@E[1],ror#32-7
378e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
379e1051a39Sopenharmony_ci	bic	@E[2],@C[0],@C[8],ror#32-7
380e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[0][2]+4]
381e1051a39Sopenharmony_ci	bic	@E[3],@C[1],@C[9],ror#32-7
382e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[6],ror#32-11
383e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
384e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[7],ror#32-10
385e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[0][3]+4]
386e1051a39Sopenharmony_ci	bic	@E[0],@C[2],@C[0]
387e1051a39Sopenharmony_ci	 add	@E[3],sp,#$D[3]
388e1051a39Sopenharmony_ci	 ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
389e1051a39Sopenharmony_ci	bic	@E[1],@C[3],@C[1]
390e1051a39Sopenharmony_ci	 ldr.h	@C[1],[sp,#$A[0][3]+4]
391e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[8],ror#32-7
392e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[9],ror#32-7
393e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
394e1051a39Sopenharmony_ci	 add	@C[9],sp,#$D[0]
395e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[0][4]+4]
396e1051a39Sopenharmony_ci
397e1051a39Sopenharmony_ci	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
398e1051a39Sopenharmony_ci	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]
399e1051a39Sopenharmony_ci
400e1051a39Sopenharmony_ci	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
401e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
402e1051a39Sopenharmony_ci	ldr.h	@C[3],[sp,#$A[1][4]+4]
403e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
404e1051a39Sopenharmony_ci	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
405e1051a39Sopenharmony_ci	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
406e1051a39Sopenharmony_ci	@ ror	@C[1],@C[1],#32-14
407e1051a39Sopenharmony_ci	ldr.h	@E[1],[sp,#$A[3][1]+4]
408e1051a39Sopenharmony_ci
409e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
410e1051a39Sopenharmony_ci	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
411e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
412e1051a39Sopenharmony_ci	ldr.h	@C[5],[sp,#$A[2][0]+4]
413e1051a39Sopenharmony_ci	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
414e1051a39Sopenharmony_ci	@ ror	@C[3],@C[3],#32-10
415e1051a39Sopenharmony_ci
416e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@C[4]
417e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
418e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@C[5]
419e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$D[2]+4]
420e1051a39Sopenharmony_ci	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
421e1051a39Sopenharmony_ci	ror	@C[4],@C[7],#32-2
422e1051a39Sopenharmony_ci
423e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[8]
424e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
425e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[9]
426e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$A[4][2]+4]
427e1051a39Sopenharmony_ci	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
428e1051a39Sopenharmony_ci	ror	@C[6],@E[1],#32-23
429e1051a39Sopenharmony_ci
430e1051a39Sopenharmony_ci	bic	@E[0],@C[4],@C[2],ror#32-10
431e1051a39Sopenharmony_ci	bic	@E[1],@C[5],@C[3],ror#32-10
432e1051a39Sopenharmony_ci	 eor	@E[2],@E[2],@C[8]
433e1051a39Sopenharmony_ci	 eor	@E[3],@E[3],@C[9]
434e1051a39Sopenharmony_ci	 ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
435e1051a39Sopenharmony_ci	 ror	@C[8],@E[3],#32-31
436e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[0],ror#32-14
437e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[1],ror#32-14
438e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
439e1051a39Sopenharmony_ci	bic	@E[2],@C[6],@C[4]
440e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[1][0]+4]
441e1051a39Sopenharmony_ci	bic	@E[3],@C[7],@C[5]
442e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2],ror#32-10
443e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
444e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3],ror#32-10
445e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[1][1]+4]
446e1051a39Sopenharmony_ci	bic	@E[0],@C[8],@C[6]
447e1051a39Sopenharmony_ci	bic	@E[1],@C[9],@C[7]
448e1051a39Sopenharmony_ci	bic	@E[2],@C[0],@C[8],ror#14
449e1051a39Sopenharmony_ci	bic	@E[3],@C[1],@C[9],ror#14
450e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[4]
451e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[5]
452e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
453e1051a39Sopenharmony_ci	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
454e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[1][2]+4]
455e1051a39Sopenharmony_ci	eor	@E[2],@C[6],@E[2],ror#32-14
456e1051a39Sopenharmony_ci	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
457e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
458e1051a39Sopenharmony_ci	eor	@E[3],@C[7],@E[3],ror#32-14
459e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[1][3]+4]
460e1051a39Sopenharmony_ci	 add	@E[2],sp,#$D[1]
461e1051a39Sopenharmony_ci	 ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
462e1051a39Sopenharmony_ci	eor	@E[0],@C[8],@C[2],ror#32-10
463e1051a39Sopenharmony_ci	 ldr.h	@C[0],[sp,#$A[0][1]+4]
464e1051a39Sopenharmony_ci	eor	@E[1],@C[9],@E[1],ror#32-10
465e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
466e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[1][4]+4]
467e1051a39Sopenharmony_ci
468e1051a39Sopenharmony_ci	add	@C[9],sp,#$D[3]
469e1051a39Sopenharmony_ci	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
470e1051a39Sopenharmony_ci	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
471e1051a39Sopenharmony_ci	ldr.h	@C[3],[sp,#$A[1][2]+4]
472e1051a39Sopenharmony_ci	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]
473e1051a39Sopenharmony_ci
474e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[0]
475e1051a39Sopenharmony_ci	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
476e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[1]
477e1051a39Sopenharmony_ci	ldr.h	@C[5],[sp,#$A[2][3]+4]
478e1051a39Sopenharmony_ci	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
479e1051a39Sopenharmony_ci
480e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
481e1051a39Sopenharmony_ci	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
482e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
483e1051a39Sopenharmony_ci	ldr.h	@E[1],[sp,#$A[3][4]+4]
484e1051a39Sopenharmony_ci	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
485e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
486e1051a39Sopenharmony_ci	@ ror	@C[3],@C[3],#32-3
487e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$D[0]+4]
488e1051a39Sopenharmony_ci
489e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@C[6]
490e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@C[7]
491e1051a39Sopenharmony_ci	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
492e1051a39Sopenharmony_ci	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]
493e1051a39Sopenharmony_ci
494e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[8]
495e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
496e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[9]
497e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$A[4][0]+4]
498e1051a39Sopenharmony_ci	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
499e1051a39Sopenharmony_ci	ror	@C[7],@E[1],#32-4
500e1051a39Sopenharmony_ci
501e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[8]
502e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[9]
503e1051a39Sopenharmony_ci	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
504e1051a39Sopenharmony_ci	ror	@C[9],@E[3],#32-9
505e1051a39Sopenharmony_ci
506e1051a39Sopenharmony_ci	bic	@E[0],@C[5],@C[2],ror#13-3
507e1051a39Sopenharmony_ci	bic	@E[1],@C[4],@C[3],ror#12-3
508e1051a39Sopenharmony_ci	bic	@E[2],@C[6],@C[5],ror#32-13
509e1051a39Sopenharmony_ci	bic	@E[3],@C[7],@C[4],ror#32-12
510e1051a39Sopenharmony_ci	eor	@E[0],@C[0],@E[0],ror#32-13
511e1051a39Sopenharmony_ci	eor	@E[1],@C[1],@E[1],ror#32-12
512e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
513e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2],ror#32-3
514e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[2][0]+4]
515e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3],ror#32-3
516e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
517e1051a39Sopenharmony_ci	bic	@E[0],@C[8],@C[6]
518e1051a39Sopenharmony_ci	bic	@E[1],@C[9],@C[7]
519e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[2][1]+4]
520e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[5],ror#32-13
521e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[4],ror#32-12
522e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
523e1051a39Sopenharmony_ci	bic	@E[2],@C[0],@C[8]
524e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[2][2]+4]
525e1051a39Sopenharmony_ci	bic	@E[3],@C[1],@C[9]
526e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[6]
527e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[7]
528e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
529e1051a39Sopenharmony_ci	bic	@E[0],@C[2],@C[0],ror#3
530e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[2][3]+4]
531e1051a39Sopenharmony_ci	bic	@E[1],@C[3],@C[1],ror#3
532e1051a39Sopenharmony_ci	 ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
533e1051a39Sopenharmony_ci	eor	@E[0],@C[8],@E[0],ror#32-3
534e1051a39Sopenharmony_ci	 ldr.h	@C[0],[sp,#$A[0][4]+4]
535e1051a39Sopenharmony_ci	eor	@E[1],@C[9],@E[1],ror#32-3
536e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
537e1051a39Sopenharmony_ci	 add	@C[9],sp,#$D[1]
538e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[2][4]+4]
539e1051a39Sopenharmony_ci
540e1051a39Sopenharmony_ci	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
541e1051a39Sopenharmony_ci	ldr.h	@E[1],[sp,#$D[4]+4]
542e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
543e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$D[0]+4]
544e1051a39Sopenharmony_ci
545e1051a39Sopenharmony_ci	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]
546e1051a39Sopenharmony_ci
547e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[0]
548e1051a39Sopenharmony_ci	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
549e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[1]
550e1051a39Sopenharmony_ci	ldr.h	@C[3],[sp,#$A[1][0]+4]
551e1051a39Sopenharmony_ci	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
552e1051a39Sopenharmony_ci	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
553e1051a39Sopenharmony_ci	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
554e1051a39Sopenharmony_ci	ldr.h	@C[5],[sp,#$A[2][1]+4]
555e1051a39Sopenharmony_ci
556e1051a39Sopenharmony_ci	eor	@C[2],@C[2],@E[2]
557e1051a39Sopenharmony_ci	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
558e1051a39Sopenharmony_ci	eor	@C[3],@C[3],@E[3]
559e1051a39Sopenharmony_ci	ldr.h	@E[1],[sp,#$A[3][2]+4]
560e1051a39Sopenharmony_ci	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
561e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
562e1051a39Sopenharmony_ci	@ ror	@C[3],@C[3],#32-18
563e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$D[3]+4]
564e1051a39Sopenharmony_ci
565e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@C[4]
566e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@C[5]
567e1051a39Sopenharmony_ci	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
568e1051a39Sopenharmony_ci	ror	@C[5],@C[7],#32-5
569e1051a39Sopenharmony_ci
570e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[8]
571e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
572e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[9]
573e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$A[4][3]+4]
574e1051a39Sopenharmony_ci	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
575e1051a39Sopenharmony_ci	ror	@C[6],@E[1],#32-8
576e1051a39Sopenharmony_ci
577e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[8]
578e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[9]
579e1051a39Sopenharmony_ci	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
580e1051a39Sopenharmony_ci	ror	@C[9],@E[3],#32-28
581e1051a39Sopenharmony_ci
582e1051a39Sopenharmony_ci	bic	@E[0],@C[4],@C[2],ror#32-18
583e1051a39Sopenharmony_ci	bic	@E[1],@C[5],@C[3],ror#32-18
584e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[0],ror#32-14
585e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[1],ror#32-13
586e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
587e1051a39Sopenharmony_ci	bic	@E[2],@C[6],@C[4]
588e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[3][0]+4]
589e1051a39Sopenharmony_ci	bic	@E[3],@C[7],@C[5]
590e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2],ror#32-18
591e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
592e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3],ror#32-18
593e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[3][1]+4]
594e1051a39Sopenharmony_ci	bic	@E[0],@C[8],@C[6]
595e1051a39Sopenharmony_ci	bic	@E[1],@C[9],@C[7]
596e1051a39Sopenharmony_ci	bic	@E[2],@C[0],@C[8],ror#14
597e1051a39Sopenharmony_ci	bic	@E[3],@C[1],@C[9],ror#13
598e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[4]
599e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[5]
600e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
601e1051a39Sopenharmony_ci	bic	@C[2],@C[2],@C[0],ror#18-14
602e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[3][2]+4]
603e1051a39Sopenharmony_ci	eor	@E[2],@C[6],@E[2],ror#32-14
604e1051a39Sopenharmony_ci	bic	@E[1],@C[3],@C[1],ror#18-13
605e1051a39Sopenharmony_ci	eor	@E[3],@C[7],@E[3],ror#32-13
606e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
607e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[3][3]+4]
608e1051a39Sopenharmony_ci	 add	@E[3],sp,#$D[2]
609e1051a39Sopenharmony_ci	 ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
610e1051a39Sopenharmony_ci	eor	@E[0],@C[8],@C[2],ror#32-18
611e1051a39Sopenharmony_ci	 ldr.h	@C[1],[sp,#$A[0][2]+4]
612e1051a39Sopenharmony_ci	eor	@E[1],@C[9],@E[1],ror#32-18
613e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
614e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[3][4]+4]
615e1051a39Sopenharmony_ci
616e1051a39Sopenharmony_ci	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
617e1051a39Sopenharmony_ci	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
618e1051a39Sopenharmony_ci	ldr.h	@C[3],[sp,#$A[1][3]+4]
619e1051a39Sopenharmony_ci	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
620e1051a39Sopenharmony_ci	ldr.h	@C[7],[sp,#$D[4]+4]
621e1051a39Sopenharmony_ci
622e1051a39Sopenharmony_ci	eor	@C[0],@C[0],@E[0]
623e1051a39Sopenharmony_ci	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
624e1051a39Sopenharmony_ci	eor	@C[1],@C[1],@E[1]
625e1051a39Sopenharmony_ci	ldr.h	@C[5],[sp,#$A[2][4]+4]
626e1051a39Sopenharmony_ci	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
627e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
628e1051a39Sopenharmony_ci	@ ror	@C[1],@C[1],#32-31
629e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$D[0]+4]
630e1051a39Sopenharmony_ci
631e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2]
632e1051a39Sopenharmony_ci	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
633e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3]
634e1051a39Sopenharmony_ci	ldr.h	@E[1],[sp,#$A[3][0]+4]
635e1051a39Sopenharmony_ci	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
636e1051a39Sopenharmony_ci	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
637e1051a39Sopenharmony_ci	ror	@C[2],@E[3],#32-28
638e1051a39Sopenharmony_ci	ldr.h	@E[3],[sp,#$D[1]+4]
639e1051a39Sopenharmony_ci
640e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@C[4]
641e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@C[5]
642e1051a39Sopenharmony_ci	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
643e1051a39Sopenharmony_ci	ror	@C[4],@C[7],#32-20
644e1051a39Sopenharmony_ci
645e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[8]
646e1051a39Sopenharmony_ci	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
647e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[9]
648e1051a39Sopenharmony_ci	ldr.h	@C[9],[sp,#$A[4][1]+4]
649e1051a39Sopenharmony_ci	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
650e1051a39Sopenharmony_ci	ror	@C[6],@E[1],#32-21
651e1051a39Sopenharmony_ci
652e1051a39Sopenharmony_ci	eor	@C[8],@C[8],@E[2]
653e1051a39Sopenharmony_ci	eor	@C[9],@C[9],@E[3]
654e1051a39Sopenharmony_ci	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
655e1051a39Sopenharmony_ci	@ ror	@C[9],@C[3],#32-1
656e1051a39Sopenharmony_ci
657e1051a39Sopenharmony_ci	bic	@E[0],@C[4],@C[2]
658e1051a39Sopenharmony_ci	bic	@E[1],@C[5],@C[3]
659e1051a39Sopenharmony_ci	eor	@E[0],@E[0],@C[0],ror#32-31
660e1051a39Sopenharmony_ci	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
661e1051a39Sopenharmony_ci	eor	@E[1],@E[1],@C[1],ror#32-31
662e1051a39Sopenharmony_ci	str.h	@E[1],[sp,#$R[4][0]+4]
663e1051a39Sopenharmony_ci	bic	@E[2],@C[6],@C[4]
664e1051a39Sopenharmony_ci	bic	@E[3],@C[7],@C[5]
665e1051a39Sopenharmony_ci	eor	@E[2],@E[2],@C[2]
666e1051a39Sopenharmony_ci	eor	@E[3],@E[3],@C[3]
667e1051a39Sopenharmony_ci	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
668e1051a39Sopenharmony_ci	bic	@E[0],@C[8],@C[6],ror#1
669e1051a39Sopenharmony_ci	str.h	@E[3],[sp,#$R[4][1]+4]
670e1051a39Sopenharmony_ci	bic	@E[1],@C[9],@C[7],ror#1
671e1051a39Sopenharmony_ci	bic	@E[2],@C[0],@C[8],ror#31-1
672e1051a39Sopenharmony_ci	bic	@E[3],@C[1],@C[9],ror#31-1
673e1051a39Sopenharmony_ci	eor	@C[4],@C[4],@E[0],ror#32-1
674e1051a39Sopenharmony_ci	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
675e1051a39Sopenharmony_ci	eor	@C[5],@C[5],@E[1],ror#32-1
676e1051a39Sopenharmony_ci	str.h	@C[5],[sp,#$R[4][2]+4]
677e1051a39Sopenharmony_ci	eor	@C[6],@C[6],@E[2],ror#32-31
678e1051a39Sopenharmony_ci	eor	@C[7],@C[7],@E[3],ror#32-31
679e1051a39Sopenharmony_ci	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
680e1051a39Sopenharmony_ci	bic	@E[0],@C[2],@C[0],ror#32-31
681e1051a39Sopenharmony_ci	str.h	@C[7],[sp,#$R[4][3]+4]
682e1051a39Sopenharmony_ci	bic	@E[1],@C[3],@C[1],ror#32-31
683e1051a39Sopenharmony_ci	 add	@E[2],sp,#$R[0][0]
684e1051a39Sopenharmony_ci	eor	@C[8],@E[0],@C[8],ror#32-1
685e1051a39Sopenharmony_ci	 add	@E[0],sp,#$R[1][0]
686e1051a39Sopenharmony_ci	eor	@C[9],@E[1],@C[9],ror#32-1
687e1051a39Sopenharmony_ci	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
688e1051a39Sopenharmony_ci	str.h	@C[9],[sp,#$R[4][4]+4]
689e1051a39Sopenharmony_ci___
690e1051a39Sopenharmony_ci}
# Emit two Keccak rounds per pass of the .Lround2x loop.  The code that
# Round() generates loads lanes through its first matrix of offsets and
# stores through its second, so the first call goes A -> T and the second
# T -> A, leaving the result back in the A copy of the state.
	Round(@A,@T);
	Round(@T,@A);

# The heredoc below emits two things.
#
# 1. The tail of KeccakF1600_int: branch back to .Lround2x while the flags
#    left by the round code read "lower" (the compare itself is emitted by
#    code outside this part of the file -- presumably the round-constant
#    pointer check), then return through the address kept at [sp,#440].
#    Pre-ARMv5 builds test bit 0 of that address so a Thumb caller is
#    returned to with bx instead of a plain move to pc.
#
# 2. KeccakF1600, the stand-alone permutation entry point.
#      In:  r0 - pointer to the 25-lane (200-byte) state, updated in place.
#    The prologue pushes r0,r4-r11,lr and reserves 440+16 bytes.  The state
#    is copied to the bottom of that frame in five bursts of ten words, and
#    KeccakF1600_enter is called with @E[2] = sp+A[0][0] and
#    @E[0] = sp+A[1][0].  Afterwards the saved r0 is fetched from
#    [sp,#440+16] (the first word above the frame) and the stack copy is
#    written back through it.  The copy-back depends on @E[0] still
#    addressing row 1 of the stack copy when the rounds return, which is
#    what the closing "add @E[0],sp,#R[1][0]" of Round(@T,@A) leaves behind.
#    The epilogue adds 440+20 so that the saved r0 is dropped before
#    r4-r11 and the return address are popped.
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# SHA3_absorb
#
# In:	r0 - uint64_t *A, the Keccak state; lanes are kept bit-interleaved
#	r1 - const void *inp
#	r2 - size_t len
#	r3 - size_t bs, block size in bytes
# Out:	r0 - number of input bytes left unprocessed (always less than bs)
#
# If len is already below bs the routine returns len at once and leaves
# the state untouched.  Otherwise the state is copied onto the stack and,
# for every whole block, each group of eight input bytes is assembled
# little-endian into a lo/hi word pair, bit-interleaved with the four mask
# constants and XOR-ed into the next lane, after which KeccakF1600_int
# permutes the stack copy.  When fewer than bs bytes remain the stack copy
# is written back through the saved A pointer.
#
# The inner loop steps bs down by 8 per lane, so bs is assumed to be a
# non-zero multiple of 8 -- to be confirmed against the callers.
#
# Working registers while absorbing: r10 = lane pointer, r11 = inp,
# r12 = len, r14 = remaining block bytes; r6-r9 hold the mask constants.
# All of them are reloaded from the frame after each permutation because
# KeccakF1600_int does not preserve them.
#
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....
#
# The four words from +472 up are simply r0-r3 as pushed by the prologue;
# the inp and len slots are rewritten on every block so that the eight-word
# reload from +456 brings back up-to-date values.  From +488 up are the
# saved r4-r12 and the return address.

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
___
}
########################################################################
# SHA3_squeeze
#
# In:	r0 - pointer to the Keccak state (lanes kept bit-interleaved)
#	r1 - output buffer
#	r2 - number of bytes to produce
#	r3 - block size in bytes
#
# Each pass of .Loop_squeeze loads one lane, undoes the bit interleaving
# with the four mask constants and stores the resulting eight bytes
# least-significant first.  Once a whole block has been emitted and more
# output is wanted, KeccakF1600 is called on the original state pointer
# and extraction restarts from the first lane.  A final run of one to
# seven bytes is handled by .Lsqueeze_tail.
#
# A request for zero bytes returns immediately.  Without the explicit
# check, len == 0 would take the "below 8" branch into .Lsqueeze_tail,
# which stores a byte before it first tests the remaining length and then
# counts down past zero, writing seven bytes nobody asked for.
#
# Frame after the two pushes (low to high): the constants r6-r9, then the
# caller's r0 (state pointer) and r3 (block size), then r4-r10 and the
# return address.  "ldmia sp,{r6-r10,r12}" therefore reloads the constants
# together with the state pointer and block size in one go, and the
# epilogue drops those six words (24 bytes) before popping.
{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	cmp	$len,#0			@ nothing requested, store nothing
	beq	.Lsqueeze_done

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
1092e1051a39Sopenharmony_ci
1093e1051a39Sopenharmony_ci$code.=<<___;
1094e1051a39Sopenharmony_ci#if __ARM_MAX_ARCH__>=7
1095e1051a39Sopenharmony_ci.fpu	neon
1096e1051a39Sopenharmony_ci
1097e1051a39Sopenharmony_ci.type	iotas64, %object
1098e1051a39Sopenharmony_ci.align 5
1099e1051a39Sopenharmony_ciiotas64:
1100e1051a39Sopenharmony_ci	.quad	0x0000000000000001
1101e1051a39Sopenharmony_ci	.quad	0x0000000000008082
1102e1051a39Sopenharmony_ci	.quad	0x800000000000808a
1103e1051a39Sopenharmony_ci	.quad	0x8000000080008000
1104e1051a39Sopenharmony_ci	.quad	0x000000000000808b
1105e1051a39Sopenharmony_ci	.quad	0x0000000080000001
1106e1051a39Sopenharmony_ci	.quad	0x8000000080008081
1107e1051a39Sopenharmony_ci	.quad	0x8000000000008009
1108e1051a39Sopenharmony_ci	.quad	0x000000000000008a
1109e1051a39Sopenharmony_ci	.quad	0x0000000000000088
1110e1051a39Sopenharmony_ci	.quad	0x0000000080008009
1111e1051a39Sopenharmony_ci	.quad	0x000000008000000a
1112e1051a39Sopenharmony_ci	.quad	0x000000008000808b
1113e1051a39Sopenharmony_ci	.quad	0x800000000000008b
1114e1051a39Sopenharmony_ci	.quad	0x8000000000008089
1115e1051a39Sopenharmony_ci	.quad	0x8000000000008003
1116e1051a39Sopenharmony_ci	.quad	0x8000000000008002
1117e1051a39Sopenharmony_ci	.quad	0x8000000000000080
1118e1051a39Sopenharmony_ci	.quad	0x000000000000800a
1119e1051a39Sopenharmony_ci	.quad	0x800000008000000a
1120e1051a39Sopenharmony_ci	.quad	0x8000000080008081
1121e1051a39Sopenharmony_ci	.quad	0x8000000000008080
1122e1051a39Sopenharmony_ci	.quad	0x0000000080000001
1123e1051a39Sopenharmony_ci	.quad	0x8000000080008008
1124e1051a39Sopenharmony_ci.size	iotas64,.-iotas64
1125e1051a39Sopenharmony_ci
1126e1051a39Sopenharmony_ci.type	KeccakF1600_neon, %function
1127e1051a39Sopenharmony_ci.align	5
1128e1051a39Sopenharmony_ciKeccakF1600_neon:
1129e1051a39Sopenharmony_ci	add	r1, r0, #16
1130e1051a39Sopenharmony_ci	adr	r2, iotas64
1131e1051a39Sopenharmony_ci	mov	r3, #24			@ loop counter
1132e1051a39Sopenharmony_ci	b	.Loop_neon
1133e1051a39Sopenharmony_ci
1134e1051a39Sopenharmony_ci.align	4
1135e1051a39Sopenharmony_ci.Loop_neon:
1136e1051a39Sopenharmony_ci	@ Theta
1137e1051a39Sopenharmony_ci	vst1.64		{q4},  [r0,:64]		@ offload A[0..1][4]
1138e1051a39Sopenharmony_ci	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
1139e1051a39Sopenharmony_ci	vst1.64		{d18}, [r1,:64]		@ offload A[2][4]
1140e1051a39Sopenharmony_ci	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
1141e1051a39Sopenharmony_ci	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
1142e1051a39Sopenharmony_ci	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1143e1051a39Sopenharmony_ci	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1144e1051a39Sopenharmony_ci	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
1145e1051a39Sopenharmony_ci	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
1146e1051a39Sopenharmony_ci	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1147e1051a39Sopenharmony_ci	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1148e1051a39Sopenharmony_ci	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1149e1051a39Sopenharmony_ci	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
1150e1051a39Sopenharmony_ci	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
1151e1051a39Sopenharmony_ci	veor		d25, d25, d24		@ C[4]^=A[4][4]
1152e1051a39Sopenharmony_ci
1153e1051a39Sopenharmony_ci	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
1154e1051a39Sopenharmony_ci	vadd.u64	q15, q14, q14		@ C[2..3]<<1
1155e1051a39Sopenharmony_ci	vadd.u64	d18, d25, d25		@ C[4]<<1
1156e1051a39Sopenharmony_ci	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
1157e1051a39Sopenharmony_ci	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
1158e1051a39Sopenharmony_ci	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
1159e1051a39Sopenharmony_ci	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
1160e1051a39Sopenharmony_ci	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1161e1051a39Sopenharmony_ci	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
1162e1051a39Sopenharmony_ci	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)
1163e1051a39Sopenharmony_ci
1164e1051a39Sopenharmony_ci	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
1165e1051a39Sopenharmony_ci	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
1166e1051a39Sopenharmony_ci	veor		d10, d10, d25		@ A[2][0] ^= C[4]
1167e1051a39Sopenharmony_ci	veor		d11, d11, d25		@ A[3][0] ^= C[4]
1168e1051a39Sopenharmony_ci	veor		d20, d20, d25		@ A[4][0] ^= C[4]
1169e1051a39Sopenharmony_ci
1170e1051a39Sopenharmony_ci	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
1171e1051a39Sopenharmony_ci	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
1172e1051a39Sopenharmony_ci	veor		d12, d12, d26		@ A[2][1] ^= D[1]
1173e1051a39Sopenharmony_ci	veor		d13, d13, d26		@ A[3][1] ^= D[1]
1174e1051a39Sopenharmony_ci	veor		d21, d21, d26		@ A[4][1] ^= D[1]
1175e1051a39Sopenharmony_ci	vmov		d26, d27
1176e1051a39Sopenharmony_ci
1177e1051a39Sopenharmony_ci	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
1178e1051a39Sopenharmony_ci	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
1179e1051a39Sopenharmony_ci	veor		d16, d16, d28		@ A[2][3] ^= C[2]
1180e1051a39Sopenharmony_ci	veor		d17, d17, d28		@ A[3][3] ^= C[2]
1181e1051a39Sopenharmony_ci	veor		d23, d23, d28		@ A[4][3] ^= C[2]
1182e1051a39Sopenharmony_ci	vld1.64		{q4},  [r0,:64]		@ restore A[0..1][4]
1183e1051a39Sopenharmony_ci	vmov		d28, d29
1184e1051a39Sopenharmony_ci
1185e1051a39Sopenharmony_ci	vld1.64		{d18}, [r1,:64]		@ restore A[2][4]
1186e1051a39Sopenharmony_ci	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
1187e1051a39Sopenharmony_ci	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
1188e1051a39Sopenharmony_ci	veor		d22, d22, d27		@ A[4][2]    ^= D[2]
1189e1051a39Sopenharmony_ci
1190e1051a39Sopenharmony_ci	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
1191e1051a39Sopenharmony_ci	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
1192e1051a39Sopenharmony_ci	veor		d24, d24, d29		@ A[4][4]    ^= C[3]
1193e1051a39Sopenharmony_ci
1194e1051a39Sopenharmony_ci	@ Rho + Pi
1195e1051a39Sopenharmony_ci	vmov		d26, d2			@ C[1] = A[0][1]
1196e1051a39Sopenharmony_ci	vshl.u64	d2,  d3,  #44
1197e1051a39Sopenharmony_ci	vmov		d27, d4			@ C[2] = A[0][2]
1198e1051a39Sopenharmony_ci	vshl.u64	d4,  d14, #43
1199e1051a39Sopenharmony_ci	vmov		d28, d6			@ C[3] = A[0][3]
1200e1051a39Sopenharmony_ci	vshl.u64	d6,  d17, #21
1201e1051a39Sopenharmony_ci	vmov		d29, d8			@ C[4] = A[0][4]
1202e1051a39Sopenharmony_ci	vshl.u64	d8,  d24, #14
1203e1051a39Sopenharmony_ci	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1204e1051a39Sopenharmony_ci	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1205e1051a39Sopenharmony_ci	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1206e1051a39Sopenharmony_ci	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1207e1051a39Sopenharmony_ci
1208e1051a39Sopenharmony_ci	vshl.u64	d3,  d9,  #20
1209e1051a39Sopenharmony_ci	vshl.u64	d14, d16, #25
1210e1051a39Sopenharmony_ci	vshl.u64	d17, d15, #15
1211e1051a39Sopenharmony_ci	vshl.u64	d24, d21, #2
1212e1051a39Sopenharmony_ci	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1213e1051a39Sopenharmony_ci	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1214e1051a39Sopenharmony_ci	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1215e1051a39Sopenharmony_ci	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1216e1051a39Sopenharmony_ci
1217e1051a39Sopenharmony_ci	vshl.u64	d9,  d22, #61
1218e1051a39Sopenharmony_ci	@ vshl.u64	d16, d19, #8
1219e1051a39Sopenharmony_ci	vshl.u64	d15, d12, #10
1220e1051a39Sopenharmony_ci	vshl.u64	d21, d7,  #55
1221e1051a39Sopenharmony_ci	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1222e1051a39Sopenharmony_ci	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1223e1051a39Sopenharmony_ci	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1224e1051a39Sopenharmony_ci	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1225e1051a39Sopenharmony_ci
1226e1051a39Sopenharmony_ci	vshl.u64	d22, d18, #39
1227e1051a39Sopenharmony_ci	@ vshl.u64	d19, d23, #56
1228e1051a39Sopenharmony_ci	vshl.u64	d12, d5,  #6
1229e1051a39Sopenharmony_ci	vshl.u64	d7,  d13, #45
1230e1051a39Sopenharmony_ci	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1231e1051a39Sopenharmony_ci	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1232e1051a39Sopenharmony_ci	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1233e1051a39Sopenharmony_ci	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1234e1051a39Sopenharmony_ci
1235e1051a39Sopenharmony_ci	vshl.u64	d18, d20, #18
1236e1051a39Sopenharmony_ci	vshl.u64	d23, d11, #41
1237e1051a39Sopenharmony_ci	vshl.u64	d5,  d10, #3
1238e1051a39Sopenharmony_ci	vshl.u64	d13, d1,  #36
1239e1051a39Sopenharmony_ci	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1240e1051a39Sopenharmony_ci	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1241e1051a39Sopenharmony_ci	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1242e1051a39Sopenharmony_ci	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1243e1051a39Sopenharmony_ci
1244e1051a39Sopenharmony_ci	vshl.u64	d1,  d28, #28
1245e1051a39Sopenharmony_ci	vshl.u64	d10, d26, #1
1246e1051a39Sopenharmony_ci	vshl.u64	d11, d29, #27
1247e1051a39Sopenharmony_ci	vshl.u64	d20, d27, #62
1248e1051a39Sopenharmony_ci	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
1249e1051a39Sopenharmony_ci	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
1250e1051a39Sopenharmony_ci	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
1251e1051a39Sopenharmony_ci	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])
1252e1051a39Sopenharmony_ci
1253e1051a39Sopenharmony_ci	@ Chi + Iota
1254e1051a39Sopenharmony_ci	vbic		q13, q2,  q1
1255e1051a39Sopenharmony_ci	vbic		q14, q3,  q2
1256e1051a39Sopenharmony_ci	vbic		q15, q4,  q3
1257e1051a39Sopenharmony_ci	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1258e1051a39Sopenharmony_ci	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1259e1051a39Sopenharmony_ci	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1260e1051a39Sopenharmony_ci	vst1.64		{q13}, [r0,:64]		@ offload A[0..1][0]
1261e1051a39Sopenharmony_ci	vbic		q13, q0,  q4
1262e1051a39Sopenharmony_ci	vbic		q15, q1,  q0
1263e1051a39Sopenharmony_ci	vmov		q1,  q14		@ A[0..1][1]
1264e1051a39Sopenharmony_ci	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1265e1051a39Sopenharmony_ci	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1266e1051a39Sopenharmony_ci
1267e1051a39Sopenharmony_ci	vbic		q13, q7,  q6
1268e1051a39Sopenharmony_ci	vmov		q0,  q5			@ A[2..3][0]
1269e1051a39Sopenharmony_ci	vbic		q14, q8,  q7
1270e1051a39Sopenharmony_ci	vmov		q15, q6			@ A[2..3][1]
1271e1051a39Sopenharmony_ci	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1272e1051a39Sopenharmony_ci	vbic		q13, q9,  q8
1273e1051a39Sopenharmony_ci	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1274e1051a39Sopenharmony_ci	vbic		q14, q0,  q9
1275e1051a39Sopenharmony_ci	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1276e1051a39Sopenharmony_ci	vbic		q13, q15, q0
1277e1051a39Sopenharmony_ci	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1278e1051a39Sopenharmony_ci	vmov		q14, q10		@ A[4][0..1]
1279e1051a39Sopenharmony_ci	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1280e1051a39Sopenharmony_ci
1281e1051a39Sopenharmony_ci	vld1.64		d25, [r2,:64]!		@ Iota[i++]
1282e1051a39Sopenharmony_ci	vbic		d26, d22, d21
1283e1051a39Sopenharmony_ci	vbic		d27, d23, d22
1284e1051a39Sopenharmony_ci	vld1.64		{q0}, [r0,:64]		@ restore A[0..1][0]
1285e1051a39Sopenharmony_ci	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
1286e1051a39Sopenharmony_ci	vbic		d26, d24, d23
1287e1051a39Sopenharmony_ci	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
1288e1051a39Sopenharmony_ci	vbic		d27, d28, d24
1289e1051a39Sopenharmony_ci	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
1290e1051a39Sopenharmony_ci	vbic		d26, d29, d28
1291e1051a39Sopenharmony_ci	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
1292e1051a39Sopenharmony_ci	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
1293e1051a39Sopenharmony_ci	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])
1294e1051a39Sopenharmony_ci
1295e1051a39Sopenharmony_ci	subs	r3, r3, #1
1296e1051a39Sopenharmony_ci	bne	.Loop_neon
1297e1051a39Sopenharmony_ci
1298e1051a39Sopenharmony_ci	ret
1299e1051a39Sopenharmony_ci.size	KeccakF1600_neon,.-KeccakF1600_neon
1300e1051a39Sopenharmony_ci
@ SHA3_absorb_neon -- XOR whole input blocks into the Keccak state and run
@ the permutation after each block.
@
@ Register arguments, as labelled by the moves below:
@   r0 = pointer to the state: 25 lanes of 64 bits, stored row by row,
@        8-byte aligned (every access carries the :64 qualifier)
@   r1 = inp, r2 = len, r3 = bsz
@ Result in r0: the number of input bytes left unabsorbed (less than bsz).
@
@ NOTE(review): the lane ladder below only gives exact results when bsz is
@ a non-zero multiple of 8 and at most 200 -- confirm against the callers.
.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}		@ state lanes live in d8-d15 too

	@ Keep the arguments in r4-r6: the visible part of KeccakF1600_neon
	@ uses r1-r3 itself (scratch address, iota pointer, round counter).
	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	@ Load the whole state once.  Rows 0 and 1 share q0-q4 (row 0 in the
	@ even d registers, row 1 in the odd ones), rows 2 and 3 share q5-q9
	@ the same way, row 4 sits in d20-d24.  The copy in memory is only
	@ refreshed at .Labsorbed_neon.
	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
	vld1.32	{d18}, [r0,:64]!		@ A[2][4]

	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
	vld1.32	{d19}, [r0,:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon		@ less than one block left: finish
	mov	r5, r12			@ commit len -= bsz

	@ Absorb one block: state lane i ^= i-th 8 input bytes, for i < bsz/8.
	@ Each cmp against an even lane count 2k is shared by two exits: blo
	@ leaves after 2k-1 lanes, beq after 2k lanes.  The NEON vld1 and veor
	@ in between do not modify the integer condition flags.
	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!		@ bsz above 8*24: take the last lane
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon	@ permute the state held in d0-d24
	b 	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	@ Write the register-resident state back in the order it was loaded.
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vst1.32	{d24}, [r0,:64]		@ A[4][4]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon
1475e1051a39Sopenharmony_ci
@ SHA3_squeeze_neon -- copy len bytes of output out of the Keccak state,
@ running the permutation each time bsz bytes have been handed out and
@ more output is still wanted.
@
@ Register arguments, as labelled by the moves below:
@   r0 = state pointer (A_flat), 8-byte aligned
@   r1 = out, r2 = len, r3 = bsz
@
@ The state stays in memory here; it is loaded into d0-d24 only around
@ each KeccakF1600_neon call.  A zero len leaves immediately without
@ touching out.
.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	cmp	r5, #0			@ nothing requested?
	beq	.Lsqueeze_neon_done	@ the tail would store one byte
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	@ r12 = read cursor inside the state, r14 = bytes left in this block
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail	@ fewer than 8 bytes wanted
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	@ Block exhausted but more output is wanted: pull the state into
	@ registers, permute it, and write it back.
	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]		@ A[4][4]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	@ 1..7 bytes left.  Fetch the next lane as two words (r2 from the
	@ lower address, r3 from the next one) and store it a byte at a time,
	@ low byte of each word first.  As in the absorb ladder, one cmp
	@ feeds both the blo and the beq that follow it; the shifts in
	@ between are the non-flag-setting form.
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
1596e1051a39Sopenharmony_ci#endif
1597e1051a39Sopenharmony_ci.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1598e1051a39Sopenharmony_ci.align	2
1599e1051a39Sopenharmony_ci___
1600e1051a39Sopenharmony_ci
{
    # Per-mnemonic record of the most recent ".l" (low word) operand, so the
    # matching ".h" (high word) line can be fused into one doubleword
    # ldrd/strd for Thumb-2.  The parentheses matter: "my %ldr, %str;"
    # declares only %ldr and silently makes %str a package global.
    my (%ldr, %str);

    # ldrd($mnemonic, $half, $reg, $ea) -> replacement assembly text
    #
    #   $mnemonic  "ldr" selects the load-side record; any other value
    #              (in practice "str") selects the store-side record
    #   $half      "l" for the low word; any other value is handled as the
    #              high word
    #   $reg       register operand of this line
    #   $ea        effective-address operand of this line
    #
    # Low word:  remembers ($reg, $ea) and emits the single-word access
    #            only when __thumb2__ is not defined.
    # High word: emits the single-word access when __thumb2__ is not
    #            defined, otherwise one "<mnemonic>d lo,hi,lo_ea" built
    #            from the remembered low-word operands.
    # The caller must therefore feed the ".l" line of a pair before its
    # ".h" line.
    sub ldrd {
	my ($mnemonic,$half,$reg,$ea) = @_;
	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

	if ($half eq "l") {
	    $$op{reg} = $reg;
	    $$op{ea}  = $ea;
	    return sprintf "#ifndef	__thumb2__\n"	.
			   "	%s\t%s,%s\n"		.
			   "#endif", $mnemonic,$reg,$ea;
	}

	return sprintf "#ifndef	__thumb2__\n"	.
		       "	%s\t%s,%s\n"		.
		       "#else\n"			.
		       "	%sd\t%s,%s,%s\n"	.
		       "#endif",	$mnemonic,$reg,$ea,
				$mnemonic,$$op{reg},$reg,$$op{ea};
    }
}
1624e1051a39Sopenharmony_ci
# Line rewrites, tried in this order; the first one that changes the line
# wins and the remaining ones are skipped for that line.  Each closure works
# on the current line in $_ and yields the substitution count.
my @asm_fixups = (
    # ldr.l/ldr.h and str.l/str.h pairs are handed to ldrd()
    sub { s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge },
    # stand-alone rotate/shift becomes mov with a shifted operand
    sub { s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g },
    # "ret" is spelled out as a branch through lr
    sub { s/\bret\b/bx	lr/g },
    # make it possible to compile with -march=armv4
    sub { s/\bbx\s+lr\b/.word\t0xe12fff1e/g },
);

# Emit the accumulated assembly one line at a time.
for (split($/,$code)) {
	# text between backticks is evaluated as Perl and replaced by its value
	s/\`([^\`]*)\`/eval $1/ge;

	foreach my $fixup (@asm_fixups) {
	    last if $fixup->();
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1637