1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# December 2015
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# ChaCha20 for s390x.
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# 3 times faster than compiler-generated code.
22e1051a39Sopenharmony_ci
23e1051a39Sopenharmony_ci#
24e1051a39Sopenharmony_ci# August 2018
25e1051a39Sopenharmony_ci#
26e1051a39Sopenharmony_ci# Add vx code path: 4x"vertical".
27e1051a39Sopenharmony_ci#
28e1051a39Sopenharmony_ci# Copyright IBM Corp. 2018
29e1051a39Sopenharmony_ci# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
30e1051a39Sopenharmony_ci
31e1051a39Sopenharmony_ci#
32e1051a39Sopenharmony_ci# February 2019
33e1051a39Sopenharmony_ci#
34e1051a39Sopenharmony_ci# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
35e1051a39Sopenharmony_ci# 4x"vertical" submission [on z13] and >3 times faster than scalar code.
36e1051a39Sopenharmony_ci# But to keep overheads in check, fall back to a transliteration of the
37e1051a39Sopenharmony_ci# VSX code path from the chacha-ppc module, which is also 4x"vertical",
38e1051a39Sopenharmony_ci# to handle inputs not longer than 256 bytes.
39e1051a39Sopenharmony_ci
40e1051a39Sopenharmony_ciuse strict;
41e1051a39Sopenharmony_ciuse FindBin qw($Bin);
42e1051a39Sopenharmony_ciuse lib "$Bin/../..";
43e1051a39Sopenharmony_ciuse perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);
44e1051a39Sopenharmony_ci
# Command line: [flavour] [output]
#   - the output file, when present, is the trailing argument and is
#     recognised by its extension;
#   - the flavour, when present, is the leading argument and is recognised
#     by not containing a dot.
# Either one is left undefined when it is not supplied.
my $output;
$output = pop @ARGV if (@ARGV && $ARGV[-1] =~ m|\.\w+$|);
my $flavour;
$flavour = shift @ARGV if (@ARGV && $ARGV[0] !~ m|\.|);

# A flavour containing "31" or "32" selects the S/390 ABI ($z=0, 4-byte
# size_t); anything else, including no flavour at all, selects the zSeries
# ABI ($z=1, 8-byte size_t).
my $z = ($flavour =~ /3[12]/) ? 0 : 1;
my $SIZE_T = $z ? 8 : 4;

# Stack pointer register and size of the standard stack frame:
# sixteen size_t-wide slots followed by four 8-byte slots.
my $sp = "%r15";
my $stdframe = 16*$SIZE_T + 4*8;
61e1051a39Sopenharmony_ci
sub ROUND {
	# Emit one scalar ChaCha round: four quarter-rounds, issued as two
	# interleaved pairs (Q1+Q2, then Q3+Q4).
	#
	# Arguments: state-word indices (a, b, c, d) of the first quarter-round.
	# The indices of the remaining three are derived by stepping each index
	# to the next position inside its group of four:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd' words live in registers (@reg[0..7,12..15]); the 'c'
	# words live on the stack and only one pair at a time is held in
	# $xc/$xc_. The pair needed by Q1/Q2 is expected to be loaded already on
	# entry, and the pair left loaded after Q3/Q4 is the one the following
	# round starts with, so the only store/reload happens in the middle.
	my @reg = map("%r$_",(0..7,"x","x","x","x",(10..13)));
	my ($xc,$xc_) = map("%r$_",(8,9));

	my $advance = sub { my ($i) = @_; return ($i&~3)+(($i+1)&3); };
	my @quarter = ([ @_[0..3] ]);
	foreach my $n (1..3) {
		push @quarter, [ map { $advance->($_) } @{$quarter[$n-1]} ];
	}
	my ($c0,$c2) = ($quarter[0][2],$quarter[2][2]);

	foreach my $half (0,1) {
		my ($first,$second) = @quarter[2*$half,2*$half+1];
		my ($a0,$b0,$d0) = @reg[@{$first}[0,1,3]];
		my ($a1,$b1,$d1) = @reg[@{$second}[0,1,3]];

		if ($half) {
			# swap the resident pair of 'c's: park the one used by
			# Q1/Q2 and fetch the one needed by Q3/Q4
			stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");
			lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
		}

		# rotation amounts: (16,12) for the first half of a
		# quarter-round, (8,7) for the second half
		foreach my $rot ([16,12],[8,7]) {
			alr	($a0,$b0);
			alr	($a1,$b1);
			xr	($d0,$a0);
			xr	($d1,$a1);
			rll	($d0,$d0,$rot->[0]);
			rll	($d1,$d1,$rot->[0]);

			alr	($xc,$d0);
			alr	($xc_,$d1);
			xr	($b0,$xc);
			xr	($b1,$xc_);
			rll	($b0,$b0,$rot->[1]);
			rll	($b1,$b1,$rot->[1]);
		}
	}
}
152e1051a39Sopenharmony_ci
sub VX_lane_ROUND {
	# Emit one ChaCha round for the 4x"vertical" vector path, where state
	# word i is kept in vector register %v<i>.
	#
	# Arguments: state-word indices (a, b, c, d) of the first quarter-round;
	# the other three quarter-rounds use the indices obtained by stepping
	# each one to the next position inside its group of four.
	#
	# Every step (add, xor, rotate) is issued for all four quarter-rounds
	# before the next step starts.
	my @v = map("%v$_",(0..15));

	my $advance = sub { my ($i) = @_; return ($i&~3)+(($i+1)&3); };
	my @quarter = ([ @_[0..3] ]);
	foreach my $n (1..3) {
		push @quarter, [ map { $advance->($_) } @{$quarter[$n-1]} ];
	}

	# Each step is [sum, addend, mixed, rotation], the first three being
	# positions within an (a,b,c,d) tuple:
	#	sum += addend; mixed ^= sum; mixed <<<= rotation
	foreach my $step ([0,1,3,16],[2,3,1,12],[0,1,3,8],[2,3,1,7]) {
		my ($si,$ai,$mi,$rot) = @{$step};
		foreach my $q (@quarter) {
			my ($sum,$addend,$mixed) = @v[@{$q}[$si,$ai,$mi]];
			vaf	($sum,$sum,$addend);
			vx	($mixed,$mixed,$sum);
			verllf	($mixed,$mixed,$rot);
		}
	}
}
212e1051a39Sopenharmony_ci
sub VX_ROUND {
	# Emit one ChaCha round for six blocks processed side by side.
	#
	# Arguments (25 in total):
	#   [0..5]   registers holding the 'a' rows of the six blocks
	#   [6..11]  registers holding the 'b' rows
	#   [12..17] registers holding the 'c' rows
	#   [18..23] registers holding the 'd' rows
	#   [24]     true for an odd round, false for an even one
	#
	# Each instruction is issued for all six blocks before the next
	# instruction starts.
	my %row = (
		a => [ @_[0..5]   ],
		b => [ @_[6..11]  ],
		c => [ @_[12..17] ],
		d => [ @_[18..23] ],
	);
	my $odd = $_[24];

	# [sum, addend, mixed, rotation]:
	#	sum += addend; mixed ^= sum; mixed <<<= rotation
	foreach my $step (["a","b","d",16],["c","d","b",12],
			  ["a","b","d",8], ["c","d","b",7]) {
		my ($sum,$addend,$mixed) = @row{ @{$step}[0..2] };
		my $rot = $step->[3];

		foreach my $i (0..5) { vaf	($sum->[$i],$sum->[$i],$addend->[$i]); }
		foreach my $i (0..5) { vx	($mixed->[$i],$mixed->[$i],$sum->[$i]); }
		foreach my $i (0..5) { verllf	($mixed->[$i],$mixed->[$i],$rot); }
	}

	# Rotate the b, c and d rows within each register (byte shifts of
	# 4/8/12); the b and d amounts are swapped on odd rounds.
	my ($shift_b,$shift_d) = $odd ? (12,4) : (4,12);
	foreach my $i (0..5) { vsldb	($row{c}[$i],$row{c}[$i],$row{c}[$i],8); }
	foreach my $i (0..5) { vsldb	($row{b}[$i],$row{b}[$i],$row{b}[$i],$shift_b); }
	foreach my $i (0..5) { vsldb	($row{d}[$i],$row{d}[$i],$row{d}[$i],$shift_d); }
}
240e1051a39Sopenharmony_ci
# Start of generated output. PERLASM_BEGIN, INCLUDE and TEXT are provided by
# perlasm::s390x (imported at the top of the file).
# NOTE(review): presumably these direct the output to $output, emit an include
# of s390x_arch.h and switch to the text section -- confirm against that module.
241e1051a39Sopenharmony_ciPERLASM_BEGIN($output);
242e1051a39Sopenharmony_ci
243e1051a39Sopenharmony_ciINCLUDE	("s390x_arch.h");
244e1051a39Sopenharmony_ciTEXT	();
245e1051a39Sopenharmony_ci
246e1051a39Sopenharmony_ci################
247e1051a39Sopenharmony_ci# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
248e1051a39Sopenharmony_ci#                     const unsigned int key[8], const unsigned int counter[4])
#
# Entry point and scalar (general-purpose register) code path.
#
# The five arguments are mapped onto %r2..%r6 by the list assignment below.
# A zero length returns immediately. Lengths above 64 bytes are handed to
# .LChaCha20_ctr32_vx (defined outside this block) when the capability word
# tested below has the "vx bit" set; everything else is processed here, one
# 64-byte block per .Loop_outer iteration, with a byte-wise tail for a
# final partial block.
249e1051a39Sopenharmony_cimy ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
250e1051a39Sopenharmony_ci{
# Frame layout relative to the new stack pointer:
#   [0, $stdframe)                   standard frame
#   [$stdframe, $stdframe+4*16)      copy of the 16-word key schedule
#   [$stdframe+4*16, $stdframe+4*20) working copy of x[8]-x[11]
# Offsets of the form $frame+k*$SIZE_T therefore address k*$SIZE_T above the
# stack pointer the function was entered with.
# NOTE(review): those look like the caller-provided register save slots;
# confirm against the ABI.
251e1051a39Sopenharmony_cimy $frame=$stdframe+4*20;
# Register allocation for the 16 state words. x[8]-x[11] have no register of
# their own (the "x" placeholders); they live on the stack and are cycled
# through the @t pair (%r8,%r9) by ROUND.
252e1051a39Sopenharmony_cimy @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
253e1051a39Sopenharmony_cimy @t=map("%r$_",(8,9));
254e1051a39Sopenharmony_ci
255e1051a39Sopenharmony_ciGLOBL	("ChaCha20_ctr32");
256e1051a39Sopenharmony_ciTYPE	("ChaCha20_ctr32","\@function");
257e1051a39Sopenharmony_ciALIGN	(32);
258e1051a39Sopenharmony_ciLABEL	("ChaCha20_ctr32");
259e1051a39Sopenharmony_ci	larl	("%r1","OPENSSL_s390xcap_P");
260e1051a39Sopenharmony_ci
261e1051a39Sopenharmony_ci	lghi	("%r0",64);
262e1051a39Sopenharmony_ci&{$z?	\&ltgr:\&ltr}	($len,$len);		# len==0?
263e1051a39Sopenharmony_ci	bzr	("%r14");
	# fetch the capability doubleword at S390X_STFLE+16 for the vx test
264e1051a39Sopenharmony_ci	lg	("%r1","S390X_STFLE+16(%r1)");
	# inputs of at most 64 bytes always stay on the scalar path
265e1051a39Sopenharmony_ci&{$z?	\&clgr:\&clr}	($len,"%r0");
266e1051a39Sopenharmony_ci	jle	(".Lshort");
267e1051a39Sopenharmony_ci
268e1051a39Sopenharmony_ci	tmhh	("%r1",0x4000);			# check for vx bit
269e1051a39Sopenharmony_ci	jnz	(".LChaCha20_ctr32_vx");
270e1051a39Sopenharmony_ci
# Scalar path set-up: save %r6-%r15, turn $out into the out-inp difference
# and $len into "end of input minus 64", then allocate $frame bytes and
# store the previous stack pointer at the bottom of the new frame.
271e1051a39Sopenharmony_ciLABEL	(".Lshort");
272e1051a39Sopenharmony_ci&{$z?	\&aghi:\&ahi}	($len,-64);
273e1051a39Sopenharmony_ci&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
274e1051a39Sopenharmony_ci&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
275e1051a39Sopenharmony_ci&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
276e1051a39Sopenharmony_ci	la	($len,"0($inp,$len)");	# end of input minus 64
277e1051a39Sopenharmony_ci	larl	("%r7",".Lsigma");
278e1051a39Sopenharmony_ci	lgr	("%r0",$sp);
279e1051a39Sopenharmony_ci	la	($sp,"0(%r1,$sp)");
280e1051a39Sopenharmony_ci&{$z?	\&stg:\&st}	("%r0","0($sp)");
281e1051a39Sopenharmony_ci
	# Gather sigma (%r6-%r7), key (%r8-%r11) and counter (%r12-%r13) as
	# 64-bit quantities so that one stmg below lays down all 16 words.
282e1051a39Sopenharmony_ci	lmg	("%r8","%r11","0($key)");	# load key
283e1051a39Sopenharmony_ci	lmg	("%r12","%r13","0($counter)");	# load counter
284e1051a39Sopenharmony_ci	lmg	("%r6","%r7","0(%r7)");	# load sigma constant
285e1051a39Sopenharmony_ci
	# %r14 becomes the running input pointer; the out-inp difference and
	# the end marker are parked above the frame and reloaded as a pair
	# into @t after each block's rounds.
286e1051a39Sopenharmony_ci	la	("%r14","0($inp)");
287e1051a39Sopenharmony_ci&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
288e1051a39Sopenharmony_ci&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
289e1051a39Sopenharmony_ci	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
290e1051a39Sopenharmony_ci	srlg	(@x[12],"%r12",32);	# 32-bit counter value
291e1051a39Sopenharmony_ci	j	(".Loop_outer");
292e1051a39Sopenharmony_ci
# One iteration per 64-byte block. On entry @x[12] holds the block counter
# and %r14 the input pointer.
293e1051a39Sopenharmony_ciALIGN	(16);
294e1051a39Sopenharmony_ciLABEL	(".Loop_outer");
295e1051a39Sopenharmony_ci	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
296e1051a39Sopenharmony_ci	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
297e1051a39Sopenharmony_ci	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
298e1051a39Sopenharmony_ci	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
299e1051a39Sopenharmony_ci	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
300e1051a39Sopenharmony_ci	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
301e1051a39Sopenharmony_ci&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
	# %r14 doubles as the loop counter: 10 iterations of two rounds each
302e1051a39Sopenharmony_ci	lhi	("%r14",10);
303e1051a39Sopenharmony_ci	j	(".Loop");
304e1051a39Sopenharmony_ci
305e1051a39Sopenharmony_ciALIGN	(4);
306e1051a39Sopenharmony_ciLABEL	(".Loop");
	# even round followed by odd round (see the index table in ROUND)
307e1051a39Sopenharmony_ci	ROUND	(0, 4, 8,12);
308e1051a39Sopenharmony_ci	ROUND	(0, 5,10,15);
309e1051a39Sopenharmony_ci	brct	("%r14",".Loop");
310e1051a39Sopenharmony_ci
	# After the rounds @t still holds x[8]-x[9]; park them so that @t can
	# take the out-inp difference (@t[0]) and the end marker (@t[1]).
311e1051a39Sopenharmony_ci&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
312e1051a39Sopenharmony_ci	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
313e1051a39Sopenharmony_ci&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
314e1051a39Sopenharmony_ci
	# Add the key-schedule input words to the register-resident state
	# words and byte-reverse each result with lrvr.
315e1051a39Sopenharmony_ci	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
316e1051a39Sopenharmony_ci	al	(@x[1],"$stdframe+4*1($sp)");
317e1051a39Sopenharmony_ci	al	(@x[2],"$stdframe+4*2($sp)");
318e1051a39Sopenharmony_ci	al	(@x[3],"$stdframe+4*3($sp)");
319e1051a39Sopenharmony_ci	al	(@x[4],"$stdframe+4*4($sp)");
320e1051a39Sopenharmony_ci	al	(@x[5],"$stdframe+4*5($sp)");
321e1051a39Sopenharmony_ci	al	(@x[6],"$stdframe+4*6($sp)");
322e1051a39Sopenharmony_ci	al	(@x[7],"$stdframe+4*7($sp)");
323e1051a39Sopenharmony_ci	lrvr	(@x[0],@x[0]);
324e1051a39Sopenharmony_ci	lrvr	(@x[1],@x[1]);
325e1051a39Sopenharmony_ci	lrvr	(@x[2],@x[2]);
326e1051a39Sopenharmony_ci	lrvr	(@x[3],@x[3]);
327e1051a39Sopenharmony_ci	lrvr	(@x[4],@x[4]);
328e1051a39Sopenharmony_ci	lrvr	(@x[5],@x[5]);
329e1051a39Sopenharmony_ci	lrvr	(@x[6],@x[6]);
330e1051a39Sopenharmony_ci	lrvr	(@x[7],@x[7]);
331e1051a39Sopenharmony_ci	al	(@x[12],"$stdframe+4*12($sp)");
332e1051a39Sopenharmony_ci	al	(@x[13],"$stdframe+4*13($sp)");
333e1051a39Sopenharmony_ci	al	(@x[14],"$stdframe+4*14($sp)");
334e1051a39Sopenharmony_ci	al	(@x[15],"$stdframe+4*15($sp)");
335e1051a39Sopenharmony_ci	lrvr	(@x[12],@x[12]);
336e1051a39Sopenharmony_ci	lrvr	(@x[13],@x[13]);
337e1051a39Sopenharmony_ci	lrvr	(@x[14],@x[14]);
338e1051a39Sopenharmony_ci	lrvr	(@x[15],@x[15]);
339e1051a39Sopenharmony_ci
	# An input pointer beyond "end minus 64" means fewer than 64 bytes
	# remain: finish byte by byte in .Ltail.
340e1051a39Sopenharmony_ci	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
341e1051a39Sopenharmony_ci&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
342e1051a39Sopenharmony_ci	jh	(".Ltail");
343e1051a39Sopenharmony_ci
	# Full block. XOR/store of words 0-7 and 12-15 is interleaved with the
	# extra-indented instructions that bring x[8]-x[11] into @x[0..3] (free
	# once words 0-3 are stored), finish them, and XOR/store them last.
344e1051a39Sopenharmony_ci	x	(@x[0],"4*0(%r14)");	# xor with input
345e1051a39Sopenharmony_ci	x	(@x[1],"4*1(%r14)");
346e1051a39Sopenharmony_ci	st	(@x[0],"4*0(@t[0])");	# store output
347e1051a39Sopenharmony_ci	x	(@x[2],"4*2(%r14)");
348e1051a39Sopenharmony_ci	st	(@x[1],"4*1(@t[0])");
349e1051a39Sopenharmony_ci	x	(@x[3],"4*3(%r14)");
350e1051a39Sopenharmony_ci	st	(@x[2],"4*2(@t[0])");
351e1051a39Sopenharmony_ci	x	(@x[4],"4*4(%r14)");
352e1051a39Sopenharmony_ci	st	(@x[3],"4*3(@t[0])");
353e1051a39Sopenharmony_ci	 lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
354e1051a39Sopenharmony_ci	x	(@x[5],"4*5(%r14)");
355e1051a39Sopenharmony_ci	st	(@x[4],"4*4(@t[0])");
356e1051a39Sopenharmony_ci	x	(@x[6],"4*6(%r14)");
357e1051a39Sopenharmony_ci	 al	(@x[0],"$stdframe+4*8($sp)");
358e1051a39Sopenharmony_ci	st	(@x[5],"4*5(@t[0])");
359e1051a39Sopenharmony_ci	x	(@x[7],"4*7(%r14)");
360e1051a39Sopenharmony_ci	 al	(@x[1],"$stdframe+4*9($sp)");
361e1051a39Sopenharmony_ci	st	(@x[6],"4*6(@t[0])");
362e1051a39Sopenharmony_ci	x	(@x[12],"4*12(%r14)");
363e1051a39Sopenharmony_ci	 al	(@x[2],"$stdframe+4*10($sp)");
364e1051a39Sopenharmony_ci	st	(@x[7],"4*7(@t[0])");
365e1051a39Sopenharmony_ci	x	(@x[13],"4*13(%r14)");
366e1051a39Sopenharmony_ci	 al	(@x[3],"$stdframe+4*11($sp)");
367e1051a39Sopenharmony_ci	st	(@x[12],"4*12(@t[0])");
368e1051a39Sopenharmony_ci	x	(@x[14],"4*14(%r14)");
369e1051a39Sopenharmony_ci	st	(@x[13],"4*13(@t[0])");
370e1051a39Sopenharmony_ci	x	(@x[15],"4*15(%r14)");
371e1051a39Sopenharmony_ci	st	(@x[14],"4*14(@t[0])");
372e1051a39Sopenharmony_ci	 lrvr	(@x[0],@x[0]);
373e1051a39Sopenharmony_ci	st	(@x[15],"4*15(@t[0])");
374e1051a39Sopenharmony_ci	 lrvr	(@x[1],@x[1]);
375e1051a39Sopenharmony_ci	 lrvr	(@x[2],@x[2]);
376e1051a39Sopenharmony_ci	 lrvr	(@x[3],@x[3]);
	# next block's counter = saved counter + 1, left in @x[12] for
	# .Loop_outer to store
377e1051a39Sopenharmony_ci	lhi	(@x[12],1);
378e1051a39Sopenharmony_ci	 x	(@x[0],"4*8(%r14)");
379e1051a39Sopenharmony_ci	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
380e1051a39Sopenharmony_ci	 x	(@x[1],"4*9(%r14)");
381e1051a39Sopenharmony_ci	 st	(@x[0],"4*8(@t[0])");
382e1051a39Sopenharmony_ci	 x	(@x[2],"4*10(%r14)");
383e1051a39Sopenharmony_ci	 st	(@x[1],"4*9(@t[0])");
384e1051a39Sopenharmony_ci	 x	(@x[3],"4*11(%r14)");
385e1051a39Sopenharmony_ci	 st	(@x[2],"4*10(@t[0])");
386e1051a39Sopenharmony_ci	 st	(@x[3],"4*11(@t[0])");
387e1051a39Sopenharmony_ci
	# The compare is issued before the pointer is advanced, and the branch
	# consumes its result afterwards (la is relied on not to disturb the
	# condition code). Loop while this block started below "end minus 64".
388e1051a39Sopenharmony_ci&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
389e1051a39Sopenharmony_ci	la	("%r14","64(%r14)");
390e1051a39Sopenharmony_ci	jl	(".Loop_outer");
391e1051a39Sopenharmony_ci
# Common exit: overwrite stack words 4..19 (key, counter/nonce and the
# x[8]-x[11] scratch) with zeros, restore %r6-%r15 and return.
392e1051a39Sopenharmony_ciLABEL	(".Ldone");
393e1051a39Sopenharmony_ci	xgr	("%r0","%r0");
394e1051a39Sopenharmony_ci	xgr	("%r1","%r1");
395e1051a39Sopenharmony_ci	xgr	("%r2","%r2");
396e1051a39Sopenharmony_ci	xgr	("%r3","%r3");
397e1051a39Sopenharmony_ci	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
398e1051a39Sopenharmony_ci	stmg	("%r0","%r3","$stdframe+4*12($sp)");
399e1051a39Sopenharmony_ci
400e1051a39Sopenharmony_ci&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
401e1051a39Sopenharmony_ci	br	("%r14");
402e1051a39Sopenharmony_ci
# Partial final block. @t[1] becomes the number of bytes left
# ((end minus 64) + 64 - input pointer). The 64 keystream bytes are written
# over the key-schedule copy on the stack (the x[8]-x[11] words are finished
# first), then XORed into the output one byte at a time.
403e1051a39Sopenharmony_ciALIGN	(16);
404e1051a39Sopenharmony_ciLABEL	(".Ltail");
405e1051a39Sopenharmony_ci	la	(@t[1],"64($t[1])");
406e1051a39Sopenharmony_ci	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
407e1051a39Sopenharmony_ci&{$z?	\&slgr:\&slr}	(@t[1],"%r14");
408e1051a39Sopenharmony_ci	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
	# @x[6] is the byte index used by .Loop_tail
409e1051a39Sopenharmony_ci&{$z?	\&lghi:\&lhi}	(@x[6],0);
410e1051a39Sopenharmony_ci	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
411e1051a39Sopenharmony_ci	al	(@x[0],"$stdframe+4*8($sp)");
412e1051a39Sopenharmony_ci	al	(@x[1],"$stdframe+4*9($sp)");
413e1051a39Sopenharmony_ci	al	(@x[2],"$stdframe+4*10($sp)");
414e1051a39Sopenharmony_ci	al	(@x[3],"$stdframe+4*11($sp)");
415e1051a39Sopenharmony_ci	lrvr	(@x[0],@x[0]);
416e1051a39Sopenharmony_ci	lrvr	(@x[1],@x[1]);
417e1051a39Sopenharmony_ci	lrvr	(@x[2],@x[2]);
418e1051a39Sopenharmony_ci	lrvr	(@x[3],@x[3]);
419e1051a39Sopenharmony_ci	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");
420e1051a39Sopenharmony_ci
# out[i] = inp[i] ^ keystream[i] for i = 0 .. remaining-1
421e1051a39Sopenharmony_ciLABEL	(".Loop_tail");
422e1051a39Sopenharmony_ci	llgc	(@x[4],"0(@x[6],%r14)");
423e1051a39Sopenharmony_ci	llgc	(@x[5],"$stdframe(@x[6],$sp)");
424e1051a39Sopenharmony_ci	xr	(@x[5],@x[4]);
425e1051a39Sopenharmony_ci	stc	(@x[5],"0(@x[6],@t[0])");
426e1051a39Sopenharmony_ci	la	(@x[6],"1(@x[6])");
427e1051a39Sopenharmony_ci	brct	(@t[1],".Loop_tail");
428e1051a39Sopenharmony_ci
429e1051a39Sopenharmony_ci	j	(".Ldone");
430e1051a39Sopenharmony_ciSIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
431e1051a39Sopenharmony_ci}
432e1051a39Sopenharmony_ci
433e1051a39Sopenharmony_ci########################################################################
434e1051a39Sopenharmony_ci# 4x"vertical" layout minimizes amount of instructions, but pipeline
435e1051a39Sopenharmony_ci# runs underutilized [because of vector instructions' high latency].
436e1051a39Sopenharmony_ci# On the other hand minimum amount of data it takes to fully utilize
437e1051a39Sopenharmony_ci# the pipeline is higher, so that effectively, short inputs would be
438e1051a39Sopenharmony_ci# processed slower. Hence this code path targeting <=256 bytes lengths.
439e1051a39Sopenharmony_ci#
440e1051a39Sopenharmony_ci{
441e1051a39Sopenharmony_cimy ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
442e1051a39Sopenharmony_ci    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
443e1051a39Sopenharmony_cimy @K=map("%v$_",(16..19));
444e1051a39Sopenharmony_cimy $CTR="%v26";
445e1051a39Sopenharmony_cimy ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
446e1051a39Sopenharmony_cimy $beperm="%v31";
447e1051a39Sopenharmony_cimy ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
448e1051a39Sopenharmony_cimy $FRAME=$stdframe+4*16;
449e1051a39Sopenharmony_ci
450e1051a39Sopenharmony_ciALIGN	(32);
451e1051a39Sopenharmony_ciLABEL	("ChaCha20_ctr32_4x");
452e1051a39Sopenharmony_ciLABEL	(".LChaCha20_ctr32_4x");
453e1051a39Sopenharmony_ci&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
454e1051a39Sopenharmony_ciif (!$z) {
455e1051a39Sopenharmony_ci	std	("%f4","16*$SIZE_T+2*8($sp)");
456e1051a39Sopenharmony_ci	std	("%f6","16*$SIZE_T+3*8($sp)");
457e1051a39Sopenharmony_ci}
458e1051a39Sopenharmony_ci&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
459e1051a39Sopenharmony_ci	lgr	("%r0",$sp);
460e1051a39Sopenharmony_ci	la	($sp,"0(%r1,$sp)");
461e1051a39Sopenharmony_ci&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
462e1051a39Sopenharmony_ciif ($z) {
463e1051a39Sopenharmony_ci	std	("%f8","$stdframe+8*0($sp)");
464e1051a39Sopenharmony_ci	std	("%f9","$stdframe+8*1($sp)");
465e1051a39Sopenharmony_ci	std	("%f10","$stdframe+8*2($sp)");
466e1051a39Sopenharmony_ci	std	("%f11","$stdframe+8*3($sp)");
467e1051a39Sopenharmony_ci	std	("%f12","$stdframe+8*4($sp)");
468e1051a39Sopenharmony_ci	std	("%f13","$stdframe+8*5($sp)");
469e1051a39Sopenharmony_ci	std	("%f14","$stdframe+8*6($sp)");
470e1051a39Sopenharmony_ci	std	("%f15","$stdframe+8*7($sp)");
471e1051a39Sopenharmony_ci}
472e1051a39Sopenharmony_ci	larl	("%r7",".Lsigma");
473e1051a39Sopenharmony_ci	lhi	("%r0",10);
474e1051a39Sopenharmony_ci	lhi	("%r1",0);
475e1051a39Sopenharmony_ci
476e1051a39Sopenharmony_ci	vl	(@K[0],"0(%r7)");		# load sigma
477e1051a39Sopenharmony_ci	vl	(@K[1],"0($key)");		# load key
478e1051a39Sopenharmony_ci	vl	(@K[2],"16($key)");
479e1051a39Sopenharmony_ci	vl	(@K[3],"0($counter)");		# load counter
480e1051a39Sopenharmony_ci
481e1051a39Sopenharmony_ci	vl	($beperm,"0x40(%r7)");
482e1051a39Sopenharmony_ci	vl	($xt1,"0x50(%r7)");
483e1051a39Sopenharmony_ci	vrepf	($CTR,@K[3],0);
484e1051a39Sopenharmony_ci	vlvgf	(@K[3],"%r1",0);		# clear @K[3].word[0]
485e1051a39Sopenharmony_ci	vaf	($CTR,$CTR,$xt1);
486e1051a39Sopenharmony_ci
487e1051a39Sopenharmony_ci#LABEL	(".Loop_outer_4x");
488e1051a39Sopenharmony_ci	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma
489e1051a39Sopenharmony_ci
490e1051a39Sopenharmony_ci	vrepf	($xb0,@K[1],0);			# smash the key
491e1051a39Sopenharmony_ci	vrepf	($xb1,@K[1],1);
492e1051a39Sopenharmony_ci	vrepf	($xb2,@K[1],2);
493e1051a39Sopenharmony_ci	vrepf	($xb3,@K[1],3);
494e1051a39Sopenharmony_ci
495e1051a39Sopenharmony_ci	vrepf	($xc0,@K[2],0);
496e1051a39Sopenharmony_ci	vrepf	($xc1,@K[2],1);
497e1051a39Sopenharmony_ci	vrepf	($xc2,@K[2],2);
498e1051a39Sopenharmony_ci	vrepf	($xc3,@K[2],3);
499e1051a39Sopenharmony_ci
500e1051a39Sopenharmony_ci	vlr	($xd0,$CTR);
501e1051a39Sopenharmony_ci	vrepf	($xd1,@K[3],1);
502e1051a39Sopenharmony_ci	vrepf	($xd2,@K[3],2);
503e1051a39Sopenharmony_ci	vrepf	($xd3,@K[3],3);
504e1051a39Sopenharmony_ci
505e1051a39Sopenharmony_ciLABEL	(".Loop_4x");
506e1051a39Sopenharmony_ci	VX_lane_ROUND(0, 4, 8,12);
507e1051a39Sopenharmony_ci	VX_lane_ROUND(0, 5,10,15);
508e1051a39Sopenharmony_ci	brct	("%r0",".Loop_4x");
509e1051a39Sopenharmony_ci
510e1051a39Sopenharmony_ci	vaf	($xd0,$xd0,$CTR);
511e1051a39Sopenharmony_ci
512e1051a39Sopenharmony_ci	vmrhf	($xt0,$xa0,$xa1);		# transpose data
513e1051a39Sopenharmony_ci	vmrhf	($xt1,$xa2,$xa3);
514e1051a39Sopenharmony_ci	vmrlf	($xt2,$xa0,$xa1);
515e1051a39Sopenharmony_ci	vmrlf	($xt3,$xa2,$xa3);
516e1051a39Sopenharmony_ci	vpdi	($xa0,$xt0,$xt1,0b0000);
517e1051a39Sopenharmony_ci	vpdi	($xa1,$xt0,$xt1,0b0101);
518e1051a39Sopenharmony_ci	vpdi	($xa2,$xt2,$xt3,0b0000);
519e1051a39Sopenharmony_ci	vpdi	($xa3,$xt2,$xt3,0b0101);
520e1051a39Sopenharmony_ci
521e1051a39Sopenharmony_ci	vmrhf	($xt0,$xb0,$xb1);
522e1051a39Sopenharmony_ci	vmrhf	($xt1,$xb2,$xb3);
523e1051a39Sopenharmony_ci	vmrlf	($xt2,$xb0,$xb1);
524e1051a39Sopenharmony_ci	vmrlf	($xt3,$xb2,$xb3);
525e1051a39Sopenharmony_ci	vpdi	($xb0,$xt0,$xt1,0b0000);
526e1051a39Sopenharmony_ci	vpdi	($xb1,$xt0,$xt1,0b0101);
527e1051a39Sopenharmony_ci	vpdi	($xb2,$xt2,$xt3,0b0000);
528e1051a39Sopenharmony_ci	vpdi	($xb3,$xt2,$xt3,0b0101);
529e1051a39Sopenharmony_ci
530e1051a39Sopenharmony_ci	vmrhf	($xt0,$xc0,$xc1);
531e1051a39Sopenharmony_ci	vmrhf	($xt1,$xc2,$xc3);
532e1051a39Sopenharmony_ci	vmrlf	($xt2,$xc0,$xc1);
533e1051a39Sopenharmony_ci	vmrlf	($xt3,$xc2,$xc3);
534e1051a39Sopenharmony_ci	vpdi	($xc0,$xt0,$xt1,0b0000);
535e1051a39Sopenharmony_ci	vpdi	($xc1,$xt0,$xt1,0b0101);
536e1051a39Sopenharmony_ci	vpdi	($xc2,$xt2,$xt3,0b0000);
537e1051a39Sopenharmony_ci	vpdi	($xc3,$xt2,$xt3,0b0101);
538e1051a39Sopenharmony_ci
539e1051a39Sopenharmony_ci	vmrhf	($xt0,$xd0,$xd1);
540e1051a39Sopenharmony_ci	vmrhf	($xt1,$xd2,$xd3);
541e1051a39Sopenharmony_ci	vmrlf	($xt2,$xd0,$xd1);
542e1051a39Sopenharmony_ci	vmrlf	($xt3,$xd2,$xd3);
543e1051a39Sopenharmony_ci	vpdi	($xd0,$xt0,$xt1,0b0000);
544e1051a39Sopenharmony_ci	vpdi	($xd1,$xt0,$xt1,0b0101);
545e1051a39Sopenharmony_ci	vpdi	($xd2,$xt2,$xt3,0b0000);
546e1051a39Sopenharmony_ci	vpdi	($xd3,$xt2,$xt3,0b0101);
547e1051a39Sopenharmony_ci
548e1051a39Sopenharmony_ci	#vrepif	($xt0,4);
549e1051a39Sopenharmony_ci	#vaf	($CTR,$CTR,$xt0);		# next counter value
550e1051a39Sopenharmony_ci
551e1051a39Sopenharmony_ci	vaf	($xa0,$xa0,@K[0]);
552e1051a39Sopenharmony_ci	vaf	($xb0,$xb0,@K[1]);
553e1051a39Sopenharmony_ci	vaf	($xc0,$xc0,@K[2]);
554e1051a39Sopenharmony_ci	vaf	($xd0,$xd0,@K[3]);
555e1051a39Sopenharmony_ci
556e1051a39Sopenharmony_ci	vperm	($xa0,$xa0,$xa0,$beperm);
557e1051a39Sopenharmony_ci	vperm	($xb0,$xb0,$xb0,$beperm);
558e1051a39Sopenharmony_ci	vperm	($xc0,$xc0,$xc0,$beperm);
559e1051a39Sopenharmony_ci	vperm	($xd0,$xd0,$xd0,$beperm);
560e1051a39Sopenharmony_ci
561e1051a39Sopenharmony_ci	#&{$z?	\&clgfi:\&clfi} ($len,0x40);
562e1051a39Sopenharmony_ci	#jl	(".Ltail_4x");
563e1051a39Sopenharmony_ci
564e1051a39Sopenharmony_ci	vlm	($xt0,$xt3,"0($inp)");
565e1051a39Sopenharmony_ci
566e1051a39Sopenharmony_ci	vx	($xt0,$xt0,$xa0);
567e1051a39Sopenharmony_ci	vx	($xt1,$xt1,$xb0);
568e1051a39Sopenharmony_ci	vx	($xt2,$xt2,$xc0);
569e1051a39Sopenharmony_ci	vx	($xt3,$xt3,$xd0);
570e1051a39Sopenharmony_ci
571e1051a39Sopenharmony_ci	vstm	($xt0,$xt3,"0($out)");
572e1051a39Sopenharmony_ci
573e1051a39Sopenharmony_ci	la	($inp,"0x40($inp)");
574e1051a39Sopenharmony_ci	la	($out,"0x40($out)");
575e1051a39Sopenharmony_ci&{$z?	\&aghi:\&ahi}	($len,-0x40);
576e1051a39Sopenharmony_ci	#je	(".Ldone_4x");
577e1051a39Sopenharmony_ci
578e1051a39Sopenharmony_ci	vaf	($xa0,$xa1,@K[0]);
579e1051a39Sopenharmony_ci	vaf	($xb0,$xb1,@K[1]);
580e1051a39Sopenharmony_ci	vaf	($xc0,$xc1,@K[2]);
581e1051a39Sopenharmony_ci	vaf	($xd0,$xd1,@K[3]);
582e1051a39Sopenharmony_ci
583e1051a39Sopenharmony_ci	vperm	($xa0,$xa0,$xa0,$beperm);
584e1051a39Sopenharmony_ci	vperm	($xb0,$xb0,$xb0,$beperm);
585e1051a39Sopenharmony_ci	vperm	($xc0,$xc0,$xc0,$beperm);
586e1051a39Sopenharmony_ci	vperm	($xd0,$xd0,$xd0,$beperm);
587e1051a39Sopenharmony_ci
588e1051a39Sopenharmony_ci&{$z?	\&clgfi:\&clfi} ($len,0x40);
589e1051a39Sopenharmony_ci	jl	(".Ltail_4x");
590e1051a39Sopenharmony_ci
591e1051a39Sopenharmony_ci	vlm	($xt0,$xt3,"0($inp)");
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci	vx	($xt0,$xt0,$xa0);
594e1051a39Sopenharmony_ci	vx	($xt1,$xt1,$xb0);
595e1051a39Sopenharmony_ci	vx	($xt2,$xt2,$xc0);
596e1051a39Sopenharmony_ci	vx	($xt3,$xt3,$xd0);
597e1051a39Sopenharmony_ci
598e1051a39Sopenharmony_ci	vstm	($xt0,$xt3,"0($out)");
599e1051a39Sopenharmony_ci
600e1051a39Sopenharmony_ci	la	($inp,"0x40($inp)");
601e1051a39Sopenharmony_ci	la	($out,"0x40($out)");
602e1051a39Sopenharmony_ci&{$z?	\&aghi:\&ahi}	($len,-0x40);
603e1051a39Sopenharmony_ci	je	(".Ldone_4x");
604e1051a39Sopenharmony_ci
605e1051a39Sopenharmony_ci	vaf	($xa0,$xa2,@K[0]);
606e1051a39Sopenharmony_ci	vaf	($xb0,$xb2,@K[1]);
607e1051a39Sopenharmony_ci	vaf	($xc0,$xc2,@K[2]);
608e1051a39Sopenharmony_ci	vaf	($xd0,$xd2,@K[3]);
609e1051a39Sopenharmony_ci
610e1051a39Sopenharmony_ci	vperm	($xa0,$xa0,$xa0,$beperm);
611e1051a39Sopenharmony_ci	vperm	($xb0,$xb0,$xb0,$beperm);
612e1051a39Sopenharmony_ci	vperm	($xc0,$xc0,$xc0,$beperm);
613e1051a39Sopenharmony_ci	vperm	($xd0,$xd0,$xd0,$beperm);
614e1051a39Sopenharmony_ci
615e1051a39Sopenharmony_ci&{$z?	\&clgfi:\&clfi} ($len,0x40);
616e1051a39Sopenharmony_ci	jl	(".Ltail_4x");
617e1051a39Sopenharmony_ci
618e1051a39Sopenharmony_ci	vlm	($xt0,$xt3,"0($inp)");
619e1051a39Sopenharmony_ci
620e1051a39Sopenharmony_ci	vx	($xt0,$xt0,$xa0);
621e1051a39Sopenharmony_ci	vx	($xt1,$xt1,$xb0);
622e1051a39Sopenharmony_ci	vx	($xt2,$xt2,$xc0);
623e1051a39Sopenharmony_ci	vx	($xt3,$xt3,$xd0);
624e1051a39Sopenharmony_ci
625e1051a39Sopenharmony_ci	vstm	($xt0,$xt3,"0($out)");
626e1051a39Sopenharmony_ci
627e1051a39Sopenharmony_ci	la	($inp,"0x40($inp)");
628e1051a39Sopenharmony_ci	la	($out,"0x40($out)");
629e1051a39Sopenharmony_ci&{$z?	\&aghi:\&ahi}	($len,-0x40);
630e1051a39Sopenharmony_ci	je	(".Ldone_4x");
631e1051a39Sopenharmony_ci
632e1051a39Sopenharmony_ci	vaf	($xa0,$xa3,@K[0]);
633e1051a39Sopenharmony_ci	vaf	($xb0,$xb3,@K[1]);
634e1051a39Sopenharmony_ci	vaf	($xc0,$xc3,@K[2]);
635e1051a39Sopenharmony_ci	vaf	($xd0,$xd3,@K[3]);
636e1051a39Sopenharmony_ci
637e1051a39Sopenharmony_ci	vperm	($xa0,$xa0,$xa0,$beperm);
638e1051a39Sopenharmony_ci	vperm	($xb0,$xb0,$xb0,$beperm);
639e1051a39Sopenharmony_ci	vperm	($xc0,$xc0,$xc0,$beperm);
640e1051a39Sopenharmony_ci	vperm	($xd0,$xd0,$xd0,$beperm);
641e1051a39Sopenharmony_ci
642e1051a39Sopenharmony_ci&{$z?	\&clgfi:\&clfi} ($len,0x40);
643e1051a39Sopenharmony_ci	jl	(".Ltail_4x");
644e1051a39Sopenharmony_ci
645e1051a39Sopenharmony_ci	vlm	($xt0,$xt3,"0($inp)");
646e1051a39Sopenharmony_ci
647e1051a39Sopenharmony_ci	vx	($xt0,$xt0,$xa0);
648e1051a39Sopenharmony_ci	vx	($xt1,$xt1,$xb0);
649e1051a39Sopenharmony_ci	vx	($xt2,$xt2,$xc0);
650e1051a39Sopenharmony_ci	vx	($xt3,$xt3,$xd0);
651e1051a39Sopenharmony_ci
652e1051a39Sopenharmony_ci	vstm	($xt0,$xt3,"0($out)");
653e1051a39Sopenharmony_ci
654e1051a39Sopenharmony_ci	#la	$inp,0x40($inp));
655e1051a39Sopenharmony_ci	#la	$out,0x40($out));
656e1051a39Sopenharmony_ci	#lhi	%r0,10);
657e1051a39Sopenharmony_ci	#&{$z?	\&aghi:\&ahi}	$len,-0x40);
658e1051a39Sopenharmony_ci	#jne	.Loop_outer_4x);
659e1051a39Sopenharmony_ci
660e1051a39Sopenharmony_ciLABEL	(".Ldone_4x");
661e1051a39Sopenharmony_ciif (!$z) {
662e1051a39Sopenharmony_ci	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
663e1051a39Sopenharmony_ci	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
664e1051a39Sopenharmony_ci} else {
665e1051a39Sopenharmony_ci	ld	("%f8","$stdframe+8*0($sp)");
666e1051a39Sopenharmony_ci	ld	("%f9","$stdframe+8*1($sp)");
667e1051a39Sopenharmony_ci	ld	("%f10","$stdframe+8*2($sp)");
668e1051a39Sopenharmony_ci	ld	("%f11","$stdframe+8*3($sp)");
669e1051a39Sopenharmony_ci	ld	("%f12","$stdframe+8*4($sp)");
670e1051a39Sopenharmony_ci	ld	("%f13","$stdframe+8*5($sp)");
671e1051a39Sopenharmony_ci	ld	("%f14","$stdframe+8*6($sp)");
672e1051a39Sopenharmony_ci	ld	("%f15","$stdframe+8*7($sp)");
673e1051a39Sopenharmony_ci}
674e1051a39Sopenharmony_ci&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
675e1051a39Sopenharmony_ci	la	($sp,"$FRAME($sp)");
676e1051a39Sopenharmony_ci	br	("%r14");
677e1051a39Sopenharmony_ci
678e1051a39Sopenharmony_ciALIGN	(16);
679e1051a39Sopenharmony_ciLABEL	(".Ltail_4x");
680e1051a39Sopenharmony_ciif (!$z) {
681e1051a39Sopenharmony_ci	vlr	($xt0,$xb0);
682e1051a39Sopenharmony_ci	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
683e1051a39Sopenharmony_ci	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
684e1051a39Sopenharmony_ci
685e1051a39Sopenharmony_ci	vst	($xa0,"$stdframe+0x00($sp)");
686e1051a39Sopenharmony_ci	vst	($xt0,"$stdframe+0x10($sp)");
687e1051a39Sopenharmony_ci	vst	($xc0,"$stdframe+0x20($sp)");
688e1051a39Sopenharmony_ci	vst	($xd0,"$stdframe+0x30($sp)");
689e1051a39Sopenharmony_ci} else {
690e1051a39Sopenharmony_ci	vlr	($xt0,$xc0);
691e1051a39Sopenharmony_ci	ld	("%f8","$stdframe+8*0($sp)");
692e1051a39Sopenharmony_ci	ld	("%f9","$stdframe+8*1($sp)");
693e1051a39Sopenharmony_ci	ld	("%f10","$stdframe+8*2($sp)");
694e1051a39Sopenharmony_ci	ld	("%f11","$stdframe+8*3($sp)");
695e1051a39Sopenharmony_ci	vlr	($xt1,$xd0);
696e1051a39Sopenharmony_ci	ld	("%f12","$stdframe+8*4($sp)");
697e1051a39Sopenharmony_ci	ld	("%f13","$stdframe+8*5($sp)");
698e1051a39Sopenharmony_ci	ld	("%f14","$stdframe+8*6($sp)");
699e1051a39Sopenharmony_ci	ld	("%f15","$stdframe+8*7($sp)");
700e1051a39Sopenharmony_ci
701e1051a39Sopenharmony_ci	vst	($xa0,"$stdframe+0x00($sp)");
702e1051a39Sopenharmony_ci	vst	($xb0,"$stdframe+0x10($sp)");
703e1051a39Sopenharmony_ci	vst	($xt0,"$stdframe+0x20($sp)");
704e1051a39Sopenharmony_ci	vst	($xt1,"$stdframe+0x30($sp)");
705e1051a39Sopenharmony_ci}
706e1051a39Sopenharmony_ci	lghi	("%r1",0);
707e1051a39Sopenharmony_ci
708e1051a39Sopenharmony_ciLABEL	(".Loop_tail_4x");
709e1051a39Sopenharmony_ci	llgc	("%r5","0(%r1,$inp)");
710e1051a39Sopenharmony_ci	llgc	("%r6","$stdframe(%r1,$sp)");
711e1051a39Sopenharmony_ci	xr	("%r6","%r5");
712e1051a39Sopenharmony_ci	stc	("%r6","0(%r1,$out)");
713e1051a39Sopenharmony_ci	la	("%r1","1(%r1)");
714e1051a39Sopenharmony_ci	brct	($len,".Loop_tail_4x");
715e1051a39Sopenharmony_ci
716e1051a39Sopenharmony_ci&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
717e1051a39Sopenharmony_ci	la	($sp,"$FRAME($sp)");
718e1051a39Sopenharmony_ci	br	("%r14");
719e1051a39Sopenharmony_ciSIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
720e1051a39Sopenharmony_ci}
721e1051a39Sopenharmony_ci
########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instructions'
# latency. The computational part of an 8x"vertical" layout would be
# faster, but it consumes all registers, and dealing with that would
# diminish the return...
#
{
# ChaCha20_ctr32_vx: 6x"horizontal" vector code path.
#
# Six 64-byte blocks are produced per outer iteration.  Each block is
# kept in four vector registers a/b/c/d, one 16-byte state row each
# (sigma, key low half, key high half, counter row).  Inputs of 256
# bytes or less are handed over to .LChaCha20_ctr32_4x right at entry.
#
# Relies on names from the enclosing file scope: $out, $inp, $len, $key,
# $counter, $sp, $z, $SIZE_T, $stdframe and the VX_ROUND helper.
#
# Register allocation:
#   $a0..$d5  %v0-%v23; every a/b/c/d quadruple is consecutive, so
#             vlm/vstm($aN,$dN,...) moves one whole 64-byte block.
#   @K        input state: K[0]=%v27 (sigma), K[1..2]=%v24-%v25 (key),
#             K[3]=%v26 (counter row).
#   $t0..$t3  %v27-%v30 scratch.  NOTE: $t0 and @K[0] are the SAME
#             register (%v27); $t1..$t3 hold the increments 1,2,3 as
#             loaded from .Lsigma.
#   $beperm   %v31, vperm control loaded from .Lsigma ("byte swap").
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
# Standard frame plus 64 bytes.  Those 64 bytes serve two purposes: in
# 64-bit mode they hold %f8-%f15, and in the tail path they hold one
# keystream block.
my $FRAME=$stdframe + 4*16;

GLOBL	("ChaCha20_ctr32_vx");
ALIGN	(32);
LABEL	("ChaCha20_ctr32_vx");
LABEL	(".LChaCha20_ctr32_vx");
&{$z?	\&clgfi:\&clfi}	($len,256);
	jle	(".LChaCha20_ctr32_4x");	# len <= 256: use 4x path
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
# NOTE(review): %f4/%f6 (31-bit) and %f8-%f15 (64-bit) are presumably
# the call-saved FPRs of the respective ABI; they overlay the vector
# registers of the same number, which this code clobbers - confirm.
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");		# allocate $FRAME bytes
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	# $FRAME-8*8 equals $stdframe, i.e. these eight slots occupy
	# exactly the 64 bytes that .Ltail_vx later reuses as keystream
	# buffer.
	std	("%f8","$FRAME-8*8($sp)");
	std	("%f9","$FRAME-8*7($sp)");
	std	("%f10","$FRAME-8*6($sp)");
	std	("%f11","$FRAME-8*5($sp)");
	std	("%f12","$FRAME-8*4($sp)");
	std	("%f13","$FRAME-8*3($sp)");
	std	("%f14","$FRAME-8*2($sp)");
	std	("%f15","$FRAME-8*1($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);			# 10 passes through .Loop_vx

	vlm	(@K[1],@K[2],"0($key)");	# load key
	vl	(@K[3],"0($counter)");		# load counter

	vlm	(@K[0],"$beperm","0(%r7)");	# load sigma, increments, ...

# One outer iteration = up to six blocks.  On entry @K holds the input
# state for the first block and $t1..$t3 hold the increments 1,2,3.
LABEL	(".Loop_outer_vx");
	vlr	($a0,@K[0]);
	vlr	($b0,@K[1]);
	vlr	($a1,@K[0]);
	vlr	($b1,@K[1]);
	vlr	($a2,@K[0]);
	vlr	($b2,@K[1]);
	vlr	($a3,@K[0]);
	vlr	($b3,@K[1]);
	vlr	($a4,@K[0]);
	vlr	($b4,@K[1]);
	vlr	($a5,@K[0]);
	vlr	($b5,@K[1]);

	vlr	($d0,@K[3]);
	vaf	($d1,@K[3],$t1);		# K[3]+1
	vaf	($d2,@K[3],$t2);		# K[3]+2
	vaf	($d3,@K[3],$t3);		# K[3]+3
	vaf	($d4,$d2,$t2);			# K[3]+4
	vaf	($d5,$d2,$t3);			# K[3]+5

	vlr	($c0,@K[2]);
	vlr	($c1,@K[2]);
	vlr	($c2,@K[2]);
	vlr	($c3,@K[2]);
	vlr	($c4,@K[2]);
	vlr	($c5,@K[2]);

	# Keep the initial counter rows of blocks 1..3 for the feed-forward
	# addition after the rounds.  This overwrites the increments, which
	# are re-loaded from .Lsigma once block 0 has been processed.
	vlr	($t1,$d1);
	vlr	($t2,$d2);
	vlr	($t3,$d3);

ALIGN	(4);
LABEL	(".Loop_vx");

	# VX_ROUND is defined elsewhere in this file; presumably the last
	# argument selects the column (0) or diagonal (1) half of a double
	# round, giving 20 rounds over 10 iterations - TODO confirm.
	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct	("%r0",".Loop_vx");

	# ---- block 0: add input state, byte-swap words, xor with input ----
	vaf	($a0,$a0,@K[0]);
	vaf	($b0,$b0,@K[1]);
	vaf	($c0,$c0,@K[2]);
	vaf	($d0,$d0,@K[3]);
	# $a1/$d1 are finished here because @K[0] (= $t0) and $t1 are about
	# to be overwritten with input data.
	vaf	($a1,$a1,@K[0]);
	vaf	($d1,$d1,$t1);			# +K[3]+1

	vperm	($a0,$a0,$a0,$beperm);
	vperm	($b0,$b0,$b0,$beperm);
	vperm	($c0,$c0,$c0,$beperm);
	vperm	($d0,$d0,$d0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");			# fewer than 64 bytes left

	# Last use of $t2/$t3 as saved counter rows before they receive
	# input data.
	vaf	($d2,$d2,$t2);			# +K[3]+2
	vaf	($d3,$d3,$t3);			# +K[3]+3
	# $a1..$d1 still carry block 1, hence the input goes to $t0..$t3.
	vlm	($t0,$t3,"0($inp)");

	vx	($a0,$a0,$t0);
	vx	($b0,$b0,$t1);
	vx	($c0,$c0,$t2);
	vx	($d0,$d0,$t3);

	vlm	(@K[0],$t3,"0(%r7)");		# re-load sigma and increments

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");			# nothing left

	# ---- block 1 ----
	vaf	($b1,$b1,@K[1]);
	vaf	($c1,$c1,@K[2]);

	vperm	($a0,$a1,$a1,$beperm);
	vperm	($b0,$b1,$b1,$beperm);
	vperm	($c0,$c1,$c1,$beperm);
	vperm	($d0,$d1,$d1,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	# From here on $a1..$d1 are free and serve as the input buffer.
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 2 ($d2 already carries +K[3]+2) ----
	vaf	($a2,$a2,@K[0]);
	vaf	($b2,$b2,@K[1]);
	vaf	($c2,$c2,@K[2]);

	vperm	($a0,$a2,$a2,$beperm);
	vperm	($b0,$b2,$b2,$beperm);
	vperm	($c0,$c2,$c2,$beperm);
	vperm	($d0,$d2,$d2,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 3 ($d3 already carries +K[3]+3) ----
	vaf	($a3,$a3,@K[0]);
	vaf	($b3,$b3,@K[1]);
	vaf	($c3,$c3,@K[2]);
	# $d2 has been consumed above and is recycled as a counter scratch.
	vaf	($d2,@K[3],$t3);		# K[3]+3

	vperm	($a0,$a3,$a3,$beperm);
	vperm	($b0,$b3,$b3,$beperm);
	vperm	($c0,$c3,$c3,$beperm);
	vperm	($d0,$d3,$d3,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	# $d3 has been permuted into $d0 and is recycled as well.
	vaf	($d3,$d2,$t1);			# K[3]+4
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 4; also advance the counter for the next outer pass ----
	vaf	($a4,$a4,@K[0]);
	vaf	($b4,$b4,@K[1]);
	vaf	($c4,$c4,@K[2]);
	vaf	($d4,$d4,$d3);			# +K[3]+4
	vaf	($d3,$d3,$t1);			# K[3]+5
	vaf	(@K[3],$d2,$t3);		# K[3]+=6

	vperm	($a0,$a4,$a4,$beperm);
	vperm	($b0,$b4,$b4,$beperm);
	vperm	($c0,$c4,$c4,$beperm);
	vperm	($d0,$d4,$d4,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 5 ----
	vaf	($a5,$a5,@K[0]);
	vaf	($b5,$b5,@K[1]);
	vaf	($c5,$c5,@K[2]);
	vaf	($d5,$d5,$d3);			# +K[3]+5

	vperm	($a0,$a5,$a5,$beperm);
	vperm	($b0,$b5,$b5,$beperm);
	vperm	($c0,$c5,$c5,$beperm);
	vperm	($d0,$d5,$d5,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	lhi	("%r0",10);			# re-arm the round counter
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	jne	(".Loop_outer_vx");		# more input: next six blocks

# Exit after a whole number of blocks: restore saved registers, release
# the frame and return.
LABEL	(".Ldone_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

# Partial final block: $a0..$d0 hold a finished keystream block and
# 1 <= $len < 64 (a zero remainder always leaves through .Ldone_vx).
ALIGN	(16);
LABEL	(".Ltail_vx");
# The FPRs must be restored BEFORE the vstm below: in 64-bit mode their
# save slots start at $FRAME-8*8 == $stdframe, the very area the
# keystream block is spilled to.
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	vstm	($a0,$d0,"$stdframe($sp)");	# spill keystream block
	lghi	("%r1",0);			# byte index

# Byte-wise xor of the remaining $len input bytes with the spilled
# keystream; %r5 and %r6 are scratch.
LABEL	(".Loop_tail_vx");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_vx");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
1037e1051a39Sopenharmony_ci################
1038e1051a39Sopenharmony_ci
# Constant pool.  Every LONG() line emits four 32-bit words, i.e. one
# 16-byte vector.  ChaCha20_ctr32_vx fetches the first five vectors in a
# single vlm(@K[0],$beperm,"0(%r7)"), so their order must match the
# register order %v27..%v31: sigma, increments 1..3, vperm control.
ALIGN	(32);
LABEL	(".Lsigma");
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
LONG	(1,0,0,0);					# counter increment +1
LONG	(2,0,0,0);					# counter increment +2
LONG	(3,0,0,0);					# counter increment +3
# vperm control: byte indices 3,2,1,0, 7,6,5,4, ... (as stored on
# big-endian s390x) reverse the byte order inside every 32-bit word.
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap

# NOTE(review): the vectors below are not referenced by the 6x code;
# presumably they belong to the 4x"vertical" path (per-lane counter
# offsets 0..3 and each sigma word replicated across all four lanes) -
# confirm against ChaCha20_ctr32_4x.
LONG	(0,1,2,3);
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN	(4);

# Finish perlasm generation; the helper is provided by the perlasm
# support code pulled in earlier in the file.
PERLASM_END();
1057