#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3 times faster than scalar code.
# But to harness overheads revert to transliteration of VSX code path
# from chacha-ppc module, which is also 4x"vertical", to handle inputs
# not longer than 256 bytes.

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $z selects between the 32-bit and 64-bit instruction forms emitted below;
# $SIZE_T is the size of a saved general register slot in bytes.
my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
    $z=0;       # S/390 ABI
    $SIZE_T=4;
} else {
    $z=1;       # zSeries ABI
    $SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;

# ROUND(a0,b0,c0,d0)
#
# Emits one scalar ChaCha round, i.e. four quarter-rounds.  The arguments
# are the state-word indices of the first quarter-round; the indices of the
# remaining three are derived by stepping each index to the next position
# (mod 4) within its group of four.  Nothing is returned, the instructions
# are emitted through the perlasm mnemonics.
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

    # Consider order in which variables are addressed by their
    # index:
    #
    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14
    #
    # 'a', 'b' and 'd's are permanently allocated in registers,
    # @x[0..7,12..15], while 'c's are maintained in memory. If
    # you observe 'c' column, you'll notice that pair of 'c's is
    # invariant between rounds. This means that we have to reload
    # them once per round, in the middle. This is why you'll see
    # 'c' stores and loads in the middle, but none in the beginning
    # or end.

    alr     (@x[$a0],@x[$b0]);      # Q1
    alr     (@x[$a1],@x[$b1]);      # Q2
    xr      (@x[$d0],@x[$a0]);
    xr      (@x[$d1],@x[$a1]);
    rll     (@x[$d0],@x[$d0],16);
    rll     (@x[$d1],@x[$d1],16);

    alr     ($xc,@x[$d0]);
    alr     ($xc_,@x[$d1]);
    xr      (@x[$b0],$xc);
    xr      (@x[$b1],$xc_);
    rll     (@x[$b0],@x[$b0],12);
    rll     (@x[$b1],@x[$b1],12);

    alr     (@x[$a0],@x[$b0]);
    alr     (@x[$a1],@x[$b1]);
    xr      (@x[$d0],@x[$a0]);
    xr      (@x[$d1],@x[$a1]);
    rll     (@x[$d0],@x[$d0],8);
    rll     (@x[$d1],@x[$d1],8);

    alr     ($xc,@x[$d0]);
    alr     ($xc_,@x[$d1]);
    xr      (@x[$b0],$xc);
    xr      (@x[$b1],$xc_);
    rll     (@x[$b0],@x[$b0],7);
    rll     (@x[$b1],@x[$b1],7);

    stm     ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");  # reload pair of 'c's
    lm      ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");

    alr     (@x[$a2],@x[$b2]);      # Q3
    alr     (@x[$a3],@x[$b3]);      # Q4
    xr      (@x[$d2],@x[$a2]);
    xr      (@x[$d3],@x[$a3]);
    rll     (@x[$d2],@x[$d2],16);
    rll     (@x[$d3],@x[$d3],16);

    alr     ($xc,@x[$d2]);
    alr     ($xc_,@x[$d3]);
    xr      (@x[$b2],$xc);
    xr      (@x[$b3],$xc_);
    rll     (@x[$b2],@x[$b2],12);
    rll     (@x[$b3],@x[$b3],12);

    alr     (@x[$a2],@x[$b2]);
    alr     (@x[$a3],@x[$b3]);
    xr      (@x[$d2],@x[$a2]);
    xr      (@x[$d3],@x[$a3]);
    rll     (@x[$d2],@x[$d2],8);
    rll     (@x[$d3],@x[$d3],8);

    alr     ($xc,@x[$d2]);
    alr     ($xc_,@x[$d3]);
    xr      (@x[$b2],$xc);
    xr      (@x[$b3],$xc_);
    rll     (@x[$b2],@x[$b2],7);
    rll     (@x[$b3],@x[$b3],7);
}

# VX_lane_ROUND(a0,b0,c0,d0)
#
# Vector counterpart of ROUND: emits four quarter-rounds on vector
# registers %v0..%v15, indexed the same way as in ROUND.  Here all sixteen
# state words, including the 'c's, are addressed as registers, and the four
# quarter-rounds are interleaved step by step.
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

    vaf     (@x[$a0],@x[$a0],@x[$b0]);      # Q1
    vx      (@x[$d0],@x[$d0],@x[$a0]);
    verllf  (@x[$d0],@x[$d0],16);
    vaf     (@x[$a1],@x[$a1],@x[$b1]);      # Q2
    vx      (@x[$d1],@x[$d1],@x[$a1]);
    verllf  (@x[$d1],@x[$d1],16);
    vaf     (@x[$a2],@x[$a2],@x[$b2]);      # Q3
    vx      (@x[$d2],@x[$d2],@x[$a2]);
    verllf  (@x[$d2],@x[$d2],16);
    vaf     (@x[$a3],@x[$a3],@x[$b3]);      # Q4
    vx      (@x[$d3],@x[$d3],@x[$a3]);
    verllf  (@x[$d3],@x[$d3],16);

    vaf     (@x[$c0],@x[$c0],@x[$d0]);
    vx      (@x[$b0],@x[$b0],@x[$c0]);
    verllf  (@x[$b0],@x[$b0],12);
    vaf     (@x[$c1],@x[$c1],@x[$d1]);
    vx      (@x[$b1],@x[$b1],@x[$c1]);
    verllf  (@x[$b1],@x[$b1],12);
    vaf     (@x[$c2],@x[$c2],@x[$d2]);
    vx      (@x[$b2],@x[$b2],@x[$c2]);
    verllf  (@x[$b2],@x[$b2],12);
    vaf     (@x[$c3],@x[$c3],@x[$d3]);
    vx      (@x[$b3],@x[$b3],@x[$c3]);
    verllf  (@x[$b3],@x[$b3],12);

    vaf     (@x[$a0],@x[$a0],@x[$b0]);
    vx      (@x[$d0],@x[$d0],@x[$a0]);
    verllf  (@x[$d0],@x[$d0],8);
    vaf     (@x[$a1],@x[$a1],@x[$b1]);
    vx      (@x[$d1],@x[$d1],@x[$a1]);
    verllf  (@x[$d1],@x[$d1],8);
    vaf     (@x[$a2],@x[$a2],@x[$b2]);
    vx      (@x[$d2],@x[$d2],@x[$a2]);
    verllf  (@x[$d2],@x[$d2],8);
    vaf     (@x[$a3],@x[$a3],@x[$b3]);
    vx      (@x[$d3],@x[$d3],@x[$a3]);
    verllf  (@x[$d3],@x[$d3],8);

    vaf     (@x[$c0],@x[$c0],@x[$d0]);
    vx      (@x[$b0],@x[$b0],@x[$c0]);
    verllf  (@x[$b0],@x[$b0],7);
    vaf     (@x[$c1],@x[$c1],@x[$d1]);
    vx      (@x[$b1],@x[$b1],@x[$c1]);
    verllf  (@x[$b1],@x[$b1],7);
    vaf     (@x[$c2],@x[$c2],@x[$d2]);
    vx      (@x[$b2],@x[$b2],@x[$c2]);
    verllf  (@x[$b2],@x[$b2],7);
    vaf     (@x[$c3],@x[$c3],@x[$d3]);
    vx      (@x[$b3],@x[$b3],@x[$c3]);
    verllf  (@x[$b3],@x[$b3],7);
}

# VX_ROUND(@a[0..5], @b[0..5], @c[0..5], @d[0..5], $odd)
#
# Takes 24 register names (six each for a, b, c and d) followed by a flag.
# Emits the add/xor/rotate sequence (rotations 16, 12, 8, 7) for all six
# register sets in parallel, then rotates the b, c and d vectors with vsldb:
# c by 8 bytes, b and d by 4/12 bytes, with the two amounts swapped when
# $odd is true.
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=$_[24];

    vaf     (@a[$_],@a[$_],@b[$_]) for (0..5);
    vx      (@d[$_],@d[$_],@a[$_]) for (0..5);
    verllf  (@d[$_],@d[$_],16) for (0..5);

    vaf     (@c[$_],@c[$_],@d[$_]) for (0..5);
    vx      (@b[$_],@b[$_],@c[$_]) for (0..5);
    verllf  (@b[$_],@b[$_],12) for (0..5);

    vaf     (@a[$_],@a[$_],@b[$_]) for (0..5);
    vx      (@d[$_],@d[$_],@a[$_]) for (0..5);
    verllf  (@d[$_],@d[$_],8) for (0..5);

    vaf     (@c[$_],@c[$_],@d[$_]) for (0..5);
    vx      (@b[$_],@b[$_],@c[$_]) for (0..5);
    verllf  (@b[$_],@b[$_],7) for (0..5);

    vsldb   (@c[$_],@c[$_],@c[$_],8) for (0..5);
    vsldb   (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
    vsldb   (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE ("s390x_arch.h");
TEXT    ();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned
#                     char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
#
# Scalar code path.  Returns immediately when len is zero.  Inputs of up to
# 64 bytes always take the scalar path below; longer inputs branch to
# .LChaCha20_ctr32_vx when the vx capability bit is set.  The scalar path
# processes one 64-byte block per .Loop_outer iteration and finishes a
# trailing partial block byte by byte in .Loop_tail.
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL   ("ChaCha20_ctr32");
TYPE    ("ChaCha20_ctr32","\@function");
ALIGN   (32);
LABEL   ("ChaCha20_ctr32");
    larl    ("%r1","OPENSSL_s390xcap_P");

    lghi    ("%r0",64);
&{$z?   \&ltgr:\&ltr}   ($len,$len);            # len==0?
    bzr     ("%r14");
    lg      ("%r1","S390X_STFLE+16(%r1)");
&{$z?   \&clgr:\&clr}   ($len,"%r0");
    jle     (".Lshort");

    tmhh    ("%r1",0x4000);                     # check for vx bit
    jnz     (".LChaCha20_ctr32_vx");

LABEL   (".Lshort");
&{$z?   \&aghi:\&ahi}   ($len,-64);
&{$z?   \&lghi:\&lhi}   ("%r1",-$frame);
&{$z?   \&stmg:\&stm}   ("%r6","%r15","6*$SIZE_T($sp)");
&{$z?   \&slgr:\&slr}   ($out,$inp);            # difference
    la      ($len,"0($inp,$len)");              # end of input minus 64
    larl    ("%r7",".Lsigma");
    lgr     ("%r0",$sp);
    la      ($sp,"0(%r1,$sp)");
&{$z?   \&stg:\&st}     ("%r0","0($sp)");

    lmg     ("%r8","%r11","0($key)");           # load key
    lmg     ("%r12","%r13","0($counter)");      # load counter
    lmg     ("%r6","%r7","0(%r7)");             # load sigma constant

    la      ("%r14","0($inp)");
&{$z?   \&stg:\&st}     ($out,"$frame+3*$SIZE_T($sp)");
&{$z?   \&stg:\&st}     ($len,"$frame+4*$SIZE_T($sp)");
    stmg    ("%r6","%r13","$stdframe($sp)");    # copy key schedule to stack
    srlg    (@x[12],"%r12",32);                 # 32-bit counter value
    j       (".Loop_outer");

ALIGN   (16);
LABEL   (".Loop_outer");
    lm      (@x[0],@x[7],"$stdframe+4*0($sp)");         # load x[0]-x[7]
    lm      (@t[0],@t[1],"$stdframe+4*10($sp)");        # load x[10]-x[11]
    lm      (@x[13],@x[15],"$stdframe+4*13($sp)");      # load x[13]-x[15]
    stm     (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");    # offload x[10]-x[11]
    lm      (@t[0],@t[1],"$stdframe+4*8($sp)");         # load x[8]-x[9]
    st      (@x[12],"$stdframe+4*12($sp)");             # save counter
&{$z?   \&stg:\&st}     ("%r14","$frame+2*$SIZE_T($sp)");   # save input pointer
    lhi     ("%r14",10);                        # loop count for .Loop
    j       (".Loop");

ALIGN   (4);
LABEL   (".Loop");
    ROUND   (0, 4, 8,12);
    ROUND   (0, 5,10,15);
    brct    ("%r14",".Loop");

&{$z?   \&lg:\&l}       ("%r14","$frame+2*$SIZE_T($sp)");   # pull input pointer
    stm     (@t[0],@t[1],"$stdframe+4*8+4*8($sp)");     # offload x[8]-x[9]
&{$z?   \&lmg:\&lm}     (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

    al      (@x[0],"$stdframe+4*0($sp)");       # accumulate key schedule
    al      (@x[1],"$stdframe+4*1($sp)");
    al      (@x[2],"$stdframe+4*2($sp)");
    al      (@x[3],"$stdframe+4*3($sp)");
    al      (@x[4],"$stdframe+4*4($sp)");
    al      (@x[5],"$stdframe+4*5($sp)");
    al      (@x[6],"$stdframe+4*6($sp)");
    al      (@x[7],"$stdframe+4*7($sp)");
    lrvr    (@x[0],@x[0]);
    lrvr    (@x[1],@x[1]);
    lrvr    (@x[2],@x[2]);
    lrvr    (@x[3],@x[3]);
    lrvr    (@x[4],@x[4]);
    lrvr    (@x[5],@x[5]);
    lrvr    (@x[6],@x[6]);
    lrvr    (@x[7],@x[7]);
    al      (@x[12],"$stdframe+4*12($sp)");
    al      (@x[13],"$stdframe+4*13($sp)");
    al      (@x[14],"$stdframe+4*14($sp)");
    al      (@x[15],"$stdframe+4*15($sp)");
    lrvr    (@x[12],@x[12]);
    lrvr    (@x[13],@x[13]);
    lrvr    (@x[14],@x[14]);
    lrvr    (@x[15],@x[15]);

    la      (@t[0],"0(@t[0],%r14)");            # reconstruct output pointer
&{$z?   \&clgr:\&clr}   ("%r14",@t[1]);
    jh      (".Ltail");

    x       (@x[0],"4*0(%r14)");                # xor with input
    x       (@x[1],"4*1(%r14)");
    st      (@x[0],"4*0(@t[0])");               # store output
    x       (@x[2],"4*2(%r14)");
    st      (@x[1],"4*1(@t[0])");
    x       (@x[3],"4*3(%r14)");
    st      (@x[2],"4*2(@t[0])");
    x       (@x[4],"4*4(%r14)");
    st      (@x[3],"4*3(@t[0])");
    lm      (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");     # load x[8]-x[11]
    x       (@x[5],"4*5(%r14)");
    st      (@x[4],"4*4(@t[0])");
    x       (@x[6],"4*6(%r14)");
    al      (@x[0],"$stdframe+4*8($sp)");
    st      (@x[5],"4*5(@t[0])");
    x       (@x[7],"4*7(%r14)");
    al      (@x[1],"$stdframe+4*9($sp)");
    st      (@x[6],"4*6(@t[0])");
    x       (@x[12],"4*12(%r14)");
    al      (@x[2],"$stdframe+4*10($sp)");
    st      (@x[7],"4*7(@t[0])");
    x       (@x[13],"4*13(%r14)");
    al      (@x[3],"$stdframe+4*11($sp)");
    st      (@x[12],"4*12(@t[0])");
    x       (@x[14],"4*14(%r14)");
    st      (@x[13],"4*13(@t[0])");
    x       (@x[15],"4*15(%r14)");
    st      (@x[14],"4*14(@t[0])");
    lrvr    (@x[0],@x[0]);
    st      (@x[15],"4*15(@t[0])");
    lrvr    (@x[1],@x[1]);
    lrvr    (@x[2],@x[2]);
    lrvr    (@x[3],@x[3]);
    lhi     (@x[12],1);
    x       (@x[0],"4*8(%r14)");
    al      (@x[12],"$stdframe+4*12($sp)");     # increment counter
    x       (@x[1],"4*9(%r14)");
    st      (@x[0],"4*8(@t[0])");
    x       (@x[2],"4*10(%r14)");
    st      (@x[1],"4*9(@t[0])");
    x       (@x[3],"4*11(%r14)");
    st      (@x[2],"4*10(@t[0])");
    st      (@x[3],"4*11(@t[0])");

&{$z?   \&clgr:\&clr}   ("%r14",@t[1]);         # done yet?
    la      ("%r14","64(%r14)");
    jl      (".Loop_outer");

LABEL   (".Ldone");
    xgr     ("%r0","%r0");
    xgr     ("%r1","%r1");
    xgr     ("%r2","%r2");
    xgr     ("%r3","%r3");
    stmg    ("%r0","%r3","$stdframe+4*4($sp)");         # wipe key copy
    stmg    ("%r0","%r3","$stdframe+4*12($sp)");

&{$z?   \&lmg:\&lm}     ("%r6","%r15","$frame+6*$SIZE_T($sp)");
    br      ("%r14");

ALIGN   (16);
LABEL   (".Ltail");
    # Fewer than 64 bytes remain: spill the computed block to the stack and
    # xor it into the output one byte at a time in .Loop_tail.
    la      (@t[1],"64($t[1])");
    stm     (@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z?   \&slgr:\&slr}   (@t[1],"%r14");
    lm      (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z?   \&lghi:\&lhi}   (@x[6],0);
    stm     (@x[12],@x[15],"$stdframe+4*12($sp)");
    al      (@x[0],"$stdframe+4*8($sp)");
    al      (@x[1],"$stdframe+4*9($sp)");
    al      (@x[2],"$stdframe+4*10($sp)");
    al      (@x[3],"$stdframe+4*11($sp)");
    lrvr    (@x[0],@x[0]);
    lrvr    (@x[1],@x[1]);
    lrvr    (@x[2],@x[2]);
    lrvr    (@x[3],@x[3]);
    stm     (@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL   (".Loop_tail");
    llgc    (@x[4],"0(@x[6],%r14)");
    llgc    (@x[5],"$stdframe(@x[6],$sp)");
    xr      (@x[5],@x[4]);
    stc     (@x[5],"0(@x[6],@t[0])");
    la      (@x[6],"1(@x[6])");
    brct    (@t[1],".Loop_tail");

    j       (".Ldone");
SIZE    ("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# 4x"vertical" layout minimizes amount of instructions, but pipeline
# runs underutilized [because of vector instructions' high latency].
# On the other hand minimum amount of data it takes to fully utilize
# the pipeline is higher, so that effectively, short inputs would be
# processed slower. Hence this code path targeting <=256 bytes lengths.
439e1051a39Sopenharmony_ci# 440e1051a39Sopenharmony_ci{ 441e1051a39Sopenharmony_cimy ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 442e1051a39Sopenharmony_ci $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15)); 443e1051a39Sopenharmony_cimy @K=map("%v$_",(16..19)); 444e1051a39Sopenharmony_cimy $CTR="%v26"; 445e1051a39Sopenharmony_cimy ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30)); 446e1051a39Sopenharmony_cimy $beperm="%v31"; 447e1051a39Sopenharmony_cimy ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10))); 448e1051a39Sopenharmony_cimy $FRAME=$stdframe+4*16; 449e1051a39Sopenharmony_ci 450e1051a39Sopenharmony_ciALIGN (32); 451e1051a39Sopenharmony_ciLABEL ("ChaCha20_ctr32_4x"); 452e1051a39Sopenharmony_ciLABEL (".LChaCha20_ctr32_4x"); 453e1051a39Sopenharmony_ci&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); 454e1051a39Sopenharmony_ciif (!$z) { 455e1051a39Sopenharmony_ci std ("%f4","16*$SIZE_T+2*8($sp)"); 456e1051a39Sopenharmony_ci std ("%f6","16*$SIZE_T+3*8($sp)"); 457e1051a39Sopenharmony_ci} 458e1051a39Sopenharmony_ci&{$z? \&lghi:\&lhi} ("%r1",-$FRAME); 459e1051a39Sopenharmony_ci lgr ("%r0",$sp); 460e1051a39Sopenharmony_ci la ($sp,"0(%r1,$sp)"); 461e1051a39Sopenharmony_ci&{$z? 
\&stg:\&st} ("%r0","0($sp)"); # back-chain 462e1051a39Sopenharmony_ciif ($z) { 463e1051a39Sopenharmony_ci std ("%f8","$stdframe+8*0($sp)"); 464e1051a39Sopenharmony_ci std ("%f9","$stdframe+8*1($sp)"); 465e1051a39Sopenharmony_ci std ("%f10","$stdframe+8*2($sp)"); 466e1051a39Sopenharmony_ci std ("%f11","$stdframe+8*3($sp)"); 467e1051a39Sopenharmony_ci std ("%f12","$stdframe+8*4($sp)"); 468e1051a39Sopenharmony_ci std ("%f13","$stdframe+8*5($sp)"); 469e1051a39Sopenharmony_ci std ("%f14","$stdframe+8*6($sp)"); 470e1051a39Sopenharmony_ci std ("%f15","$stdframe+8*7($sp)"); 471e1051a39Sopenharmony_ci} 472e1051a39Sopenharmony_ci larl ("%r7",".Lsigma"); 473e1051a39Sopenharmony_ci lhi ("%r0",10); 474e1051a39Sopenharmony_ci lhi ("%r1",0); 475e1051a39Sopenharmony_ci 476e1051a39Sopenharmony_ci vl (@K[0],"0(%r7)"); # load sigma 477e1051a39Sopenharmony_ci vl (@K[1],"0($key)"); # load key 478e1051a39Sopenharmony_ci vl (@K[2],"16($key)"); 479e1051a39Sopenharmony_ci vl (@K[3],"0($counter)"); # load counter 480e1051a39Sopenharmony_ci 481e1051a39Sopenharmony_ci vl ($beperm,"0x40(%r7)"); 482e1051a39Sopenharmony_ci vl ($xt1,"0x50(%r7)"); 483e1051a39Sopenharmony_ci vrepf ($CTR,@K[3],0); 484e1051a39Sopenharmony_ci vlvgf (@K[3],"%r1",0); # clear @K[3].word[0] 485e1051a39Sopenharmony_ci vaf ($CTR,$CTR,$xt1); 486e1051a39Sopenharmony_ci 487e1051a39Sopenharmony_ci#LABEL (".Loop_outer_4x"); 488e1051a39Sopenharmony_ci vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma 489e1051a39Sopenharmony_ci 490e1051a39Sopenharmony_ci vrepf ($xb0,@K[1],0); # smash the key 491e1051a39Sopenharmony_ci vrepf ($xb1,@K[1],1); 492e1051a39Sopenharmony_ci vrepf ($xb2,@K[1],2); 493e1051a39Sopenharmony_ci vrepf ($xb3,@K[1],3); 494e1051a39Sopenharmony_ci 495e1051a39Sopenharmony_ci vrepf ($xc0,@K[2],0); 496e1051a39Sopenharmony_ci vrepf ($xc1,@K[2],1); 497e1051a39Sopenharmony_ci vrepf ($xc2,@K[2],2); 498e1051a39Sopenharmony_ci vrepf ($xc3,@K[2],3); 499e1051a39Sopenharmony_ci 500e1051a39Sopenharmony_ci vlr ($xd0,$CTR); 
501e1051a39Sopenharmony_ci vrepf ($xd1,@K[3],1); 502e1051a39Sopenharmony_ci vrepf ($xd2,@K[3],2); 503e1051a39Sopenharmony_ci vrepf ($xd3,@K[3],3); 504e1051a39Sopenharmony_ci 505e1051a39Sopenharmony_ciLABEL (".Loop_4x"); 506e1051a39Sopenharmony_ci VX_lane_ROUND(0, 4, 8,12); 507e1051a39Sopenharmony_ci VX_lane_ROUND(0, 5,10,15); 508e1051a39Sopenharmony_ci brct ("%r0",".Loop_4x"); 509e1051a39Sopenharmony_ci 510e1051a39Sopenharmony_ci vaf ($xd0,$xd0,$CTR); 511e1051a39Sopenharmony_ci 512e1051a39Sopenharmony_ci vmrhf ($xt0,$xa0,$xa1); # transpose data 513e1051a39Sopenharmony_ci vmrhf ($xt1,$xa2,$xa3); 514e1051a39Sopenharmony_ci vmrlf ($xt2,$xa0,$xa1); 515e1051a39Sopenharmony_ci vmrlf ($xt3,$xa2,$xa3); 516e1051a39Sopenharmony_ci vpdi ($xa0,$xt0,$xt1,0b0000); 517e1051a39Sopenharmony_ci vpdi ($xa1,$xt0,$xt1,0b0101); 518e1051a39Sopenharmony_ci vpdi ($xa2,$xt2,$xt3,0b0000); 519e1051a39Sopenharmony_ci vpdi ($xa3,$xt2,$xt3,0b0101); 520e1051a39Sopenharmony_ci 521e1051a39Sopenharmony_ci vmrhf ($xt0,$xb0,$xb1); 522e1051a39Sopenharmony_ci vmrhf ($xt1,$xb2,$xb3); 523e1051a39Sopenharmony_ci vmrlf ($xt2,$xb0,$xb1); 524e1051a39Sopenharmony_ci vmrlf ($xt3,$xb2,$xb3); 525e1051a39Sopenharmony_ci vpdi ($xb0,$xt0,$xt1,0b0000); 526e1051a39Sopenharmony_ci vpdi ($xb1,$xt0,$xt1,0b0101); 527e1051a39Sopenharmony_ci vpdi ($xb2,$xt2,$xt3,0b0000); 528e1051a39Sopenharmony_ci vpdi ($xb3,$xt2,$xt3,0b0101); 529e1051a39Sopenharmony_ci 530e1051a39Sopenharmony_ci vmrhf ($xt0,$xc0,$xc1); 531e1051a39Sopenharmony_ci vmrhf ($xt1,$xc2,$xc3); 532e1051a39Sopenharmony_ci vmrlf ($xt2,$xc0,$xc1); 533e1051a39Sopenharmony_ci vmrlf ($xt3,$xc2,$xc3); 534e1051a39Sopenharmony_ci vpdi ($xc0,$xt0,$xt1,0b0000); 535e1051a39Sopenharmony_ci vpdi ($xc1,$xt0,$xt1,0b0101); 536e1051a39Sopenharmony_ci vpdi ($xc2,$xt2,$xt3,0b0000); 537e1051a39Sopenharmony_ci vpdi ($xc3,$xt2,$xt3,0b0101); 538e1051a39Sopenharmony_ci 539e1051a39Sopenharmony_ci vmrhf ($xt0,$xd0,$xd1); 540e1051a39Sopenharmony_ci vmrhf ($xt1,$xd2,$xd3); 
541e1051a39Sopenharmony_ci vmrlf ($xt2,$xd0,$xd1); 542e1051a39Sopenharmony_ci vmrlf ($xt3,$xd2,$xd3); 543e1051a39Sopenharmony_ci vpdi ($xd0,$xt0,$xt1,0b0000); 544e1051a39Sopenharmony_ci vpdi ($xd1,$xt0,$xt1,0b0101); 545e1051a39Sopenharmony_ci vpdi ($xd2,$xt2,$xt3,0b0000); 546e1051a39Sopenharmony_ci vpdi ($xd3,$xt2,$xt3,0b0101); 547e1051a39Sopenharmony_ci 548e1051a39Sopenharmony_ci #vrepif ($xt0,4); 549e1051a39Sopenharmony_ci #vaf ($CTR,$CTR,$xt0); # next counter value 550e1051a39Sopenharmony_ci 551e1051a39Sopenharmony_ci vaf ($xa0,$xa0,@K[0]); 552e1051a39Sopenharmony_ci vaf ($xb0,$xb0,@K[1]); 553e1051a39Sopenharmony_ci vaf ($xc0,$xc0,@K[2]); 554e1051a39Sopenharmony_ci vaf ($xd0,$xd0,@K[3]); 555e1051a39Sopenharmony_ci 556e1051a39Sopenharmony_ci vperm ($xa0,$xa0,$xa0,$beperm); 557e1051a39Sopenharmony_ci vperm ($xb0,$xb0,$xb0,$beperm); 558e1051a39Sopenharmony_ci vperm ($xc0,$xc0,$xc0,$beperm); 559e1051a39Sopenharmony_ci vperm ($xd0,$xd0,$xd0,$beperm); 560e1051a39Sopenharmony_ci 561e1051a39Sopenharmony_ci #&{$z? \&clgfi:\&clfi} ($len,0x40); 562e1051a39Sopenharmony_ci #jl (".Ltail_4x"); 563e1051a39Sopenharmony_ci 564e1051a39Sopenharmony_ci vlm ($xt0,$xt3,"0($inp)"); 565e1051a39Sopenharmony_ci 566e1051a39Sopenharmony_ci vx ($xt0,$xt0,$xa0); 567e1051a39Sopenharmony_ci vx ($xt1,$xt1,$xb0); 568e1051a39Sopenharmony_ci vx ($xt2,$xt2,$xc0); 569e1051a39Sopenharmony_ci vx ($xt3,$xt3,$xd0); 570e1051a39Sopenharmony_ci 571e1051a39Sopenharmony_ci vstm ($xt0,$xt3,"0($out)"); 572e1051a39Sopenharmony_ci 573e1051a39Sopenharmony_ci la ($inp,"0x40($inp)"); 574e1051a39Sopenharmony_ci la ($out,"0x40($out)"); 575e1051a39Sopenharmony_ci&{$z? 
\&aghi:\&ahi} ($len,-0x40); 576e1051a39Sopenharmony_ci #je (".Ldone_4x"); 577e1051a39Sopenharmony_ci 578e1051a39Sopenharmony_ci vaf ($xa0,$xa1,@K[0]); 579e1051a39Sopenharmony_ci vaf ($xb0,$xb1,@K[1]); 580e1051a39Sopenharmony_ci vaf ($xc0,$xc1,@K[2]); 581e1051a39Sopenharmony_ci vaf ($xd0,$xd1,@K[3]); 582e1051a39Sopenharmony_ci 583e1051a39Sopenharmony_ci vperm ($xa0,$xa0,$xa0,$beperm); 584e1051a39Sopenharmony_ci vperm ($xb0,$xb0,$xb0,$beperm); 585e1051a39Sopenharmony_ci vperm ($xc0,$xc0,$xc0,$beperm); 586e1051a39Sopenharmony_ci vperm ($xd0,$xd0,$xd0,$beperm); 587e1051a39Sopenharmony_ci 588e1051a39Sopenharmony_ci&{$z? \&clgfi:\&clfi} ($len,0x40); 589e1051a39Sopenharmony_ci jl (".Ltail_4x"); 590e1051a39Sopenharmony_ci 591e1051a39Sopenharmony_ci vlm ($xt0,$xt3,"0($inp)"); 592e1051a39Sopenharmony_ci 593e1051a39Sopenharmony_ci vx ($xt0,$xt0,$xa0); 594e1051a39Sopenharmony_ci vx ($xt1,$xt1,$xb0); 595e1051a39Sopenharmony_ci vx ($xt2,$xt2,$xc0); 596e1051a39Sopenharmony_ci vx ($xt3,$xt3,$xd0); 597e1051a39Sopenharmony_ci 598e1051a39Sopenharmony_ci vstm ($xt0,$xt3,"0($out)"); 599e1051a39Sopenharmony_ci 600e1051a39Sopenharmony_ci la ($inp,"0x40($inp)"); 601e1051a39Sopenharmony_ci la ($out,"0x40($out)"); 602e1051a39Sopenharmony_ci&{$z? \&aghi:\&ahi} ($len,-0x40); 603e1051a39Sopenharmony_ci je (".Ldone_4x"); 604e1051a39Sopenharmony_ci 605e1051a39Sopenharmony_ci vaf ($xa0,$xa2,@K[0]); 606e1051a39Sopenharmony_ci vaf ($xb0,$xb2,@K[1]); 607e1051a39Sopenharmony_ci vaf ($xc0,$xc2,@K[2]); 608e1051a39Sopenharmony_ci vaf ($xd0,$xd2,@K[3]); 609e1051a39Sopenharmony_ci 610e1051a39Sopenharmony_ci vperm ($xa0,$xa0,$xa0,$beperm); 611e1051a39Sopenharmony_ci vperm ($xb0,$xb0,$xb0,$beperm); 612e1051a39Sopenharmony_ci vperm ($xc0,$xc0,$xc0,$beperm); 613e1051a39Sopenharmony_ci vperm ($xd0,$xd0,$xd0,$beperm); 614e1051a39Sopenharmony_ci 615e1051a39Sopenharmony_ci&{$z? 
\&clgfi:\&clfi} ($len,0x40); 616e1051a39Sopenharmony_ci jl (".Ltail_4x"); 617e1051a39Sopenharmony_ci 618e1051a39Sopenharmony_ci vlm ($xt0,$xt3,"0($inp)"); 619e1051a39Sopenharmony_ci 620e1051a39Sopenharmony_ci vx ($xt0,$xt0,$xa0); 621e1051a39Sopenharmony_ci vx ($xt1,$xt1,$xb0); 622e1051a39Sopenharmony_ci vx ($xt2,$xt2,$xc0); 623e1051a39Sopenharmony_ci vx ($xt3,$xt3,$xd0); 624e1051a39Sopenharmony_ci 625e1051a39Sopenharmony_ci vstm ($xt0,$xt3,"0($out)"); 626e1051a39Sopenharmony_ci 627e1051a39Sopenharmony_ci la ($inp,"0x40($inp)"); 628e1051a39Sopenharmony_ci la ($out,"0x40($out)"); 629e1051a39Sopenharmony_ci&{$z? \&aghi:\&ahi} ($len,-0x40); 630e1051a39Sopenharmony_ci je (".Ldone_4x"); 631e1051a39Sopenharmony_ci 632e1051a39Sopenharmony_ci vaf ($xa0,$xa3,@K[0]); 633e1051a39Sopenharmony_ci vaf ($xb0,$xb3,@K[1]); 634e1051a39Sopenharmony_ci vaf ($xc0,$xc3,@K[2]); 635e1051a39Sopenharmony_ci vaf ($xd0,$xd3,@K[3]); 636e1051a39Sopenharmony_ci 637e1051a39Sopenharmony_ci vperm ($xa0,$xa0,$xa0,$beperm); 638e1051a39Sopenharmony_ci vperm ($xb0,$xb0,$xb0,$beperm); 639e1051a39Sopenharmony_ci vperm ($xc0,$xc0,$xc0,$beperm); 640e1051a39Sopenharmony_ci vperm ($xd0,$xd0,$xd0,$beperm); 641e1051a39Sopenharmony_ci 642e1051a39Sopenharmony_ci&{$z? \&clgfi:\&clfi} ($len,0x40); 643e1051a39Sopenharmony_ci jl (".Ltail_4x"); 644e1051a39Sopenharmony_ci 645e1051a39Sopenharmony_ci vlm ($xt0,$xt3,"0($inp)"); 646e1051a39Sopenharmony_ci 647e1051a39Sopenharmony_ci vx ($xt0,$xt0,$xa0); 648e1051a39Sopenharmony_ci vx ($xt1,$xt1,$xb0); 649e1051a39Sopenharmony_ci vx ($xt2,$xt2,$xc0); 650e1051a39Sopenharmony_ci vx ($xt3,$xt3,$xd0); 651e1051a39Sopenharmony_ci 652e1051a39Sopenharmony_ci vstm ($xt0,$xt3,"0($out)"); 653e1051a39Sopenharmony_ci 654e1051a39Sopenharmony_ci #la $inp,0x40($inp)); 655e1051a39Sopenharmony_ci #la $out,0x40($out)); 656e1051a39Sopenharmony_ci #lhi %r0,10); 657e1051a39Sopenharmony_ci #&{$z? 
\&aghi:\&ahi} $len,-0x40); 658e1051a39Sopenharmony_ci #jne .Loop_outer_4x); 659e1051a39Sopenharmony_ci 660e1051a39Sopenharmony_ciLABEL (".Ldone_4x"); 661e1051a39Sopenharmony_ciif (!$z) { 662e1051a39Sopenharmony_ci ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); 663e1051a39Sopenharmony_ci ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); 664e1051a39Sopenharmony_ci} else { 665e1051a39Sopenharmony_ci ld ("%f8","$stdframe+8*0($sp)"); 666e1051a39Sopenharmony_ci ld ("%f9","$stdframe+8*1($sp)"); 667e1051a39Sopenharmony_ci ld ("%f10","$stdframe+8*2($sp)"); 668e1051a39Sopenharmony_ci ld ("%f11","$stdframe+8*3($sp)"); 669e1051a39Sopenharmony_ci ld ("%f12","$stdframe+8*4($sp)"); 670e1051a39Sopenharmony_ci ld ("%f13","$stdframe+8*5($sp)"); 671e1051a39Sopenharmony_ci ld ("%f14","$stdframe+8*6($sp)"); 672e1051a39Sopenharmony_ci ld ("%f15","$stdframe+8*7($sp)"); 673e1051a39Sopenharmony_ci} 674e1051a39Sopenharmony_ci&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); 675e1051a39Sopenharmony_ci la ($sp,"$FRAME($sp)"); 676e1051a39Sopenharmony_ci br ("%r14"); 677e1051a39Sopenharmony_ci 678e1051a39Sopenharmony_ciALIGN (16); 679e1051a39Sopenharmony_ciLABEL (".Ltail_4x"); 680e1051a39Sopenharmony_ciif (!$z) { 681e1051a39Sopenharmony_ci vlr ($xt0,$xb0); 682e1051a39Sopenharmony_ci ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); 683e1051a39Sopenharmony_ci ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); 684e1051a39Sopenharmony_ci 685e1051a39Sopenharmony_ci vst ($xa0,"$stdframe+0x00($sp)"); 686e1051a39Sopenharmony_ci vst ($xt0,"$stdframe+0x10($sp)"); 687e1051a39Sopenharmony_ci vst ($xc0,"$stdframe+0x20($sp)"); 688e1051a39Sopenharmony_ci vst ($xd0,"$stdframe+0x30($sp)"); 689e1051a39Sopenharmony_ci} else { 690e1051a39Sopenharmony_ci vlr ($xt0,$xc0); 691e1051a39Sopenharmony_ci ld ("%f8","$stdframe+8*0($sp)"); 692e1051a39Sopenharmony_ci ld ("%f9","$stdframe+8*1($sp)"); 693e1051a39Sopenharmony_ci ld ("%f10","$stdframe+8*2($sp)"); 694e1051a39Sopenharmony_ci ld ("%f11","$stdframe+8*3($sp)"); 695e1051a39Sopenharmony_ci vlr 
($xt1,$xd0); 696e1051a39Sopenharmony_ci ld ("%f12","$stdframe+8*4($sp)"); 697e1051a39Sopenharmony_ci ld ("%f13","$stdframe+8*5($sp)"); 698e1051a39Sopenharmony_ci ld ("%f14","$stdframe+8*6($sp)"); 699e1051a39Sopenharmony_ci ld ("%f15","$stdframe+8*7($sp)"); 700e1051a39Sopenharmony_ci 701e1051a39Sopenharmony_ci vst ($xa0,"$stdframe+0x00($sp)"); 702e1051a39Sopenharmony_ci vst ($xb0,"$stdframe+0x10($sp)"); 703e1051a39Sopenharmony_ci vst ($xt0,"$stdframe+0x20($sp)"); 704e1051a39Sopenharmony_ci vst ($xt1,"$stdframe+0x30($sp)"); 705e1051a39Sopenharmony_ci} 706e1051a39Sopenharmony_ci lghi ("%r1",0); 707e1051a39Sopenharmony_ci 708e1051a39Sopenharmony_ciLABEL (".Loop_tail_4x"); 709e1051a39Sopenharmony_ci llgc ("%r5","0(%r1,$inp)"); 710e1051a39Sopenharmony_ci llgc ("%r6","$stdframe(%r1,$sp)"); 711e1051a39Sopenharmony_ci xr ("%r6","%r5"); 712e1051a39Sopenharmony_ci stc ("%r6","0(%r1,$out)"); 713e1051a39Sopenharmony_ci la ("%r1","1(%r1)"); 714e1051a39Sopenharmony_ci brct ($len,".Loop_tail_4x"); 715e1051a39Sopenharmony_ci 716e1051a39Sopenharmony_ci&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); 717e1051a39Sopenharmony_ci la ($sp,"$FRAME($sp)"); 718e1051a39Sopenharmony_ci br ("%r14"); 719e1051a39Sopenharmony_ciSIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x"); 720e1051a39Sopenharmony_ci} 721e1051a39Sopenharmony_ci 722e1051a39Sopenharmony_ci######################################################################## 723e1051a39Sopenharmony_ci# 6x"horizontal" layout is optimal fit for the platform in its current 724e1051a39Sopenharmony_ci# shape, more specifically for given vector instructions' latency. Well, 725e1051a39Sopenharmony_ci# computational part of 8x"vertical" would be faster, but it consumes 726e1051a39Sopenharmony_ci# all registers and dealing with that will diminish the return... 
#
# ChaCha20_ctr32_vx: 6x"horizontal" vector code path.  Every pass of
# .Loop_outer_vx computes six 64-byte keystream blocks; block <i> lives
# in ($a<i>,$b<i>,$c<i>,$d<i>), one state row per vector register.
# Inputs of 256 bytes or less are handed to .LChaCha20_ctr32_4x.
#
# Register plan:
#	%v0-%v23	six working states ($a0..$d5)
#	@K		input state rows: sigma (%v27), key (%v24,%v25),
#			counter/nonce (%v26)
#	$t0-$t3		%v27-%v30, scratch; note that $t0 *is* @K[0]
#	$beperm		%v31, byte-swap permutation for vperm
#
# $len, $inp, $out, $key and $counter are declared earlier in the file.
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
# Standard frame plus 64 bytes at $stdframe($sp).  With the 64-bit ABI
# that area holds %f8-%f15 ($FRAME-8*8 == $stdframe); .Ltail_vx reuses
# the very same bytes as keystream scratch.
my $FRAME=$stdframe + 4*16;

GLOBL	("ChaCha20_ctr32_vx");
ALIGN	(32);
LABEL	("ChaCha20_ctr32_vx");
LABEL	(".LChaCha20_ctr32_vx");
&{$z?	\&clgfi:\&clfi}	($len,256);
	jle	(".LChaCha20_ctr32_4x");	# short input: use 4x code
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	# 31-bit ABI: %f4/%f6 go to the frame $sp points at on entry
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");		# allocate $FRAME bytes
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	# 64-bit ABI: %f8-%f15 go to the top 64 bytes of the new frame
	std	("%f8","$FRAME-8*8($sp)");
	std	("%f9","$FRAME-8*7($sp)");
	std	("%f10","$FRAME-8*6($sp)");
	std	("%f11","$FRAME-8*5($sp)");
	std	("%f12","$FRAME-8*4($sp)");
	std	("%f13","$FRAME-8*3($sp)");
	std	("%f14","$FRAME-8*2($sp)");
	std	("%f15","$FRAME-8*1($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);			# .Loop_vx iteration count

	vlm	(@K[1],@K[2],"0($key)");	# load key
	vl	(@K[3],"0($counter)");		# load counter

	vlm	(@K[0],$beperm,"0(%r7)");	# load sigma, increments, ...

LABEL	(".Loop_outer_vx");
	vlr	($a0,@K[0]);
	vlr	($b0,@K[1]);
	vlr	($a1,@K[0]);
	vlr	($b1,@K[1]);
	vlr	($a2,@K[0]);
	vlr	($b2,@K[1]);
	vlr	($a3,@K[0]);
	vlr	($b3,@K[1]);
	vlr	($a4,@K[0]);
	vlr	($b4,@K[1]);
	vlr	($a5,@K[0]);
	vlr	($b5,@K[1]);

	# $t1..$t3 hold the increments 1, 2, 3 at this point
	vlr	($d0,@K[3]);
	vaf	($d1,@K[3],$t1);		# K[3]+1
	vaf	($d2,@K[3],$t2);		# K[3]+2
	vaf	($d3,@K[3],$t3);		# K[3]+3
	vaf	($d4,$d2,$t2);			# K[3]+4
	vaf	($d5,$d2,$t3);			# K[3]+5

	vlr	($c0,@K[2]);
	vlr	($c1,@K[2]);
	vlr	($c2,@K[2]);
	vlr	($c3,@K[2]);
	vlr	($c4,@K[2]);
	vlr	($c5,@K[2]);

	# keep the initial counter rows of blocks 1..3 for the final
	# addition; this overwrites the increments in $t1..$t3
	vlr	($t1,$d1);
	vlr	($t2,$d2);
	vlr	($t3,$d3);

ALIGN	(4);
LABEL	(".Loop_vx");

	# NOTE(review): VX_ROUND is defined outside this chunk; the
	# trailing 0/1 presumably selects the column resp. diagonal
	# round - confirm against its definition.
	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct	("%r0",".Loop_vx");

	# ---- block 0: add input state, byte-swap, xor, store ----
	vaf	($a0,$a0,@K[0]);
	vaf	($b0,$b0,@K[1]);
	vaf	($c0,$c0,@K[2]);
	vaf	($d0,$d0,@K[3]);
	vaf	($a1,$a1,@K[0]);		# done early, @K[0] is $t0
	vaf	($d1,$d1,$t1);			# +K[3]+1

	vperm	($a0,$a0,$a0,$beperm);
	vperm	($b0,$b0,$b0,$beperm);
	vperm	($c0,$c0,$c0,$beperm);
	vperm	($d0,$d0,$d0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	# consume $t2/$t3 before they become the input buffer
	vaf	($d2,$d2,$t2);			# +K[3]+2
	vaf	($d3,$d3,$t3);			# +K[3]+3
	vlm	($t0,$t3,"0($inp)");

	vx	($a0,$a0,$t0);
	vx	($b0,$b0,$t1);
	vx	($c0,$c0,$t2);
	vx	($d0,$d0,$t3);

	vlm	(@K[0],$t3,"0(%r7)");		# re-load sigma and increments

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 1: from here on $a0..$d0 carry the finished
	# keystream and $a1..$d1 double as input buffer ----
	vaf	($b1,$b1,@K[1]);
	vaf	($c1,$c1,@K[2]);

	vperm	($a0,$a1,$a1,$beperm);
	vperm	($b0,$b1,$b1,$beperm);
	vperm	($c0,$c1,$c1,$beperm);
	vperm	($d0,$d1,$d1,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 2 ($d2 already has its counter row added) ----
	vaf	($a2,$a2,@K[0]);
	vaf	($b2,$b2,@K[1]);
	vaf	($c2,$c2,@K[2]);

	vperm	($a0,$a2,$a2,$beperm);
	vperm	($b0,$b2,$b2,$beperm);
	vperm	($c0,$c2,$c2,$beperm);
	vperm	($d0,$d2,$d2,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 3 ($d3 already has its counter row added);
	# $d2 is free now and starts rebuilding counter values ----
	vaf	($a3,$a3,@K[0]);
	vaf	($b3,$b3,@K[1]);
	vaf	($c3,$c3,@K[2]);
	vaf	($d2,@K[3],$t3);		# K[3]+3

	vperm	($a0,$a3,$a3,$beperm);
	vperm	($b0,$b3,$b3,$beperm);
	vperm	($c0,$c3,$c3,$beperm);
	vperm	($d0,$d3,$d3,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d3,$d2,$t1);			# K[3]+4
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 4; also advance the counter for the next pass ----
	vaf	($a4,$a4,@K[0]);
	vaf	($b4,$b4,@K[1]);
	vaf	($c4,$c4,@K[2]);
	vaf	($d4,$d4,$d3);			# +K[3]+4
	vaf	($d3,$d3,$t1);			# K[3]+5
	vaf	(@K[3],$d2,$t3);		# K[3]+=6

	vperm	($a0,$a4,$a4,$beperm);
	vperm	($b0,$b4,$b4,$beperm);
	vperm	($c0,$c4,$c4,$beperm);
	vperm	($d0,$d4,$d4,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	# ---- block 5 ----
	vaf	($a5,$a5,@K[0]);
	vaf	($b5,$b5,@K[1]);
	vaf	($c5,$c5,@K[2]);
	vaf	($d5,$d5,$d3);			# +K[3]+5

	vperm	($a0,$a5,$a5,$beperm);
	vperm	($b0,$b5,$b5,$beperm);
	vperm	($c0,$c5,$c5,$beperm);
	vperm	($d0,$d5,$d5,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	lhi	("%r0",10);			# re-arm .Loop_vx counter
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	jne	(".Loop_outer_vx");

LABEL	(".Ldone_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_vx");
# Fewer than 64 bytes are left and $a0..$d0 hold the keystream block.
# The floating point registers have to be restored *before* the
# keystream is spilled: the scratch area at $stdframe($sp) overlaps the
# %f8-%f15 save slots.
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	vstm	($a0,$d0,"$stdframe($sp)");
	lghi	("%r1",0);			# byte index

LABEL	(".Loop_tail_vx");
	# out[i] = inp[i] ^ keystream[i], one byte per iteration
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_vx");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

# Constant pool.  ChaCha20_ctr32_vx loads the first five rows with one
# vlm into %v27-%v31: sigma, the counter increments 1..3 and the vperm
# byte-swap mask.
ALIGN	(32);
LABEL	(".Lsigma");
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
LONG	(1,0,0,0);
LONG	(2,0,0,0);
LONG	(3,0,0,0);
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap

# NOTE(review): the rows below are not referenced by the 6x routine;
# presumably they serve the 4x code path - confirm against its prologue.
LONG	(0,1,2,3);
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN	(4);

PERLASM_END();