#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42        1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31        1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40        2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 conventions are selected either by flavour (nasm/masm/mingw64) or
# by an output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script and then
# under ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler/compiler version to pick a code-generation level,
# accumulated in $avx as a small integer.  Later sections gate code on it:
# any non-zero value enables the XOP dispatch emitted in ChaCha20_ssse3,
# and a value above 2 enables the AVX512 dispatch emitted in
# ChaCha20_ctr32.  Every probe after the first runs only while $avx is
# still unset.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through
# x86_64-xlate.pl, which receives $flavour and the output file name.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

# Constant tables.  .Lone, .Lsigma, .Lrot16 and .Lrot24 are referenced by
# the code paths visible below; the remaining tables are presumably used
# by the wider SIMD paths further down the file.
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# AUTOLOAD catches calls to otherwise undefined subs, e.g. &add(...), and
# appends one instruction to $code.  The sub name becomes the mnemonic.
# The LAST call argument is emitted first (prefixed with '$' when it is
# numeric, i.e. an immediate), followed by the remaining arguments in
# reverse order, so callers write operands destination-first.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# @x maps the 16 state-word indices to 32-bit registers.  Indices 8..11
# are "%nox" placeholders: those words are not register-resident (see the
# comment inside ROUND); @t holds the two of them currently in flight.
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

# ROUND(a0,b0,c0,d0) returns a list of strings, each a perlasm call meant
# to be eval'ed by the caller.  The four arguments are the state indices
# of the first quarter-round; the other three index sets are derived by
# rotating the low two bits of each index.
sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are dying breed, old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	 "&add	(@x[$a1],@x[$b1])",	# Q2
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	 "&add	(@x[$a1],@x[$b1])",
	 "&xor	(@x[$d1],@x[$a1])",
	 "&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	 "&add	($xc_,@x[$d1])",
	 "&xor	(@x[$b1],$xc_)",
	 "&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	 "&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	 "&add	(@x[$a3],@x[$b3])",	# Q4
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	 "&add	(@x[$a3],@x[$b3])",
	 "&xor	(@x[$d3],@x[$a3])",
	 "&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	 "&add	($xc_,@x[$d3])",
	 "&xor	(@x[$b3],$xc_)",
	 "&rol	(@x[$b3],7)"
	);
}

########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]      # 'expa'
	mov	\$0x3320646e,@x[1]      # 'nd 3'
	mov	\$0x79622d32,@x[2]      # '2-by'
	mov	\$0x6b206574,@x[3]      # 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	# Loop body: one ROUND on the column indices and one on the diagonal
	# indices; the counter in %ebp is loaded with 10 above.
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND (0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]      # 'expa'
	add	\$0x3320646e,@x[1]      # 'nd 3'
	add	\$0x79622d32,@x[2]      # '2-by'
	add	\$0x6b206574,@x[3]      # 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

# Emits (through the AUTOLOAD thunk) one round over the four row
# registers $a..$d.  $d is permuted with pshufb using the $rot16 and
# $rot24 masks; $b is rotated with psrld/pslld/por pairs (20/12 and
# 25/7), using $t as scratch.  Takes no arguments and returns nothing
# useful.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

# Extra stack space on top of the 64-byte state copy; the Win64 variant
# leaves room for the %xmm6/%xmm7 saves emitted below.
my $xframe = $win64 ? 160+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	# Two rounds per iteration; the pshufd triples re-arrange the lanes of
	# $b/$c/$d after the first round and undo that after the second.
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

# Same instruction sequence as SSSE3ROUND, emitted for two independent
# register sets ($a..$d and $a1..$d1) with the two streams interleaved;
# $t and $t1 are the respective scratch registers.
sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

# Extra stack space on top of the 64-byte state copy; the Win64 variant
# leaves room for the %xmm6..%xmm11 saves emitted below.
my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	# Two double-rounds per iteration, with the same lane re-arrangement
	# as the single-block path applied to both register sets.
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
# State index -> register map used by SSSE3_lane_ROUND.  Indices 8..11
# (the 'c' words) have no fixed register, hence the "%nox" placeholders.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

# SSSE3_lane_ROUND - build the instruction list for one round over four
# lane-sliced blocks.
#
# Arguments: the state indices ($a0,$b0,$c0,$d0) of the first quarter
#            round; the other three are derived by stepping each index
#            to the next position inside its group of four.
# Returns:   a list of strings, each one a perlasm emitter call; the
#            caller is expected to eval every element in order.
#
# Register contract visible in the emitted code: $xt0/$xt1 carry the
# current pair of 'c' words, $xt2/$xt3 are scratch, %r10 and %r11 are
# dereferenced for the rot16 and rot24 shuffle masks, and $xt3 must
# already hold the (%r10) mask on entry (each half leaves it reloaded).
sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&paddd	(@x[$a0],@x[$b0])",	# Q1
	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor	(@x[$d0],@x[$a0])",
	 "&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	 "&pshufb	(@x[$d1],$t1)",

	"&paddd	($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	 "&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld	(@x[$b0],12)",
	"&psrld	($t0,20)",
	 "&movdqa	($t1,@x[$b1])",
	 "&pslld	(@x[$b1],12)",
	"&por	(@x[$b0],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por	(@x[$b1],$t1)",

	"&paddd	(@x[$a0],@x[$b0])",
	 "&paddd	(@x[$a1],@x[$b1])",
	"&pxor	(@x[$d0],@x[$a0])",
	 "&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	 "&pshufb	(@x[$d1],$t0)",

	"&paddd	($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	 "&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld	(@x[$b0],7)",
	"&psrld	($t1,25)",
	 "&movdqa	($t0,@x[$b1])",
	 "&pslld	(@x[$b1],7)",
	"&por	(@x[$b0],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por	(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd	(@x[$a2],@x[$b2])",	# Q3
	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor	(@x[$d2],@x[$a2])",
	 "&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	 "&pshufb	(@x[$d3],$t1)",

	"&paddd	($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	 "&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld	(@x[$b2],12)",
	"&psrld	($t0,20)",
	 "&movdqa	($t1,@x[$b3])",
	 "&pslld	(@x[$b3],12)",
	"&por	(@x[$b2],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por	(@x[$b3],$t1)",

	"&paddd	(@x[$a2],@x[$b2])",
	 "&paddd	(@x[$a3],@x[$b3])",
	"&pxor	(@x[$d2],@x[$a2])",
	 "&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	 "&pshufb	(@x[$d3],$t0)",

	"&paddd	($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	 "&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld	(@x[$b2],7)",
	"&psrld	($t1,25)",
	 "&movdqa	($t0,@x[$b3])",
	 "&pslld	(@x[$b3],7)",
	"&por	(@x[$b2],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por	(@x[$b3],$t0)"
	);
}

# Extra frame space below the 0x140-byte working area.  On Win64 it is
# 0xa8 bytes, used to save %xmm6-%xmm15 at -0xa8..-0x18(%r9).
my $xframe = $win64 ? 0xa8 : 8;

# ChaCha20_4x - declared as a 5-argument function; processes four
# 64-byte blocks per outer iteration, one state word per register with
# the four blocks spread across the dword lanes.
#
# Entry dispatch as emitted below:
#   * %r10 is copied to %r11 and both are tested as CPU capability bits
#     (NOTE(review): presumably loaded from OPENSSL_ia32cap_P by the
#     dispatcher outside this section - confirm);
#   * when $avx>1, bit 5 of the upper 32 bits diverts to .LChaCha20_8x;
#   * for $len <= 192, "MOVBE set, XSAVE clear" diverts to
#     .Ldo_sse3_after_all, a label defined outside this section.
$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	mov		%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	mov		%r10,%r11
___
$code.=<<___ if ($avx>1);
	shr		\$32,%r10		# OPENSSL_ia32cap_P+8
	test		\$`1<<5`,%r10		# test AVX2
	jnz		.LChaCha20_8x
___
$code.=<<___;
	cmp		\$192,$len
	ja		.Lproceed4x

	and		\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp		\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je		.Ldo_sse3_after_all	# to detect Atom

.Lproceed4x:
	sub		\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-11]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___ if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L4x_body:
___
# Broadcast every input word across the four lanes with pshufd and park
# the copies on the stack.  %rcx = %rsp+0x100 is a shorter-encoding base
# for the upper slots; %r10/%r11 are repointed at the shuffle masks that
# SSSE3_lane_ROUND dereferences.  The first counter row gets .Linc added
# (NOTE(review): presumably per-lane offsets 0..3 - table is outside
# this section) and is stored only at .Loop_enter4x.
$code.=<<___;
	movdqa		.Lsigma(%rip),$xa3	# key[0]
	movdqu		($key),$xb3		# key[1]
	movdqu		16($key),$xt3		# key[2]
	movdqu		($counter),$xd3		# key[3]
	lea		0x100(%rsp),%rcx	# size optimization
	lea		.Lrot16(%rip),%r10
	lea		.Lrot24(%rip),%r11

	pshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd		\$0x55,$xa3,$xa1
	movdqa		$xa0,0x40(%rsp)		# ... and offload
	pshufd		\$0xaa,$xa3,$xa2
	movdqa		$xa1,0x50(%rsp)
	pshufd		\$0xff,$xa3,$xa3
	movdqa		$xa2,0x60(%rsp)
	movdqa		$xa3,0x70(%rsp)

	pshufd		\$0x00,$xb3,$xb0
	pshufd		\$0x55,$xb3,$xb1
	movdqa		$xb0,0x80-0x100(%rcx)
	pshufd		\$0xaa,$xb3,$xb2
	movdqa		$xb1,0x90-0x100(%rcx)
	pshufd		\$0xff,$xb3,$xb3
	movdqa		$xb2,0xa0-0x100(%rcx)
	movdqa		$xb3,0xb0-0x100(%rcx)

	pshufd		\$0x00,$xt3,$xt0	# "$xc0"
	pshufd		\$0x55,$xt3,$xt1	# "$xc1"
	movdqa		$xt0,0xc0-0x100(%rcx)
	pshufd		\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa		$xt1,0xd0-0x100(%rcx)
	pshufd		\$0xff,$xt3,$xt3	# "$xc3"
	movdqa		$xt2,0xe0-0x100(%rcx)
	movdqa		$xt3,0xf0-0x100(%rcx)

	pshufd		\$0x00,$xd3,$xd0
	pshufd		\$0x55,$xd3,$xd1
	paddd		.Linc(%rip),$xd0	# don't save counters yet
	pshufd		\$0xaa,$xd3,$xd2
	movdqa		$xd1,0x110-0x100(%rcx)
	pshufd		\$0xff,$xd3,$xd3
	movdqa		$xd2,0x120-0x100(%rcx)
	movdqa		$xd3,0x130-0x100(%rcx)

	jmp		.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa		0x40(%rsp),$xa0		# re-load smashed key
	movdqa		0x50(%rsp),$xa1
	movdqa		0x60(%rsp),$xa2
	movdqa		0x70(%rsp),$xa3
	movdqa		0x80-0x100(%rcx),$xb0
	movdqa		0x90-0x100(%rcx),$xb1
	movdqa		0xa0-0x100(%rcx),$xb2
	movdqa		0xb0-0x100(%rcx),$xb3
	movdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa		0x100-0x100(%rcx),$xd0
	movdqa		0x110-0x100(%rcx),$xd1
	movdqa		0x120-0x100(%rcx),$xd2
	movdqa		0x130-0x100(%rcx),$xd3
	paddd		.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa		$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa		$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa		(%r10),$xt3		# .Lrot16(%rip)
	mov		\$10,%eax
	movdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp		.Loop4x

.align	32
.Loop4x:
___
	# One column round followed by one diagonal round per iteration;
	# every string returned by SSSE3_lane_ROUND is eval'ed in order.
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec		%eax
	jnz		.Loop4x

	paddd		0x40(%rsp),$xa0		# accumulate key material
	paddd		0x50(%rsp),$xa1
	paddd		0x60(%rsp),$xa2
	paddd		0x70(%rsp),$xa3

	movdqa		$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa		$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa		$xa0,$xa1
	punpcklqdq	$xa2,$xa0		# "a0"
	movdqa		$xt2,$xa3
	punpcklqdq	$xt3,$xt2		# "a2"
	punpckhqdq	$xa2,$xa1		# "a1"
	punpckhqdq	$xt3,$xa3		# "a3"
___
	# The transposed "a2" row ended up in the register named $xt2, so
	# swap the Perl names instead of emitting a register move.  The
	# same rename follows each of the b, c and d transposes below.
	($xa2,$xt2)=($xt2,$xa2);
$code.=<<___;
	paddd		0x80-0x100(%rcx),$xb0
	paddd		0x90-0x100(%rcx),$xb1
	paddd		0xa0-0x100(%rcx),$xb2
	paddd		0xb0-0x100(%rcx),$xb3

	movdqa		$xa0,0x00(%rsp)		# offload $xaN
	movdqa		$xa1,0x10(%rsp)
	movdqa		0x20(%rsp),$xa0		# "xc2"
	movdqa		0x30(%rsp),$xa1		# "xc3"

	movdqa		$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa		$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa		$xb0,$xb1
	punpcklqdq	$xb2,$xb0		# "b0"
	movdqa		$xt2,$xb3
	punpcklqdq	$xt3,$xt2		# "b2"
	punpckhqdq	$xb2,$xb1		# "b1"
	punpckhqdq	$xt3,$xb3		# "b3"
___
	($xb2,$xt2)=($xt2,$xb2);
	# 'c' words: two are still in $xt0/$xt1, the other two were just
	# reloaded from the stack into the freed $xa0/$xa1.
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	paddd		0xc0-0x100(%rcx),$xc0
	paddd		0xd0-0x100(%rcx),$xc1
	paddd		0xe0-0x100(%rcx),$xc2
	paddd		0xf0-0x100(%rcx),$xc3

	movdqa		$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa		$xa3,0x30(%rsp)

	movdqa		$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa		$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa		$xc0,$xc1
	punpcklqdq	$xc2,$xc0		# "c0"
	movdqa		$xt2,$xc3
	punpcklqdq	$xt3,$xt2		# "c2"
	punpckhqdq	$xc2,$xc1		# "c1"
	punpckhqdq	$xt3,$xc3		# "c3"
___
	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
$code.=<<___;
	paddd		0x100-0x100(%rcx),$xd0
	paddd		0x110-0x100(%rcx),$xd1
	paddd		0x120-0x100(%rcx),$xd2
	paddd		0x130-0x100(%rcx),$xd3

	movdqa		$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa		$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa		$xd0,$xd1
	punpcklqdq	$xd2,$xd0		# "d0"
	movdqa		$xt2,$xd3
	punpcklqdq	$xt3,$xt2		# "d2"
	punpckhqdq	$xd2,$xd1		# "d1"
	punpckhqdq	$xt3,$xd3		# "d3"
___
	($xd2,$xt2)=($xt2,$xd2);
# Output stage.  With at least 256 bytes left, all four blocks are
# XORed and stored and the outer loop repeats while $len is non-zero.
# Otherwise .Ltail4x handles whole 64-byte blocks first (the "je" after
# each group still sees the flags of the preceding "cmp", since the
# SSE moves and pxor in between do not modify them), then copies the
# next keystream block to 0x00..0x3f(%rsp) and finishes byte by byte in
# .Loop_tail4x.
$code.=<<___;
	cmp		\$64*4,$len
	jb		.Ltail4x

	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# size optimization
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3

	movdqu		$xt0,0x40($out)
	movdqu		0x00($inp),$xt0
	movdqu		$xt1,0x50($out)
	movdqu		0x10($inp),$xt1
	movdqu		$xt2,0x60($out)
	movdqu		0x20($inp),$xt2
	movdqu		$xt3,0x70($out)
	lea		0x80($out),$out		# size optimization
	movdqu		0x30($inp),$xt3
	pxor		0x20(%rsp),$xt0
	pxor		$xb2,$xt1
	pxor		$xc2,$xt2
	pxor		$xd2,$xt3

	movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# inp+=64*4
	pxor		0x30(%rsp),$xt0
	pxor		$xb3,$xt1
	pxor		$xc3,$xt2
	pxor		$xd3,$xt3
	movdqu		$xt0,0x40($out)
	movdqu		$xt1,0x50($out)
	movdqu		$xt2,0x60($out)
	movdqu		$xt3,0x70($out)
	lea		0x80($out),$out		# out+=64*4

	sub		\$64*4,$len
	jnz		.Loop_outer4x

	jmp		.Ldone4x

.Ltail4x:
	cmp		\$192,$len
	jae		.L192_or_more4x
	cmp		\$128,$len
	jae		.L128_or_more4x
	cmp		\$64,$len
	jae		.L64_or_more4x

	#movdqa		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor		%r10,%r10
	#movdqa		$xt0,0x00(%rsp)
	movdqa		$xb0,0x10(%rsp)
	movdqa		$xc0,0x20(%rsp)
	movdqa		$xd0,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaxN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3
	movdqu		$xt0,0x00($out)
	movdqu		$xt1,0x10($out)
	movdqu		$xt2,0x20($out)
	movdqu		$xt3,0x30($out)
	je		.Ldone4x

	movdqa		0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x40($inp),$inp		# inp+=64*1
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb1,0x10(%rsp)
	lea		0x40($out),$out		# out+=64*1
	movdqa		$xc1,0x20(%rsp)
	sub		\$64,$len		# len-=64*1
	movdqa		$xd1,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3
	movdqu		$xt0,0x40($out)
	movdqu		$xt1,0x50($out)
	movdqu		$xt2,0x60($out)
	movdqu		$xt3,0x70($out)
	je		.Ldone4x

	movdqa		0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x80($inp),$inp		# inp+=64*2
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb2,0x10(%rsp)
	lea		0x80($out),$out		# out+=64*2
	movdqa		$xc2,0x20(%rsp)
	sub		\$128,$len		# len-=64*2
	movdqa		$xd2,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# size optimization
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3

	movdqu		$xt0,0x40($out)
	movdqu		0x00($inp),$xt0
	movdqu		$xt1,0x50($out)
	movdqu		0x10($inp),$xt1
	movdqu		$xt2,0x60($out)
	movdqu		0x20($inp),$xt2
	movdqu		$xt3,0x70($out)
	lea		0x80($out),$out		# size optimization
	movdqu		0x30($inp),$xt3
	pxor		0x20(%rsp),$xt0
	pxor		$xb2,$xt1
	pxor		$xc2,$xt2
	pxor		$xd2,$xt3
	movdqu		$xt0,0x00($out)
	movdqu		$xt1,0x10($out)
	movdqu		$xt2,0x20($out)
	movdqu		$xt3,0x30($out)
	je		.Ldone4x

	movdqa		0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x40($inp),$inp		# inp+=64*3
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb3,0x10(%rsp)
	lea		0x40($out),$out		# out+=64*3
	movdqa		$xc3,0x20(%rsp)
	sub		\$192,$len		# len-=64*3
	movdqa		$xd3,0x30(%rsp)

.Loop_tail4x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail4x

.Ldone4x:
___
$code.=<<___ if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
}

########################################################################
# XOP code path that handles all lengths.
if ($avx) {
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at below code you'll notice that
# sometimes argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives 5% performance
# improvement. This is on FX-4100...
# Register allocation for the XOP code path.  %xmm0-%xmm15 are handed out
# in the order b, d, a, t; the 'c' row of the ChaCha state is kept in
# $xt0-$xt3 while the rounds are running.
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
# @xx maps a state index 0..15 (rows a, b, c, d, four columns each) to the
# register that holds it.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

# XOP_lane_ROUND(a, b, c, d)
#
# Takes the four @xx indices of the first quarter-round (Q1) and returns
# a list of 48 strings, each one a perlasm call such as
# '&vpaddd ("%xmm8","%xmm8","%xmm0")' that the caller is expected to eval.
# The indices for Q2..Q4 are derived from Q1 by stepping every index to
# the next column of its row, so (0,4,8,12) yields the column round and
# (0,5,10,15) the diagonal round.  The four quarter-rounds are interleaved
# instruction by instruction, using rotate amounts 16, 12, 8 and 7.
#
# Operand order on the lines marked "flip" is swapped on purpose: it
# changes the instruction encoding, not the result (see the FX-4100 note
# at the top of this section).  Do not "normalize" it.
sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# Keep the row (index & ~3) and advance the column modulo 4.
my $next=sub { return map(($_&~3)+(($_+1)&3),@_); };
my ($a1,$b1,$c1,$d1)=$next->($a0,$b0,$c0,$d0);
my ($a2,$b2,$c2,$d2)=$next->($a1,$b1,$c1,$d1);
my ($a3,$b3,$c3,$d3)=$next->($a2,$b2,$c2,$d2);
# Register names wrapped in double quotes so that they survive the eval.
my @x=map("\"$_\"",@xx);

	return (
	# a += b; d ^= a; d <<<= 16
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor		(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor		(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor		(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",

	# a += b; d ^= a; d <<<= 8
	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",

	# c += d; b ^= c; b <<<= 7
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor		(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor		(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor		(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
	);
}

# Extra frame bytes below the 0x140-byte working area.  On Win64 the
# 0xa8 bytes hold the %xmm6-%xmm15 spill slots addressed as
# -0xa8(%r9) .. -0x18(%r9); elsewhere only 8 bytes are added, presumably
# to keep %rsp 16-byte aligned for the vmovdqa accesses (TODO confirm).
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4xop,\@function,5
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	mov		%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub		\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		scratch: registers offloaded while the output is
	#		de-interlaced, and the keystream block staged for
	#		the byte-wise tail loop
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
1459e1051a39Sopenharmony_ci # +0x140 1460e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 1461e1051a39Sopenharmony_ci movaps %xmm6,-0xa8(%r9) 1462e1051a39Sopenharmony_ci movaps %xmm7,-0x98(%r9) 1463e1051a39Sopenharmony_ci movaps %xmm8,-0x88(%r9) 1464e1051a39Sopenharmony_ci movaps %xmm9,-0x78(%r9) 1465e1051a39Sopenharmony_ci movaps %xmm10,-0x68(%r9) 1466e1051a39Sopenharmony_ci movaps %xmm11,-0x58(%r9) 1467e1051a39Sopenharmony_ci movaps %xmm12,-0x48(%r9) 1468e1051a39Sopenharmony_ci movaps %xmm13,-0x38(%r9) 1469e1051a39Sopenharmony_ci movaps %xmm14,-0x28(%r9) 1470e1051a39Sopenharmony_ci movaps %xmm15,-0x18(%r9) 1471e1051a39Sopenharmony_ci.L4xop_body: 1472e1051a39Sopenharmony_ci___ 1473e1051a39Sopenharmony_ci$code.=<<___; 1474e1051a39Sopenharmony_ci vzeroupper 1475e1051a39Sopenharmony_ci 1476e1051a39Sopenharmony_ci vmovdqa .Lsigma(%rip),$xa3 # key[0] 1477e1051a39Sopenharmony_ci vmovdqu ($key),$xb3 # key[1] 1478e1051a39Sopenharmony_ci vmovdqu 16($key),$xt3 # key[2] 1479e1051a39Sopenharmony_ci vmovdqu ($counter),$xd3 # key[3] 1480e1051a39Sopenharmony_ci lea 0x100(%rsp),%rcx # size optimization 1481e1051a39Sopenharmony_ci 1482e1051a39Sopenharmony_ci vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1483e1051a39Sopenharmony_ci vpshufd \$0x55,$xa3,$xa1 1484e1051a39Sopenharmony_ci vmovdqa $xa0,0x40(%rsp) # ... 
and offload 1485e1051a39Sopenharmony_ci vpshufd \$0xaa,$xa3,$xa2 1486e1051a39Sopenharmony_ci vmovdqa $xa1,0x50(%rsp) 1487e1051a39Sopenharmony_ci vpshufd \$0xff,$xa3,$xa3 1488e1051a39Sopenharmony_ci vmovdqa $xa2,0x60(%rsp) 1489e1051a39Sopenharmony_ci vmovdqa $xa3,0x70(%rsp) 1490e1051a39Sopenharmony_ci 1491e1051a39Sopenharmony_ci vpshufd \$0x00,$xb3,$xb0 1492e1051a39Sopenharmony_ci vpshufd \$0x55,$xb3,$xb1 1493e1051a39Sopenharmony_ci vmovdqa $xb0,0x80-0x100(%rcx) 1494e1051a39Sopenharmony_ci vpshufd \$0xaa,$xb3,$xb2 1495e1051a39Sopenharmony_ci vmovdqa $xb1,0x90-0x100(%rcx) 1496e1051a39Sopenharmony_ci vpshufd \$0xff,$xb3,$xb3 1497e1051a39Sopenharmony_ci vmovdqa $xb2,0xa0-0x100(%rcx) 1498e1051a39Sopenharmony_ci vmovdqa $xb3,0xb0-0x100(%rcx) 1499e1051a39Sopenharmony_ci 1500e1051a39Sopenharmony_ci vpshufd \$0x00,$xt3,$xt0 # "$xc0" 1501e1051a39Sopenharmony_ci vpshufd \$0x55,$xt3,$xt1 # "$xc1" 1502e1051a39Sopenharmony_ci vmovdqa $xt0,0xc0-0x100(%rcx) 1503e1051a39Sopenharmony_ci vpshufd \$0xaa,$xt3,$xt2 # "$xc2" 1504e1051a39Sopenharmony_ci vmovdqa $xt1,0xd0-0x100(%rcx) 1505e1051a39Sopenharmony_ci vpshufd \$0xff,$xt3,$xt3 # "$xc3" 1506e1051a39Sopenharmony_ci vmovdqa $xt2,0xe0-0x100(%rcx) 1507e1051a39Sopenharmony_ci vmovdqa $xt3,0xf0-0x100(%rcx) 1508e1051a39Sopenharmony_ci 1509e1051a39Sopenharmony_ci vpshufd \$0x00,$xd3,$xd0 1510e1051a39Sopenharmony_ci vpshufd \$0x55,$xd3,$xd1 1511e1051a39Sopenharmony_ci vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet 1512e1051a39Sopenharmony_ci vpshufd \$0xaa,$xd3,$xd2 1513e1051a39Sopenharmony_ci vmovdqa $xd1,0x110-0x100(%rcx) 1514e1051a39Sopenharmony_ci vpshufd \$0xff,$xd3,$xd3 1515e1051a39Sopenharmony_ci vmovdqa $xd2,0x120-0x100(%rcx) 1516e1051a39Sopenharmony_ci vmovdqa $xd3,0x130-0x100(%rcx) 1517e1051a39Sopenharmony_ci 1518e1051a39Sopenharmony_ci jmp .Loop_enter4xop 1519e1051a39Sopenharmony_ci 1520e1051a39Sopenharmony_ci.align 32 1521e1051a39Sopenharmony_ci.Loop_outer4xop: 1522e1051a39Sopenharmony_ci vmovdqa 0x40(%rsp),$xa0 # 
re-load smashed key 1523e1051a39Sopenharmony_ci vmovdqa 0x50(%rsp),$xa1 1524e1051a39Sopenharmony_ci vmovdqa 0x60(%rsp),$xa2 1525e1051a39Sopenharmony_ci vmovdqa 0x70(%rsp),$xa3 1526e1051a39Sopenharmony_ci vmovdqa 0x80-0x100(%rcx),$xb0 1527e1051a39Sopenharmony_ci vmovdqa 0x90-0x100(%rcx),$xb1 1528e1051a39Sopenharmony_ci vmovdqa 0xa0-0x100(%rcx),$xb2 1529e1051a39Sopenharmony_ci vmovdqa 0xb0-0x100(%rcx),$xb3 1530e1051a39Sopenharmony_ci vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1531e1051a39Sopenharmony_ci vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1532e1051a39Sopenharmony_ci vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1533e1051a39Sopenharmony_ci vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1534e1051a39Sopenharmony_ci vmovdqa 0x100-0x100(%rcx),$xd0 1535e1051a39Sopenharmony_ci vmovdqa 0x110-0x100(%rcx),$xd1 1536e1051a39Sopenharmony_ci vmovdqa 0x120-0x100(%rcx),$xd2 1537e1051a39Sopenharmony_ci vmovdqa 0x130-0x100(%rcx),$xd3 1538e1051a39Sopenharmony_ci vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters 1539e1051a39Sopenharmony_ci 1540e1051a39Sopenharmony_ci.Loop_enter4xop: 1541e1051a39Sopenharmony_ci mov \$10,%eax 1542e1051a39Sopenharmony_ci vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1543e1051a39Sopenharmony_ci jmp .Loop4xop 1544e1051a39Sopenharmony_ci 1545e1051a39Sopenharmony_ci.align 32 1546e1051a39Sopenharmony_ci.Loop4xop: 1547e1051a39Sopenharmony_ci___ 1548e1051a39Sopenharmony_ci foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } 1549e1051a39Sopenharmony_ci foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } 1550e1051a39Sopenharmony_ci$code.=<<___; 1551e1051a39Sopenharmony_ci dec %eax 1552e1051a39Sopenharmony_ci jnz .Loop4xop 1553e1051a39Sopenharmony_ci 1554e1051a39Sopenharmony_ci vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material 1555e1051a39Sopenharmony_ci vpaddd 0x50(%rsp),$xa1,$xa1 1556e1051a39Sopenharmony_ci vpaddd 0x60(%rsp),$xa2,$xa2 1557e1051a39Sopenharmony_ci vpaddd 0x70(%rsp),$xa3,$xa3 1558e1051a39Sopenharmony_ci 1559e1051a39Sopenharmony_ci vmovdqa $xt2,0x20(%rsp) # 
offload $xc2,3 1560e1051a39Sopenharmony_ci vmovdqa $xt3,0x30(%rsp) 1561e1051a39Sopenharmony_ci 1562e1051a39Sopenharmony_ci vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 1563e1051a39Sopenharmony_ci vpunpckldq $xa3,$xa2,$xt3 1564e1051a39Sopenharmony_ci vpunpckhdq $xa1,$xa0,$xa0 1565e1051a39Sopenharmony_ci vpunpckhdq $xa3,$xa2,$xa2 1566e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 1567e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 1568e1051a39Sopenharmony_ci vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 1569e1051a39Sopenharmony_ci vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 1570e1051a39Sopenharmony_ci___ 1571e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 1572e1051a39Sopenharmony_ci$code.=<<___; 1573e1051a39Sopenharmony_ci vpaddd 0x80-0x100(%rcx),$xb0,$xb0 1574e1051a39Sopenharmony_ci vpaddd 0x90-0x100(%rcx),$xb1,$xb1 1575e1051a39Sopenharmony_ci vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 1576e1051a39Sopenharmony_ci vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 1577e1051a39Sopenharmony_ci 1578e1051a39Sopenharmony_ci vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 1579e1051a39Sopenharmony_ci vmovdqa $xa1,0x10(%rsp) 1580e1051a39Sopenharmony_ci vmovdqa 0x20(%rsp),$xa0 # "xc2" 1581e1051a39Sopenharmony_ci vmovdqa 0x30(%rsp),$xa1 # "xc3" 1582e1051a39Sopenharmony_ci 1583e1051a39Sopenharmony_ci vpunpckldq $xb1,$xb0,$xt2 1584e1051a39Sopenharmony_ci vpunpckldq $xb3,$xb2,$xt3 1585e1051a39Sopenharmony_ci vpunpckhdq $xb1,$xb0,$xb0 1586e1051a39Sopenharmony_ci vpunpckhdq $xb3,$xb2,$xb2 1587e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 1588e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 1589e1051a39Sopenharmony_ci vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 1590e1051a39Sopenharmony_ci vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 1591e1051a39Sopenharmony_ci___ 1592e1051a39Sopenharmony_ci ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 1593e1051a39Sopenharmony_ci my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1594e1051a39Sopenharmony_ci$code.=<<___; 
1595e1051a39Sopenharmony_ci vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 1596e1051a39Sopenharmony_ci vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 1597e1051a39Sopenharmony_ci vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 1598e1051a39Sopenharmony_ci vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 1599e1051a39Sopenharmony_ci 1600e1051a39Sopenharmony_ci vpunpckldq $xc1,$xc0,$xt2 1601e1051a39Sopenharmony_ci vpunpckldq $xc3,$xc2,$xt3 1602e1051a39Sopenharmony_ci vpunpckhdq $xc1,$xc0,$xc0 1603e1051a39Sopenharmony_ci vpunpckhdq $xc3,$xc2,$xc2 1604e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 1605e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 1606e1051a39Sopenharmony_ci vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 1607e1051a39Sopenharmony_ci vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 1608e1051a39Sopenharmony_ci___ 1609e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 1610e1051a39Sopenharmony_ci$code.=<<___; 1611e1051a39Sopenharmony_ci vpaddd 0x100-0x100(%rcx),$xd0,$xd0 1612e1051a39Sopenharmony_ci vpaddd 0x110-0x100(%rcx),$xd1,$xd1 1613e1051a39Sopenharmony_ci vpaddd 0x120-0x100(%rcx),$xd2,$xd2 1614e1051a39Sopenharmony_ci vpaddd 0x130-0x100(%rcx),$xd3,$xd3 1615e1051a39Sopenharmony_ci 1616e1051a39Sopenharmony_ci vpunpckldq $xd1,$xd0,$xt2 1617e1051a39Sopenharmony_ci vpunpckldq $xd3,$xd2,$xt3 1618e1051a39Sopenharmony_ci vpunpckhdq $xd1,$xd0,$xd0 1619e1051a39Sopenharmony_ci vpunpckhdq $xd3,$xd2,$xd2 1620e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 1621e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 1622e1051a39Sopenharmony_ci vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 1623e1051a39Sopenharmony_ci vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 1624e1051a39Sopenharmony_ci___ 1625e1051a39Sopenharmony_ci ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 1626e1051a39Sopenharmony_ci ($xa0,$xa1)=($xt2,$xt3); 1627e1051a39Sopenharmony_ci$code.=<<___; 1628e1051a39Sopenharmony_ci vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 1629e1051a39Sopenharmony_ci vmovdqa 0x10(%rsp),$xa1 1630e1051a39Sopenharmony_ci 
1631e1051a39Sopenharmony_ci cmp \$64*4,$len 1632e1051a39Sopenharmony_ci jb .Ltail4xop 1633e1051a39Sopenharmony_ci 1634e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 1635e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb0,$xb0 1636e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc0,$xc0 1637e1051a39Sopenharmony_ci vpxor 0x30($inp),$xd0,$xd0 1638e1051a39Sopenharmony_ci vpxor 0x40($inp),$xa1,$xa1 1639e1051a39Sopenharmony_ci vpxor 0x50($inp),$xb1,$xb1 1640e1051a39Sopenharmony_ci vpxor 0x60($inp),$xc1,$xc1 1641e1051a39Sopenharmony_ci vpxor 0x70($inp),$xd1,$xd1 1642e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 1643e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa2,$xa2 1644e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb2,$xb2 1645e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc2,$xc2 1646e1051a39Sopenharmony_ci vpxor 0x30($inp),$xd2,$xd2 1647e1051a39Sopenharmony_ci vpxor 0x40($inp),$xa3,$xa3 1648e1051a39Sopenharmony_ci vpxor 0x50($inp),$xb3,$xb3 1649e1051a39Sopenharmony_ci vpxor 0x60($inp),$xc3,$xc3 1650e1051a39Sopenharmony_ci vpxor 0x70($inp),$xd3,$xd3 1651e1051a39Sopenharmony_ci lea 0x80($inp),$inp # inp+=64*4 1652e1051a39Sopenharmony_ci 1653e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 1654e1051a39Sopenharmony_ci vmovdqu $xb0,0x10($out) 1655e1051a39Sopenharmony_ci vmovdqu $xc0,0x20($out) 1656e1051a39Sopenharmony_ci vmovdqu $xd0,0x30($out) 1657e1051a39Sopenharmony_ci vmovdqu $xa1,0x40($out) 1658e1051a39Sopenharmony_ci vmovdqu $xb1,0x50($out) 1659e1051a39Sopenharmony_ci vmovdqu $xc1,0x60($out) 1660e1051a39Sopenharmony_ci vmovdqu $xd1,0x70($out) 1661e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 1662e1051a39Sopenharmony_ci vmovdqu $xa2,0x00($out) 1663e1051a39Sopenharmony_ci vmovdqu $xb2,0x10($out) 1664e1051a39Sopenharmony_ci vmovdqu $xc2,0x20($out) 1665e1051a39Sopenharmony_ci vmovdqu $xd2,0x30($out) 1666e1051a39Sopenharmony_ci vmovdqu $xa3,0x40($out) 1667e1051a39Sopenharmony_ci vmovdqu $xb3,0x50($out) 1668e1051a39Sopenharmony_ci vmovdqu 
$xc3,0x60($out) 1669e1051a39Sopenharmony_ci vmovdqu $xd3,0x70($out) 1670e1051a39Sopenharmony_ci lea 0x80($out),$out # out+=64*4 1671e1051a39Sopenharmony_ci 1672e1051a39Sopenharmony_ci sub \$64*4,$len 1673e1051a39Sopenharmony_ci jnz .Loop_outer4xop 1674e1051a39Sopenharmony_ci 1675e1051a39Sopenharmony_ci jmp .Ldone4xop 1676e1051a39Sopenharmony_ci 1677e1051a39Sopenharmony_ci.align 32 1678e1051a39Sopenharmony_ci.Ltail4xop: 1679e1051a39Sopenharmony_ci cmp \$192,$len 1680e1051a39Sopenharmony_ci jae .L192_or_more4xop 1681e1051a39Sopenharmony_ci cmp \$128,$len 1682e1051a39Sopenharmony_ci jae .L128_or_more4xop 1683e1051a39Sopenharmony_ci cmp \$64,$len 1684e1051a39Sopenharmony_ci jae .L64_or_more4xop 1685e1051a39Sopenharmony_ci 1686e1051a39Sopenharmony_ci xor %r10,%r10 1687e1051a39Sopenharmony_ci vmovdqa $xa0,0x00(%rsp) 1688e1051a39Sopenharmony_ci vmovdqa $xb0,0x10(%rsp) 1689e1051a39Sopenharmony_ci vmovdqa $xc0,0x20(%rsp) 1690e1051a39Sopenharmony_ci vmovdqa $xd0,0x30(%rsp) 1691e1051a39Sopenharmony_ci jmp .Loop_tail4xop 1692e1051a39Sopenharmony_ci 1693e1051a39Sopenharmony_ci.align 32 1694e1051a39Sopenharmony_ci.L64_or_more4xop: 1695e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 1696e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb0,$xb0 1697e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc0,$xc0 1698e1051a39Sopenharmony_ci vpxor 0x30($inp),$xd0,$xd0 1699e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 1700e1051a39Sopenharmony_ci vmovdqu $xb0,0x10($out) 1701e1051a39Sopenharmony_ci vmovdqu $xc0,0x20($out) 1702e1051a39Sopenharmony_ci vmovdqu $xd0,0x30($out) 1703e1051a39Sopenharmony_ci je .Ldone4xop 1704e1051a39Sopenharmony_ci 1705e1051a39Sopenharmony_ci lea 0x40($inp),$inp # inp+=64*1 1706e1051a39Sopenharmony_ci vmovdqa $xa1,0x00(%rsp) 1707e1051a39Sopenharmony_ci xor %r10,%r10 1708e1051a39Sopenharmony_ci vmovdqa $xb1,0x10(%rsp) 1709e1051a39Sopenharmony_ci lea 0x40($out),$out # out+=64*1 1710e1051a39Sopenharmony_ci vmovdqa $xc1,0x20(%rsp) 1711e1051a39Sopenharmony_ci sub 
\$64,$len # len-=64*1 1712e1051a39Sopenharmony_ci vmovdqa $xd1,0x30(%rsp) 1713e1051a39Sopenharmony_ci jmp .Loop_tail4xop 1714e1051a39Sopenharmony_ci 1715e1051a39Sopenharmony_ci.align 32 1716e1051a39Sopenharmony_ci.L128_or_more4xop: 1717e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 1718e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb0,$xb0 1719e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc0,$xc0 1720e1051a39Sopenharmony_ci vpxor 0x30($inp),$xd0,$xd0 1721e1051a39Sopenharmony_ci vpxor 0x40($inp),$xa1,$xa1 1722e1051a39Sopenharmony_ci vpxor 0x50($inp),$xb1,$xb1 1723e1051a39Sopenharmony_ci vpxor 0x60($inp),$xc1,$xc1 1724e1051a39Sopenharmony_ci vpxor 0x70($inp),$xd1,$xd1 1725e1051a39Sopenharmony_ci 1726e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 1727e1051a39Sopenharmony_ci vmovdqu $xb0,0x10($out) 1728e1051a39Sopenharmony_ci vmovdqu $xc0,0x20($out) 1729e1051a39Sopenharmony_ci vmovdqu $xd0,0x30($out) 1730e1051a39Sopenharmony_ci vmovdqu $xa1,0x40($out) 1731e1051a39Sopenharmony_ci vmovdqu $xb1,0x50($out) 1732e1051a39Sopenharmony_ci vmovdqu $xc1,0x60($out) 1733e1051a39Sopenharmony_ci vmovdqu $xd1,0x70($out) 1734e1051a39Sopenharmony_ci je .Ldone4xop 1735e1051a39Sopenharmony_ci 1736e1051a39Sopenharmony_ci lea 0x80($inp),$inp # inp+=64*2 1737e1051a39Sopenharmony_ci vmovdqa $xa2,0x00(%rsp) 1738e1051a39Sopenharmony_ci xor %r10,%r10 1739e1051a39Sopenharmony_ci vmovdqa $xb2,0x10(%rsp) 1740e1051a39Sopenharmony_ci lea 0x80($out),$out # out+=64*2 1741e1051a39Sopenharmony_ci vmovdqa $xc2,0x20(%rsp) 1742e1051a39Sopenharmony_ci sub \$128,$len # len-=64*2 1743e1051a39Sopenharmony_ci vmovdqa $xd2,0x30(%rsp) 1744e1051a39Sopenharmony_ci jmp .Loop_tail4xop 1745e1051a39Sopenharmony_ci 1746e1051a39Sopenharmony_ci.align 32 1747e1051a39Sopenharmony_ci.L192_or_more4xop: 1748e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 1749e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb0,$xb0 1750e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc0,$xc0 1751e1051a39Sopenharmony_ci 
vpxor 0x30($inp),$xd0,$xd0 1752e1051a39Sopenharmony_ci vpxor 0x40($inp),$xa1,$xa1 1753e1051a39Sopenharmony_ci vpxor 0x50($inp),$xb1,$xb1 1754e1051a39Sopenharmony_ci vpxor 0x60($inp),$xc1,$xc1 1755e1051a39Sopenharmony_ci vpxor 0x70($inp),$xd1,$xd1 1756e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 1757e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa2,$xa2 1758e1051a39Sopenharmony_ci vpxor 0x10($inp),$xb2,$xb2 1759e1051a39Sopenharmony_ci vpxor 0x20($inp),$xc2,$xc2 1760e1051a39Sopenharmony_ci vpxor 0x30($inp),$xd2,$xd2 1761e1051a39Sopenharmony_ci 1762e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 1763e1051a39Sopenharmony_ci vmovdqu $xb0,0x10($out) 1764e1051a39Sopenharmony_ci vmovdqu $xc0,0x20($out) 1765e1051a39Sopenharmony_ci vmovdqu $xd0,0x30($out) 1766e1051a39Sopenharmony_ci vmovdqu $xa1,0x40($out) 1767e1051a39Sopenharmony_ci vmovdqu $xb1,0x50($out) 1768e1051a39Sopenharmony_ci vmovdqu $xc1,0x60($out) 1769e1051a39Sopenharmony_ci vmovdqu $xd1,0x70($out) 1770e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 1771e1051a39Sopenharmony_ci vmovdqu $xa2,0x00($out) 1772e1051a39Sopenharmony_ci vmovdqu $xb2,0x10($out) 1773e1051a39Sopenharmony_ci vmovdqu $xc2,0x20($out) 1774e1051a39Sopenharmony_ci vmovdqu $xd2,0x30($out) 1775e1051a39Sopenharmony_ci je .Ldone4xop 1776e1051a39Sopenharmony_ci 1777e1051a39Sopenharmony_ci lea 0x40($inp),$inp # inp+=64*3 1778e1051a39Sopenharmony_ci vmovdqa $xa3,0x00(%rsp) 1779e1051a39Sopenharmony_ci xor %r10,%r10 1780e1051a39Sopenharmony_ci vmovdqa $xb3,0x10(%rsp) 1781e1051a39Sopenharmony_ci lea 0x40($out),$out # out+=64*3 1782e1051a39Sopenharmony_ci vmovdqa $xc3,0x20(%rsp) 1783e1051a39Sopenharmony_ci sub \$192,$len # len-=64*3 1784e1051a39Sopenharmony_ci vmovdqa $xd3,0x30(%rsp) 1785e1051a39Sopenharmony_ci 1786e1051a39Sopenharmony_ci.Loop_tail4xop: 1787e1051a39Sopenharmony_ci movzb ($inp,%r10),%eax 1788e1051a39Sopenharmony_ci movzb (%rsp,%r10),%ecx 1789e1051a39Sopenharmony_ci lea 1(%r10),%r10 1790e1051a39Sopenharmony_ci xor 
%ecx,%eax 1791e1051a39Sopenharmony_ci mov %al,-1($out,%r10) 1792e1051a39Sopenharmony_ci dec $len 1793e1051a39Sopenharmony_ci jnz .Loop_tail4xop 1794e1051a39Sopenharmony_ci 1795e1051a39Sopenharmony_ci.Ldone4xop: 1796e1051a39Sopenharmony_ci vzeroupper 1797e1051a39Sopenharmony_ci___ 1798e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 1799e1051a39Sopenharmony_ci movaps -0xa8(%r9),%xmm6 1800e1051a39Sopenharmony_ci movaps -0x98(%r9),%xmm7 1801e1051a39Sopenharmony_ci movaps -0x88(%r9),%xmm8 1802e1051a39Sopenharmony_ci movaps -0x78(%r9),%xmm9 1803e1051a39Sopenharmony_ci movaps -0x68(%r9),%xmm10 1804e1051a39Sopenharmony_ci movaps -0x58(%r9),%xmm11 1805e1051a39Sopenharmony_ci movaps -0x48(%r9),%xmm12 1806e1051a39Sopenharmony_ci movaps -0x38(%r9),%xmm13 1807e1051a39Sopenharmony_ci movaps -0x28(%r9),%xmm14 1808e1051a39Sopenharmony_ci movaps -0x18(%r9),%xmm15 1809e1051a39Sopenharmony_ci___ 1810e1051a39Sopenharmony_ci$code.=<<___; 1811e1051a39Sopenharmony_ci lea (%r9),%rsp 1812e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 1813e1051a39Sopenharmony_ci.L4xop_epilogue: 1814e1051a39Sopenharmony_ci ret 1815e1051a39Sopenharmony_ci.cfi_endproc 1816e1051a39Sopenharmony_ci.size ChaCha20_4xop,.-ChaCha20_4xop 1817e1051a39Sopenharmony_ci___ 1818e1051a39Sopenharmony_ci} 1819e1051a39Sopenharmony_ci 1820e1051a39Sopenharmony_ci######################################################################## 1821e1051a39Sopenharmony_ci# AVX2 code path 1822e1051a39Sopenharmony_ciif ($avx>1) { 1823e1051a39Sopenharmony_cimy ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1824e1051a39Sopenharmony_ci $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); 1825e1051a39Sopenharmony_cimy @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1826e1051a39Sopenharmony_ci "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 1827e1051a39Sopenharmony_ci 1828e1051a39Sopenharmony_cisub AVX2_lane_ROUND { 1829e1051a39Sopenharmony_cimy ($a0,$b0,$c0,$d0)=@_; 1830e1051a39Sopenharmony_cimy 
($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1831e1051a39Sopenharmony_cimy ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1832e1051a39Sopenharmony_cimy ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1833e1051a39Sopenharmony_cimy ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 1834e1051a39Sopenharmony_cimy @x=map("\"$_\"",@xx); 1835e1051a39Sopenharmony_ci 1836e1051a39Sopenharmony_ci # Consider order in which variables are addressed by their 1837e1051a39Sopenharmony_ci # index: 1838e1051a39Sopenharmony_ci # 1839e1051a39Sopenharmony_ci # a b c d 1840e1051a39Sopenharmony_ci # 1841e1051a39Sopenharmony_ci # 0 4 8 12 < even round 1842e1051a39Sopenharmony_ci # 1 5 9 13 1843e1051a39Sopenharmony_ci # 2 6 10 14 1844e1051a39Sopenharmony_ci # 3 7 11 15 1845e1051a39Sopenharmony_ci # 0 5 10 15 < odd round 1846e1051a39Sopenharmony_ci # 1 6 11 12 1847e1051a39Sopenharmony_ci # 2 7 8 13 1848e1051a39Sopenharmony_ci # 3 4 9 14 1849e1051a39Sopenharmony_ci # 1850e1051a39Sopenharmony_ci # 'a', 'b' and 'd's are permanently allocated in registers, 1851e1051a39Sopenharmony_ci # @x[0..7,12..15], while 'c's are maintained in memory. If 1852e1051a39Sopenharmony_ci # you observe 'c' column, you'll notice that pair of 'c's is 1853e1051a39Sopenharmony_ci # invariant between rounds. This means that we have to reload 1854e1051a39Sopenharmony_ci # them once per round, in the middle. This is why you'll see 1855e1051a39Sopenharmony_ci # bunch of 'c' stores and loads in the middle, but none in 1856e1051a39Sopenharmony_ci # the beginning or end. 
1857e1051a39Sopenharmony_ci 1858e1051a39Sopenharmony_ci ( 1859e1051a39Sopenharmony_ci "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1860e1051a39Sopenharmony_ci "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1861e1051a39Sopenharmony_ci "&vpshufb (@x[$d0],@x[$d0],$t1)", 1862e1051a39Sopenharmony_ci "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1863e1051a39Sopenharmony_ci "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1864e1051a39Sopenharmony_ci "&vpshufb (@x[$d1],@x[$d1],$t1)", 1865e1051a39Sopenharmony_ci 1866e1051a39Sopenharmony_ci "&vpaddd ($xc,$xc,@x[$d0])", 1867e1051a39Sopenharmony_ci "&vpxor (@x[$b0],$xc,@x[$b0])", 1868e1051a39Sopenharmony_ci "&vpslld ($t0,@x[$b0],12)", 1869e1051a39Sopenharmony_ci "&vpsrld (@x[$b0],@x[$b0],20)", 1870e1051a39Sopenharmony_ci "&vpor (@x[$b0],$t0,@x[$b0])", 1871e1051a39Sopenharmony_ci "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1872e1051a39Sopenharmony_ci "&vpaddd ($xc_,$xc_,@x[$d1])", 1873e1051a39Sopenharmony_ci "&vpxor (@x[$b1],$xc_,@x[$b1])", 1874e1051a39Sopenharmony_ci "&vpslld ($t1,@x[$b1],12)", 1875e1051a39Sopenharmony_ci "&vpsrld (@x[$b1],@x[$b1],20)", 1876e1051a39Sopenharmony_ci "&vpor (@x[$b1],$t1,@x[$b1])", 1877e1051a39Sopenharmony_ci 1878e1051a39Sopenharmony_ci "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 1879e1051a39Sopenharmony_ci "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1880e1051a39Sopenharmony_ci "&vpshufb (@x[$d0],@x[$d0],$t0)", 1881e1051a39Sopenharmony_ci "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 1882e1051a39Sopenharmony_ci "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1883e1051a39Sopenharmony_ci "&vpshufb (@x[$d1],@x[$d1],$t0)", 1884e1051a39Sopenharmony_ci 1885e1051a39Sopenharmony_ci "&vpaddd ($xc,$xc,@x[$d0])", 1886e1051a39Sopenharmony_ci "&vpxor (@x[$b0],$xc,@x[$b0])", 1887e1051a39Sopenharmony_ci "&vpslld ($t1,@x[$b0],7)", 1888e1051a39Sopenharmony_ci "&vpsrld (@x[$b0],@x[$b0],25)", 1889e1051a39Sopenharmony_ci "&vpor (@x[$b0],$t1,@x[$b0])", 1890e1051a39Sopenharmony_ci "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1891e1051a39Sopenharmony_ci "&vpaddd 
($xc_,$xc_,@x[$d1])", 1892e1051a39Sopenharmony_ci "&vpxor (@x[$b1],$xc_,@x[$b1])", 1893e1051a39Sopenharmony_ci "&vpslld ($t0,@x[$b1],7)", 1894e1051a39Sopenharmony_ci "&vpsrld (@x[$b1],@x[$b1],25)", 1895e1051a39Sopenharmony_ci "&vpor (@x[$b1],$t0,@x[$b1])", 1896e1051a39Sopenharmony_ci 1897e1051a39Sopenharmony_ci "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 1898e1051a39Sopenharmony_ci "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", 1899e1051a39Sopenharmony_ci "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", 1900e1051a39Sopenharmony_ci "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", 1901e1051a39Sopenharmony_ci 1902e1051a39Sopenharmony_ci "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1903e1051a39Sopenharmony_ci "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1904e1051a39Sopenharmony_ci "&vpshufb (@x[$d2],@x[$d2],$t1)", 1905e1051a39Sopenharmony_ci "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1906e1051a39Sopenharmony_ci "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1907e1051a39Sopenharmony_ci "&vpshufb (@x[$d3],@x[$d3],$t1)", 1908e1051a39Sopenharmony_ci 1909e1051a39Sopenharmony_ci "&vpaddd ($xc,$xc,@x[$d2])", 1910e1051a39Sopenharmony_ci "&vpxor (@x[$b2],$xc,@x[$b2])", 1911e1051a39Sopenharmony_ci "&vpslld ($t0,@x[$b2],12)", 1912e1051a39Sopenharmony_ci "&vpsrld (@x[$b2],@x[$b2],20)", 1913e1051a39Sopenharmony_ci "&vpor (@x[$b2],$t0,@x[$b2])", 1914e1051a39Sopenharmony_ci "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1915e1051a39Sopenharmony_ci "&vpaddd ($xc_,$xc_,@x[$d3])", 1916e1051a39Sopenharmony_ci "&vpxor (@x[$b3],$xc_,@x[$b3])", 1917e1051a39Sopenharmony_ci "&vpslld ($t1,@x[$b3],12)", 1918e1051a39Sopenharmony_ci "&vpsrld (@x[$b3],@x[$b3],20)", 1919e1051a39Sopenharmony_ci "&vpor (@x[$b3],$t1,@x[$b3])", 1920e1051a39Sopenharmony_ci 1921e1051a39Sopenharmony_ci "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1922e1051a39Sopenharmony_ci "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1923e1051a39Sopenharmony_ci "&vpshufb (@x[$d2],@x[$d2],$t0)", 1924e1051a39Sopenharmony_ci "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 
1925e1051a39Sopenharmony_ci "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1926e1051a39Sopenharmony_ci "&vpshufb (@x[$d3],@x[$d3],$t0)", 1927e1051a39Sopenharmony_ci 1928e1051a39Sopenharmony_ci "&vpaddd ($xc,$xc,@x[$d2])", 1929e1051a39Sopenharmony_ci "&vpxor (@x[$b2],$xc,@x[$b2])", 1930e1051a39Sopenharmony_ci "&vpslld ($t1,@x[$b2],7)", 1931e1051a39Sopenharmony_ci "&vpsrld (@x[$b2],@x[$b2],25)", 1932e1051a39Sopenharmony_ci "&vpor (@x[$b2],$t1,@x[$b2])", 1933e1051a39Sopenharmony_ci "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1934e1051a39Sopenharmony_ci "&vpaddd ($xc_,$xc_,@x[$d3])", 1935e1051a39Sopenharmony_ci "&vpxor (@x[$b3],$xc_,@x[$b3])", 1936e1051a39Sopenharmony_ci "&vpslld ($t0,@x[$b3],7)", 1937e1051a39Sopenharmony_ci "&vpsrld (@x[$b3],@x[$b3],25)", 1938e1051a39Sopenharmony_ci "&vpor (@x[$b3],$t0,@x[$b3])" 1939e1051a39Sopenharmony_ci ); 1940e1051a39Sopenharmony_ci} 1941e1051a39Sopenharmony_ci 1942e1051a39Sopenharmony_cimy $xframe = $win64 ? 0xa8 : 8; 1943e1051a39Sopenharmony_ci 1944e1051a39Sopenharmony_ci$code.=<<___; 1945e1051a39Sopenharmony_ci.type ChaCha20_8x,\@function,5 1946e1051a39Sopenharmony_ci.align 32 1947e1051a39Sopenharmony_ciChaCha20_8x: 1948e1051a39Sopenharmony_ci.cfi_startproc 1949e1051a39Sopenharmony_ci.LChaCha20_8x: 1950e1051a39Sopenharmony_ci mov %rsp,%r9 # frame register 1951e1051a39Sopenharmony_ci.cfi_def_cfa_register %r9 1952e1051a39Sopenharmony_ci sub \$0x280+$xframe,%rsp 1953e1051a39Sopenharmony_ci and \$-32,%rsp 1954e1051a39Sopenharmony_ci___ 1955e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 1956e1051a39Sopenharmony_ci movaps %xmm6,-0xa8(%r9) 1957e1051a39Sopenharmony_ci movaps %xmm7,-0x98(%r9) 1958e1051a39Sopenharmony_ci movaps %xmm8,-0x88(%r9) 1959e1051a39Sopenharmony_ci movaps %xmm9,-0x78(%r9) 1960e1051a39Sopenharmony_ci movaps %xmm10,-0x68(%r9) 1961e1051a39Sopenharmony_ci movaps %xmm11,-0x58(%r9) 1962e1051a39Sopenharmony_ci movaps %xmm12,-0x48(%r9) 1963e1051a39Sopenharmony_ci movaps %xmm13,-0x38(%r9) 1964e1051a39Sopenharmony_ci movaps 
%xmm14,-0x28(%r9) 1965e1051a39Sopenharmony_ci movaps %xmm15,-0x18(%r9) 1966e1051a39Sopenharmony_ci.L8x_body: 1967e1051a39Sopenharmony_ci___ 1968e1051a39Sopenharmony_ci$code.=<<___; 1969e1051a39Sopenharmony_ci vzeroupper 1970e1051a39Sopenharmony_ci 1971e1051a39Sopenharmony_ci ################ stack layout 1972e1051a39Sopenharmony_ci # +0x00 SIMD equivalent of @x[8-12] 1973e1051a39Sopenharmony_ci # ... 1974e1051a39Sopenharmony_ci # +0x80 constant copy of key[0-2] smashed by lanes 1975e1051a39Sopenharmony_ci # ... 1976e1051a39Sopenharmony_ci # +0x200 SIMD counters (with nonce smashed by lanes) 1977e1051a39Sopenharmony_ci # ... 1978e1051a39Sopenharmony_ci # +0x280 1979e1051a39Sopenharmony_ci 1980e1051a39Sopenharmony_ci vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] 1981e1051a39Sopenharmony_ci vbroadcasti128 ($key),$xb3 # key[1] 1982e1051a39Sopenharmony_ci vbroadcasti128 16($key),$xt3 # key[2] 1983e1051a39Sopenharmony_ci vbroadcasti128 ($counter),$xd3 # key[3] 1984e1051a39Sopenharmony_ci lea 0x100(%rsp),%rcx # size optimization 1985e1051a39Sopenharmony_ci lea 0x200(%rsp),%rax # size optimization 1986e1051a39Sopenharmony_ci lea .Lrot16(%rip),%r10 1987e1051a39Sopenharmony_ci lea .Lrot24(%rip),%r11 1988e1051a39Sopenharmony_ci 1989e1051a39Sopenharmony_ci vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1990e1051a39Sopenharmony_ci vpshufd \$0x55,$xa3,$xa1 1991e1051a39Sopenharmony_ci vmovdqa $xa0,0x80-0x100(%rcx) # ... 
and offload 1992e1051a39Sopenharmony_ci vpshufd \$0xaa,$xa3,$xa2 1993e1051a39Sopenharmony_ci vmovdqa $xa1,0xa0-0x100(%rcx) 1994e1051a39Sopenharmony_ci vpshufd \$0xff,$xa3,$xa3 1995e1051a39Sopenharmony_ci vmovdqa $xa2,0xc0-0x100(%rcx) 1996e1051a39Sopenharmony_ci vmovdqa $xa3,0xe0-0x100(%rcx) 1997e1051a39Sopenharmony_ci 1998e1051a39Sopenharmony_ci vpshufd \$0x00,$xb3,$xb0 1999e1051a39Sopenharmony_ci vpshufd \$0x55,$xb3,$xb1 2000e1051a39Sopenharmony_ci vmovdqa $xb0,0x100-0x100(%rcx) 2001e1051a39Sopenharmony_ci vpshufd \$0xaa,$xb3,$xb2 2002e1051a39Sopenharmony_ci vmovdqa $xb1,0x120-0x100(%rcx) 2003e1051a39Sopenharmony_ci vpshufd \$0xff,$xb3,$xb3 2004e1051a39Sopenharmony_ci vmovdqa $xb2,0x140-0x100(%rcx) 2005e1051a39Sopenharmony_ci vmovdqa $xb3,0x160-0x100(%rcx) 2006e1051a39Sopenharmony_ci 2007e1051a39Sopenharmony_ci vpshufd \$0x00,$xt3,$xt0 # "xc0" 2008e1051a39Sopenharmony_ci vpshufd \$0x55,$xt3,$xt1 # "xc1" 2009e1051a39Sopenharmony_ci vmovdqa $xt0,0x180-0x200(%rax) 2010e1051a39Sopenharmony_ci vpshufd \$0xaa,$xt3,$xt2 # "xc2" 2011e1051a39Sopenharmony_ci vmovdqa $xt1,0x1a0-0x200(%rax) 2012e1051a39Sopenharmony_ci vpshufd \$0xff,$xt3,$xt3 # "xc3" 2013e1051a39Sopenharmony_ci vmovdqa $xt2,0x1c0-0x200(%rax) 2014e1051a39Sopenharmony_ci vmovdqa $xt3,0x1e0-0x200(%rax) 2015e1051a39Sopenharmony_ci 2016e1051a39Sopenharmony_ci vpshufd \$0x00,$xd3,$xd0 2017e1051a39Sopenharmony_ci vpshufd \$0x55,$xd3,$xd1 2018e1051a39Sopenharmony_ci vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 2019e1051a39Sopenharmony_ci vpshufd \$0xaa,$xd3,$xd2 2020e1051a39Sopenharmony_ci vmovdqa $xd1,0x220-0x200(%rax) 2021e1051a39Sopenharmony_ci vpshufd \$0xff,$xd3,$xd3 2022e1051a39Sopenharmony_ci vmovdqa $xd2,0x240-0x200(%rax) 2023e1051a39Sopenharmony_ci vmovdqa $xd3,0x260-0x200(%rax) 2024e1051a39Sopenharmony_ci 2025e1051a39Sopenharmony_ci jmp .Loop_enter8x 2026e1051a39Sopenharmony_ci 2027e1051a39Sopenharmony_ci.align 32 2028e1051a39Sopenharmony_ci.Loop_outer8x: 2029e1051a39Sopenharmony_ci vmovdqa 
0x80-0x100(%rcx),$xa0 # re-load smashed key 2030e1051a39Sopenharmony_ci vmovdqa 0xa0-0x100(%rcx),$xa1 2031e1051a39Sopenharmony_ci vmovdqa 0xc0-0x100(%rcx),$xa2 2032e1051a39Sopenharmony_ci vmovdqa 0xe0-0x100(%rcx),$xa3 2033e1051a39Sopenharmony_ci vmovdqa 0x100-0x100(%rcx),$xb0 2034e1051a39Sopenharmony_ci vmovdqa 0x120-0x100(%rcx),$xb1 2035e1051a39Sopenharmony_ci vmovdqa 0x140-0x100(%rcx),$xb2 2036e1051a39Sopenharmony_ci vmovdqa 0x160-0x100(%rcx),$xb3 2037e1051a39Sopenharmony_ci vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" 2038e1051a39Sopenharmony_ci vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" 2039e1051a39Sopenharmony_ci vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" 2040e1051a39Sopenharmony_ci vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" 2041e1051a39Sopenharmony_ci vmovdqa 0x200-0x200(%rax),$xd0 2042e1051a39Sopenharmony_ci vmovdqa 0x220-0x200(%rax),$xd1 2043e1051a39Sopenharmony_ci vmovdqa 0x240-0x200(%rax),$xd2 2044e1051a39Sopenharmony_ci vmovdqa 0x260-0x200(%rax),$xd3 2045e1051a39Sopenharmony_ci vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters 2046e1051a39Sopenharmony_ci 2047e1051a39Sopenharmony_ci.Loop_enter8x: 2048e1051a39Sopenharmony_ci vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" 2049e1051a39Sopenharmony_ci vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" 2050e1051a39Sopenharmony_ci vbroadcasti128 (%r10),$xt3 2051e1051a39Sopenharmony_ci vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters 2052e1051a39Sopenharmony_ci mov \$10,%eax 2053e1051a39Sopenharmony_ci jmp .Loop8x 2054e1051a39Sopenharmony_ci 2055e1051a39Sopenharmony_ci.align 32 2056e1051a39Sopenharmony_ci.Loop8x: 2057e1051a39Sopenharmony_ci___ 2058e1051a39Sopenharmony_ci foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } 2059e1051a39Sopenharmony_ci foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } 2060e1051a39Sopenharmony_ci$code.=<<___; 2061e1051a39Sopenharmony_ci dec %eax 2062e1051a39Sopenharmony_ci jnz .Loop8x 2063e1051a39Sopenharmony_ci 2064e1051a39Sopenharmony_ci lea 0x200(%rsp),%rax # size optimization 
2065e1051a39Sopenharmony_ci vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 2066e1051a39Sopenharmony_ci vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 2067e1051a39Sopenharmony_ci vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 2068e1051a39Sopenharmony_ci vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 2069e1051a39Sopenharmony_ci 2070e1051a39Sopenharmony_ci vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 2071e1051a39Sopenharmony_ci vpunpckldq $xa3,$xa2,$xt3 2072e1051a39Sopenharmony_ci vpunpckhdq $xa1,$xa0,$xa0 2073e1051a39Sopenharmony_ci vpunpckhdq $xa3,$xa2,$xa2 2074e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 2075e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 2076e1051a39Sopenharmony_ci vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 2077e1051a39Sopenharmony_ci vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 2078e1051a39Sopenharmony_ci___ 2079e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 2080e1051a39Sopenharmony_ci$code.=<<___; 2081e1051a39Sopenharmony_ci vpaddd 0x100-0x100(%rcx),$xb0,$xb0 2082e1051a39Sopenharmony_ci vpaddd 0x120-0x100(%rcx),$xb1,$xb1 2083e1051a39Sopenharmony_ci vpaddd 0x140-0x100(%rcx),$xb2,$xb2 2084e1051a39Sopenharmony_ci vpaddd 0x160-0x100(%rcx),$xb3,$xb3 2085e1051a39Sopenharmony_ci 2086e1051a39Sopenharmony_ci vpunpckldq $xb1,$xb0,$xt2 2087e1051a39Sopenharmony_ci vpunpckldq $xb3,$xb2,$xt3 2088e1051a39Sopenharmony_ci vpunpckhdq $xb1,$xb0,$xb0 2089e1051a39Sopenharmony_ci vpunpckhdq $xb3,$xb2,$xb2 2090e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 2091e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 2092e1051a39Sopenharmony_ci vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 2093e1051a39Sopenharmony_ci vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 2094e1051a39Sopenharmony_ci___ 2095e1051a39Sopenharmony_ci ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 2096e1051a39Sopenharmony_ci$code.=<<___; 2097e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 2098e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xb0,$xa0,$xb0 
2099e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xb1,$xa1,$xa0 2100e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xb1,$xa1,$xb1 2101e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xb2,$xa2,$xa1 2102e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xb2,$xa2,$xb2 2103e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xb3,$xa3,$xa2 2104e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xb3,$xa3,$xb3 2105e1051a39Sopenharmony_ci___ 2106e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 2107e1051a39Sopenharmony_ci my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 2108e1051a39Sopenharmony_ci$code.=<<___; 2109e1051a39Sopenharmony_ci vmovdqa $xa0,0x00(%rsp) # offload $xaN 2110e1051a39Sopenharmony_ci vmovdqa $xa1,0x20(%rsp) 2111e1051a39Sopenharmony_ci vmovdqa 0x40(%rsp),$xc2 # $xa0 2112e1051a39Sopenharmony_ci vmovdqa 0x60(%rsp),$xc3 # $xa1 2113e1051a39Sopenharmony_ci 2114e1051a39Sopenharmony_ci vpaddd 0x180-0x200(%rax),$xc0,$xc0 2115e1051a39Sopenharmony_ci vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 2116e1051a39Sopenharmony_ci vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 2117e1051a39Sopenharmony_ci vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 2118e1051a39Sopenharmony_ci 2119e1051a39Sopenharmony_ci vpunpckldq $xc1,$xc0,$xt2 2120e1051a39Sopenharmony_ci vpunpckldq $xc3,$xc2,$xt3 2121e1051a39Sopenharmony_ci vpunpckhdq $xc1,$xc0,$xc0 2122e1051a39Sopenharmony_ci vpunpckhdq $xc3,$xc2,$xc2 2123e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 2124e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 2125e1051a39Sopenharmony_ci vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 2126e1051a39Sopenharmony_ci vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 2127e1051a39Sopenharmony_ci___ 2128e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 2129e1051a39Sopenharmony_ci$code.=<<___; 2130e1051a39Sopenharmony_ci vpaddd 0x200-0x200(%rax),$xd0,$xd0 2131e1051a39Sopenharmony_ci vpaddd 0x220-0x200(%rax),$xd1,$xd1 2132e1051a39Sopenharmony_ci vpaddd 0x240-0x200(%rax),$xd2,$xd2 2133e1051a39Sopenharmony_ci vpaddd 
0x260-0x200(%rax),$xd3,$xd3 2134e1051a39Sopenharmony_ci 2135e1051a39Sopenharmony_ci vpunpckldq $xd1,$xd0,$xt2 2136e1051a39Sopenharmony_ci vpunpckldq $xd3,$xd2,$xt3 2137e1051a39Sopenharmony_ci vpunpckhdq $xd1,$xd0,$xd0 2138e1051a39Sopenharmony_ci vpunpckhdq $xd3,$xd2,$xd2 2139e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 2140e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 2141e1051a39Sopenharmony_ci vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 2142e1051a39Sopenharmony_ci vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 2143e1051a39Sopenharmony_ci___ 2144e1051a39Sopenharmony_ci ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 2145e1051a39Sopenharmony_ci$code.=<<___; 2146e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 2147e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xd0,$xc0,$xd0 2148e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xd1,$xc1,$xc0 2149e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xd1,$xc1,$xd1 2150e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xd2,$xc2,$xc1 2151e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xd2,$xc2,$xd2 2152e1051a39Sopenharmony_ci vperm2i128 \$0x20,$xd3,$xc3,$xc2 2153e1051a39Sopenharmony_ci vperm2i128 \$0x31,$xd3,$xc3,$xd3 2154e1051a39Sopenharmony_ci___ 2155e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 2156e1051a39Sopenharmony_ci ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 2157e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 2158e1051a39Sopenharmony_ci ($xa0,$xa1)=($xt2,$xt3); 2159e1051a39Sopenharmony_ci$code.=<<___; 2160e1051a39Sopenharmony_ci vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
2161e1051a39Sopenharmony_ci vmovdqa 0x20(%rsp),$xa1 2162e1051a39Sopenharmony_ci 2163e1051a39Sopenharmony_ci cmp \$64*8,$len 2164e1051a39Sopenharmony_ci jb .Ltail8x 2165e1051a39Sopenharmony_ci 2166e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2167e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2168e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2169e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2170e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 2171e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2172e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2173e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2174e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2175e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 2176e1051a39Sopenharmony_ci 2177e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa1,$xa1 2178e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb1,$xb1 2179e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc1,$xc1 2180e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd1,$xd1 2181e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 2182e1051a39Sopenharmony_ci vmovdqu $xa1,0x00($out) 2183e1051a39Sopenharmony_ci vmovdqu $xb1,0x20($out) 2184e1051a39Sopenharmony_ci vmovdqu $xc1,0x40($out) 2185e1051a39Sopenharmony_ci vmovdqu $xd1,0x60($out) 2186e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 2187e1051a39Sopenharmony_ci 2188e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa2,$xa2 2189e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb2,$xb2 2190e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc2,$xc2 2191e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd2,$xd2 2192e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 2193e1051a39Sopenharmony_ci vmovdqu $xa2,0x00($out) 2194e1051a39Sopenharmony_ci vmovdqu $xb2,0x20($out) 2195e1051a39Sopenharmony_ci vmovdqu $xc2,0x40($out) 2196e1051a39Sopenharmony_ci vmovdqu $xd2,0x60($out) 2197e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 2198e1051a39Sopenharmony_ci 
2199e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa3,$xa3 2200e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb3,$xb3 2201e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc3,$xc3 2202e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd3,$xd3 2203e1051a39Sopenharmony_ci lea 0x80($inp),$inp # size optimization 2204e1051a39Sopenharmony_ci vmovdqu $xa3,0x00($out) 2205e1051a39Sopenharmony_ci vmovdqu $xb3,0x20($out) 2206e1051a39Sopenharmony_ci vmovdqu $xc3,0x40($out) 2207e1051a39Sopenharmony_ci vmovdqu $xd3,0x60($out) 2208e1051a39Sopenharmony_ci lea 0x80($out),$out # size optimization 2209e1051a39Sopenharmony_ci 2210e1051a39Sopenharmony_ci sub \$64*8,$len 2211e1051a39Sopenharmony_ci jnz .Loop_outer8x 2212e1051a39Sopenharmony_ci 2213e1051a39Sopenharmony_ci jmp .Ldone8x 2214e1051a39Sopenharmony_ci 2215e1051a39Sopenharmony_ci.Ltail8x: 2216e1051a39Sopenharmony_ci cmp \$448,$len 2217e1051a39Sopenharmony_ci jae .L448_or_more8x 2218e1051a39Sopenharmony_ci cmp \$384,$len 2219e1051a39Sopenharmony_ci jae .L384_or_more8x 2220e1051a39Sopenharmony_ci cmp \$320,$len 2221e1051a39Sopenharmony_ci jae .L320_or_more8x 2222e1051a39Sopenharmony_ci cmp \$256,$len 2223e1051a39Sopenharmony_ci jae .L256_or_more8x 2224e1051a39Sopenharmony_ci cmp \$192,$len 2225e1051a39Sopenharmony_ci jae .L192_or_more8x 2226e1051a39Sopenharmony_ci cmp \$128,$len 2227e1051a39Sopenharmony_ci jae .L128_or_more8x 2228e1051a39Sopenharmony_ci cmp \$64,$len 2229e1051a39Sopenharmony_ci jae .L64_or_more8x 2230e1051a39Sopenharmony_ci 2231e1051a39Sopenharmony_ci xor %r10,%r10 2232e1051a39Sopenharmony_ci vmovdqa $xa0,0x00(%rsp) 2233e1051a39Sopenharmony_ci vmovdqa $xb0,0x20(%rsp) 2234e1051a39Sopenharmony_ci jmp .Loop_tail8x 2235e1051a39Sopenharmony_ci 2236e1051a39Sopenharmony_ci.align 32 2237e1051a39Sopenharmony_ci.L64_or_more8x: 2238e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2239e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2240e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2241e1051a39Sopenharmony_ci vmovdqu 
$xb0,0x20($out) 2242e1051a39Sopenharmony_ci je .Ldone8x 2243e1051a39Sopenharmony_ci 2244e1051a39Sopenharmony_ci lea 0x40($inp),$inp # inp+=64*1 2245e1051a39Sopenharmony_ci xor %r10,%r10 2246e1051a39Sopenharmony_ci vmovdqa $xc0,0x00(%rsp) 2247e1051a39Sopenharmony_ci lea 0x40($out),$out # out+=64*1 2248e1051a39Sopenharmony_ci sub \$64,$len # len-=64*1 2249e1051a39Sopenharmony_ci vmovdqa $xd0,0x20(%rsp) 2250e1051a39Sopenharmony_ci jmp .Loop_tail8x 2251e1051a39Sopenharmony_ci 2252e1051a39Sopenharmony_ci.align 32 2253e1051a39Sopenharmony_ci.L128_or_more8x: 2254e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2255e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2256e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2257e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2258e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2259e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2260e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2261e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2262e1051a39Sopenharmony_ci je .Ldone8x 2263e1051a39Sopenharmony_ci 2264e1051a39Sopenharmony_ci lea 0x80($inp),$inp # inp+=64*2 2265e1051a39Sopenharmony_ci xor %r10,%r10 2266e1051a39Sopenharmony_ci vmovdqa $xa1,0x00(%rsp) 2267e1051a39Sopenharmony_ci lea 0x80($out),$out # out+=64*2 2268e1051a39Sopenharmony_ci sub \$128,$len # len-=64*2 2269e1051a39Sopenharmony_ci vmovdqa $xb1,0x20(%rsp) 2270e1051a39Sopenharmony_ci jmp .Loop_tail8x 2271e1051a39Sopenharmony_ci 2272e1051a39Sopenharmony_ci.align 32 2273e1051a39Sopenharmony_ci.L192_or_more8x: 2274e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2275e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2276e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2277e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2278e1051a39Sopenharmony_ci vpxor 0x80($inp),$xa1,$xa1 2279e1051a39Sopenharmony_ci vpxor 0xa0($inp),$xb1,$xb1 2280e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2281e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 
2282e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2283e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2284e1051a39Sopenharmony_ci vmovdqu $xa1,0x80($out) 2285e1051a39Sopenharmony_ci vmovdqu $xb1,0xa0($out) 2286e1051a39Sopenharmony_ci je .Ldone8x 2287e1051a39Sopenharmony_ci 2288e1051a39Sopenharmony_ci lea 0xc0($inp),$inp # inp+=64*3 2289e1051a39Sopenharmony_ci xor %r10,%r10 2290e1051a39Sopenharmony_ci vmovdqa $xc1,0x00(%rsp) 2291e1051a39Sopenharmony_ci lea 0xc0($out),$out # out+=64*3 2292e1051a39Sopenharmony_ci sub \$192,$len # len-=64*3 2293e1051a39Sopenharmony_ci vmovdqa $xd1,0x20(%rsp) 2294e1051a39Sopenharmony_ci jmp .Loop_tail8x 2295e1051a39Sopenharmony_ci 2296e1051a39Sopenharmony_ci.align 32 2297e1051a39Sopenharmony_ci.L256_or_more8x: 2298e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2299e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2300e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2301e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2302e1051a39Sopenharmony_ci vpxor 0x80($inp),$xa1,$xa1 2303e1051a39Sopenharmony_ci vpxor 0xa0($inp),$xb1,$xb1 2304e1051a39Sopenharmony_ci vpxor 0xc0($inp),$xc1,$xc1 2305e1051a39Sopenharmony_ci vpxor 0xe0($inp),$xd1,$xd1 2306e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2307e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2308e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2309e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2310e1051a39Sopenharmony_ci vmovdqu $xa1,0x80($out) 2311e1051a39Sopenharmony_ci vmovdqu $xb1,0xa0($out) 2312e1051a39Sopenharmony_ci vmovdqu $xc1,0xc0($out) 2313e1051a39Sopenharmony_ci vmovdqu $xd1,0xe0($out) 2314e1051a39Sopenharmony_ci je .Ldone8x 2315e1051a39Sopenharmony_ci 2316e1051a39Sopenharmony_ci lea 0x100($inp),$inp # inp+=64*4 2317e1051a39Sopenharmony_ci xor %r10,%r10 2318e1051a39Sopenharmony_ci vmovdqa $xa2,0x00(%rsp) 2319e1051a39Sopenharmony_ci lea 0x100($out),$out # out+=64*4 2320e1051a39Sopenharmony_ci sub \$256,$len # len-=64*4 2321e1051a39Sopenharmony_ci vmovdqa 
$xb2,0x20(%rsp) 2322e1051a39Sopenharmony_ci jmp .Loop_tail8x 2323e1051a39Sopenharmony_ci 2324e1051a39Sopenharmony_ci.align 32 2325e1051a39Sopenharmony_ci.L320_or_more8x: 2326e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2327e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2328e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2329e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2330e1051a39Sopenharmony_ci vpxor 0x80($inp),$xa1,$xa1 2331e1051a39Sopenharmony_ci vpxor 0xa0($inp),$xb1,$xb1 2332e1051a39Sopenharmony_ci vpxor 0xc0($inp),$xc1,$xc1 2333e1051a39Sopenharmony_ci vpxor 0xe0($inp),$xd1,$xd1 2334e1051a39Sopenharmony_ci vpxor 0x100($inp),$xa2,$xa2 2335e1051a39Sopenharmony_ci vpxor 0x120($inp),$xb2,$xb2 2336e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2337e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2338e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2339e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2340e1051a39Sopenharmony_ci vmovdqu $xa1,0x80($out) 2341e1051a39Sopenharmony_ci vmovdqu $xb1,0xa0($out) 2342e1051a39Sopenharmony_ci vmovdqu $xc1,0xc0($out) 2343e1051a39Sopenharmony_ci vmovdqu $xd1,0xe0($out) 2344e1051a39Sopenharmony_ci vmovdqu $xa2,0x100($out) 2345e1051a39Sopenharmony_ci vmovdqu $xb2,0x120($out) 2346e1051a39Sopenharmony_ci je .Ldone8x 2347e1051a39Sopenharmony_ci 2348e1051a39Sopenharmony_ci lea 0x140($inp),$inp # inp+=64*5 2349e1051a39Sopenharmony_ci xor %r10,%r10 2350e1051a39Sopenharmony_ci vmovdqa $xc2,0x00(%rsp) 2351e1051a39Sopenharmony_ci lea 0x140($out),$out # out+=64*5 2352e1051a39Sopenharmony_ci sub \$320,$len # len-=64*5 2353e1051a39Sopenharmony_ci vmovdqa $xd2,0x20(%rsp) 2354e1051a39Sopenharmony_ci jmp .Loop_tail8x 2355e1051a39Sopenharmony_ci 2356e1051a39Sopenharmony_ci.align 32 2357e1051a39Sopenharmony_ci.L384_or_more8x: 2358e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2359e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2360e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 
2361e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2362e1051a39Sopenharmony_ci vpxor 0x80($inp),$xa1,$xa1 2363e1051a39Sopenharmony_ci vpxor 0xa0($inp),$xb1,$xb1 2364e1051a39Sopenharmony_ci vpxor 0xc0($inp),$xc1,$xc1 2365e1051a39Sopenharmony_ci vpxor 0xe0($inp),$xd1,$xd1 2366e1051a39Sopenharmony_ci vpxor 0x100($inp),$xa2,$xa2 2367e1051a39Sopenharmony_ci vpxor 0x120($inp),$xb2,$xb2 2368e1051a39Sopenharmony_ci vpxor 0x140($inp),$xc2,$xc2 2369e1051a39Sopenharmony_ci vpxor 0x160($inp),$xd2,$xd2 2370e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2371e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2372e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2373e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2374e1051a39Sopenharmony_ci vmovdqu $xa1,0x80($out) 2375e1051a39Sopenharmony_ci vmovdqu $xb1,0xa0($out) 2376e1051a39Sopenharmony_ci vmovdqu $xc1,0xc0($out) 2377e1051a39Sopenharmony_ci vmovdqu $xd1,0xe0($out) 2378e1051a39Sopenharmony_ci vmovdqu $xa2,0x100($out) 2379e1051a39Sopenharmony_ci vmovdqu $xb2,0x120($out) 2380e1051a39Sopenharmony_ci vmovdqu $xc2,0x140($out) 2381e1051a39Sopenharmony_ci vmovdqu $xd2,0x160($out) 2382e1051a39Sopenharmony_ci je .Ldone8x 2383e1051a39Sopenharmony_ci 2384e1051a39Sopenharmony_ci lea 0x180($inp),$inp # inp+=64*6 2385e1051a39Sopenharmony_ci xor %r10,%r10 2386e1051a39Sopenharmony_ci vmovdqa $xa3,0x00(%rsp) 2387e1051a39Sopenharmony_ci lea 0x180($out),$out # out+=64*6 2388e1051a39Sopenharmony_ci sub \$384,$len # len-=64*6 2389e1051a39Sopenharmony_ci vmovdqa $xb3,0x20(%rsp) 2390e1051a39Sopenharmony_ci jmp .Loop_tail8x 2391e1051a39Sopenharmony_ci 2392e1051a39Sopenharmony_ci.align 32 2393e1051a39Sopenharmony_ci.L448_or_more8x: 2394e1051a39Sopenharmony_ci vpxor 0x00($inp),$xa0,$xa0 # xor with input 2395e1051a39Sopenharmony_ci vpxor 0x20($inp),$xb0,$xb0 2396e1051a39Sopenharmony_ci vpxor 0x40($inp),$xc0,$xc0 2397e1051a39Sopenharmony_ci vpxor 0x60($inp),$xd0,$xd0 2398e1051a39Sopenharmony_ci vpxor 0x80($inp),$xa1,$xa1 2399e1051a39Sopenharmony_ci vpxor 
0xa0($inp),$xb1,$xb1 2400e1051a39Sopenharmony_ci vpxor 0xc0($inp),$xc1,$xc1 2401e1051a39Sopenharmony_ci vpxor 0xe0($inp),$xd1,$xd1 2402e1051a39Sopenharmony_ci vpxor 0x100($inp),$xa2,$xa2 2403e1051a39Sopenharmony_ci vpxor 0x120($inp),$xb2,$xb2 2404e1051a39Sopenharmony_ci vpxor 0x140($inp),$xc2,$xc2 2405e1051a39Sopenharmony_ci vpxor 0x160($inp),$xd2,$xd2 2406e1051a39Sopenharmony_ci vpxor 0x180($inp),$xa3,$xa3 2407e1051a39Sopenharmony_ci vpxor 0x1a0($inp),$xb3,$xb3 2408e1051a39Sopenharmony_ci vmovdqu $xa0,0x00($out) 2409e1051a39Sopenharmony_ci vmovdqu $xb0,0x20($out) 2410e1051a39Sopenharmony_ci vmovdqu $xc0,0x40($out) 2411e1051a39Sopenharmony_ci vmovdqu $xd0,0x60($out) 2412e1051a39Sopenharmony_ci vmovdqu $xa1,0x80($out) 2413e1051a39Sopenharmony_ci vmovdqu $xb1,0xa0($out) 2414e1051a39Sopenharmony_ci vmovdqu $xc1,0xc0($out) 2415e1051a39Sopenharmony_ci vmovdqu $xd1,0xe0($out) 2416e1051a39Sopenharmony_ci vmovdqu $xa2,0x100($out) 2417e1051a39Sopenharmony_ci vmovdqu $xb2,0x120($out) 2418e1051a39Sopenharmony_ci vmovdqu $xc2,0x140($out) 2419e1051a39Sopenharmony_ci vmovdqu $xd2,0x160($out) 2420e1051a39Sopenharmony_ci vmovdqu $xa3,0x180($out) 2421e1051a39Sopenharmony_ci vmovdqu $xb3,0x1a0($out) 2422e1051a39Sopenharmony_ci je .Ldone8x 2423e1051a39Sopenharmony_ci 2424e1051a39Sopenharmony_ci lea 0x1c0($inp),$inp # inp+=64*7 2425e1051a39Sopenharmony_ci xor %r10,%r10 2426e1051a39Sopenharmony_ci vmovdqa $xc3,0x00(%rsp) 2427e1051a39Sopenharmony_ci lea 0x1c0($out),$out # out+=64*7 2428e1051a39Sopenharmony_ci sub \$448,$len # len-=64*7 2429e1051a39Sopenharmony_ci vmovdqa $xd3,0x20(%rsp) 2430e1051a39Sopenharmony_ci 2431e1051a39Sopenharmony_ci.Loop_tail8x: 2432e1051a39Sopenharmony_ci movzb ($inp,%r10),%eax 2433e1051a39Sopenharmony_ci movzb (%rsp,%r10),%ecx 2434e1051a39Sopenharmony_ci lea 1(%r10),%r10 2435e1051a39Sopenharmony_ci xor %ecx,%eax 2436e1051a39Sopenharmony_ci mov %al,-1($out,%r10) 2437e1051a39Sopenharmony_ci dec $len 2438e1051a39Sopenharmony_ci jnz .Loop_tail8x 
2439e1051a39Sopenharmony_ci 2440e1051a39Sopenharmony_ci.Ldone8x: 2441e1051a39Sopenharmony_ci vzeroall 2442e1051a39Sopenharmony_ci___ 2443e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 2444e1051a39Sopenharmony_ci movaps -0xa8(%r9),%xmm6 2445e1051a39Sopenharmony_ci movaps -0x98(%r9),%xmm7 2446e1051a39Sopenharmony_ci movaps -0x88(%r9),%xmm8 2447e1051a39Sopenharmony_ci movaps -0x78(%r9),%xmm9 2448e1051a39Sopenharmony_ci movaps -0x68(%r9),%xmm10 2449e1051a39Sopenharmony_ci movaps -0x58(%r9),%xmm11 2450e1051a39Sopenharmony_ci movaps -0x48(%r9),%xmm12 2451e1051a39Sopenharmony_ci movaps -0x38(%r9),%xmm13 2452e1051a39Sopenharmony_ci movaps -0x28(%r9),%xmm14 2453e1051a39Sopenharmony_ci movaps -0x18(%r9),%xmm15 2454e1051a39Sopenharmony_ci___ 2455e1051a39Sopenharmony_ci$code.=<<___; 2456e1051a39Sopenharmony_ci lea (%r9),%rsp 2457e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 2458e1051a39Sopenharmony_ci.L8x_epilogue: 2459e1051a39Sopenharmony_ci ret 2460e1051a39Sopenharmony_ci.cfi_endproc 2461e1051a39Sopenharmony_ci.size ChaCha20_8x,.-ChaCha20_8x 2462e1051a39Sopenharmony_ci___ 2463e1051a39Sopenharmony_ci} 2464e1051a39Sopenharmony_ci 2465e1051a39Sopenharmony_ci######################################################################## 2466e1051a39Sopenharmony_ci# AVX512 code paths 2467e1051a39Sopenharmony_ciif ($avx>2) { 2468e1051a39Sopenharmony_ci# This one handles shorter inputs... 

# Register names for the short-input AVX-512 code path.  $a..$d are the
# working registers passed to AVX512ROUND, $a_..$d_ are the copies that are
# added back after the rounds, and $fourz receives .Lfourz, which is added
# to the counter row on every outer-loop iteration.
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
# 128-bit scratch registers used when xor-ing key stream with input.
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

# vpxord(dst, src1, src2) - append a three-operand XOR instruction to $code.
#
# Operands are given destination-first and are written out in reverse
# order.  The shorter "vpxor" mnemonic is used whenever every operand can be
# encoded with it; "vpxord" is chosen as soon as one operand is a %zmm
# register or any SIMD register numbered 16 or above.  %xmm16+ is covered
# as well as %ymm16+, because vpxor has no encoding for registers 16-31
# at any vector width.
#
# The empty prototype is kept for interface compatibility; callers in this
# section use the &vpxord(...) form, which bypasses it.
sub vpxord()		# size optimization
{   my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
        # $1 is the register width letter, $2 the register number.
        if (/%([xyz])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
            $opcode = "vpxord";
            last;
        }
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}

# AVX512ROUND() - emit the add / xor / rotate-left sequence (rotations by
# 16, 12, 8 and 7) on the registers currently named by $a, $b, $c and $d.
# The register names are read at call time, so the caller may re-point
# those variables between calls.
# NOTE(review): vpaddd and vprold are not defined in this section;
# presumably they are resolved by the file's AUTOLOAD thunk - confirm.
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
    &vpaddd	($a,$a,$b);
    &vpxord	($d,$d,$a);
    &vprold	($d,$d,16);

    &vpaddd	($c,$c,$d);
    &vpxord	($b,$b,$c);
    &vprold	($b,$b,12);

    &vpaddd	($a,$a,$b);
    &vpxord	($d,$d,$a);
    &vprold	($d,$d,8);

    &vpaddd	($c,$c,$d);
    &vpxord	($b,$b,$c);
    &vprold	($b,$b,7);
}

# Extra stack frame bytes for ChaCha20_avx512; the conditional expression
# is completed on the following line.
my $xframe = $win64 ?
160+8 : 8;
# NOTE(review): the fragment above is the tail of a statement that starts
# before this excerpt -- presumably the $xframe assignment consumed by the
# "sub \$64+$xframe,%rsp" below; confirm against the preceding line.

# ----------------------------------------------------------------------
# ChaCha20_avx512
#
# Emits the assembly for the short-input AVX512 entry point.  Inputs longer
# than 512 bytes branch straight to .LChaCha20_16x; everything else is
# handled here.  The four 16-byte rows of the ChaCha state (sigma, key
# bytes 0..15, key bytes 16..31, counter block) are each broadcast into
# $a/$b/$c/$d, and untouched copies are kept in $a_/$b_/$c_/$d_ so they can
# be added back after the rounds.
#
# %r9 holds the caller's %rsp for the whole function (it is also the CFA
# register), and %rsp is lowered by 64+$xframe bytes: 64 bytes of scratch
# for the tail path plus the Win64 register save area.
# ----------------------------------------------------------------------
$code.=<<___;
.type	ChaCha20_avx512,\@function,5
.align	32
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
	mov	%rsp,%r9	# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$512,$len
	ja	.LChaCha20_16x

	sub	\$64+$xframe,%rsp
___
# Win64 only: spill %xmm6-%xmm15 just below the saved stack pointer
# (presumably because the Win64 ABI treats them as callee-saved).  The
# matching reloads are emitted before the epilogue.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.Lavx512_body:
___
# State setup and the outer-loop re-entry point.
#  - .Lzeroz is added to the counter row once; .Lfourz is added on every
#    further pass through .Loop_outer_avx512.  Both constants live outside
#    this excerpt -- presumably per-lane block offsets and a per-lane +4
#    increment; TODO confirm.
#  - $counter (the counter-block pointer) is dead once its row is loaded,
#    so the register is reused as the round-loop counter (10 iterations).
$code.=<<___;
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov	\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512

.align	16
.Loop_outer_avx512:
	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	vmovdqa32	$c_,$c
	vpaddd	$fourz,$d_,$d
	mov	\$10,$counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512

.align	32
.Loop_avx512:
___
	# One loop iteration = two AVX512ROUND invocations (the helper is
	# defined outside this excerpt).  Between them the dwords of rows
	# b, c and d are rotated within each 128-bit lane by 1, 2 and 3
	# positions; after the second invocation the opposite rotations
	# put the rows back in their original order.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_avx512");

# Output stage.  The saved input rows are added back, then the result is
# consumed 64 bytes at a time: first the low 128 bits of every row (the
# "%x#" operands -- presumably rewritten to the xmm view of the register by
# the generator's post-processing; confirm), then lanes 1, 2 and 3 via
# vextracti32x4.
#
# Each "sub \$64,$len" sets the flags for two branches: "jb" (fewer than 64
# bytes were left, go to a tail path) and, after the vector xor/store and
# the two lea instructions, "jz" / "jnz" (input exhausted or not).
#
# Tail paths: the current 64-byte keystream block is parked at (%rsp), $len
# is restored with "add \$64", and the remaining bytes are xor-ed one at a
# time.  $counter is the byte index there; it is zero on entry because the
# round loop above has just counted it down to zero.  Afterwards the scratch
# area is overwritten with $a_ (the broadcast .Lsigma row), presumably so
# that no keystream is left behind on the stack.
$code.=<<___;
	vpaddd	$a_,$a,$a
	vpaddd	$b_,$b,$b
	vpaddd	$c_,$c,$c
	vpaddd	$d_,$d,$d

	sub	\$64,$len
	jb	.Ltail64_avx512

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jnz	.Loop_outer_avx512

	jmp	.Ldone_avx512

.align	16
.Ltail64_avx512:
	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)
	add	\$64,$len
	jmp	.Loop_tail_avx512

.align	16
.Ltail_avx512:
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	add	\$64,$len

.Loop_tail_avx512:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512

	vmovdqu32	$a_,0x00(%rsp)

.Ldone_avx512:
	vzeroall
___
# Win64 only: reload the registers spilled in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
# Epilogue: restore %rsp from the frame register and return.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512,.-ChaCha20_avx512
___

# Retarget every register name used above from %zmm to %ymm, so the same
# Perl variables can drive the 256-bit AVX512VL flavour emitted next.  A
# statement-modifier "for" is used instead of map() because the rewrite is
# wanted only for its side effect on the aliased variables.
s/%z/%y/ for ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

$code.=<<___;
.type	ChaCha20_avx512vl,\@function,5
.align	32
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
	mov	%rsp,%r9	# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$128,$len
	ja	.LChaCha20_8xvl

	sub	\$64+$xframe,%rsp
___
# ----------------------------------------------------------------------
# ChaCha20_avx512vl (continued)
#
# The prologue emitted above keeps the caller's %rsp in %r9, sends inputs
# longer than 128 bytes to .LChaCha20_8xvl, and reserves 64 bytes of stack
# scratch plus $xframe bytes of save area.
#
# Win64 only: spill %xmm6-%xmm15 just below the saved stack pointer
# (presumably because the Win64 ABI treats them as callee-saved).  The
# matching reloads are emitted before the epilogue.
# ----------------------------------------------------------------------
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.Lavx512vl_body:
___
# State setup and the outer-loop re-entry point.
#  - Each 16-byte state row (sigma, key bytes 0..15, key bytes 16..31,
#    counter block) is broadcast with vbroadcasti128, and pristine copies
#    are kept in $a_/$b_/$c_/$d_.
#  - The Perl variable is still called $fourz, but here it is loaded from
#    .Ltwoy.  Both .Lzeroz and .Ltwoy are defined outside this excerpt --
#    presumably per-lane block offsets and a per-lane +2 increment; TODO
#    confirm.
#  - .Loop_outer_avx512vl restores only rows c and d; rows a and b are
#    restored right before the "jnz .Loop_outer_avx512vl" further down.
#  - $counter (the counter-block pointer) is dead once its row is loaded,
#    so the register is reused as the round-loop counter (10 iterations).
$code.=<<___;
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
	mov	\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512vl

.align	16
.Loop_outer_avx512vl:
	vmovdqa32	$c_,$c
	vpaddd	$fourz,$d_,$d
	mov	\$10,$counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
___
	# One loop iteration = two AVX512ROUND invocations (the helper is
	# defined outside this excerpt).  Between them the dwords of rows
	# b, c and d are rotated within each 128-bit lane by 1, 2 and 3
	# positions; after the second invocation the opposite rotations
	# put the rows back in their original order.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_avx512vl");

# Output stage.  The saved input rows are added back, then the result is
# consumed 64 bytes at a time: first the low 128 bits of every row (the
# "%x#" operands -- presumably rewritten to the xmm view of the register by
# the generator's post-processing; confirm), then the upper 128 bits via
# vextracti128.
#
# Each "sub \$64,$len" sets the flags for two branches: "jb" (fewer than 64
# bytes were left, go to a tail path) and, after the vector xor/store, the
# lea instructions and (second time) the two register moves, "jz" / "jnz"
# (input exhausted or not).
#
# Tail paths: the current 64-byte keystream block is parked at (%rsp), $len
# is restored with "add \$64", and the remaining bytes are xor-ed one at a
# time.  $counter is the byte index there; it is zero on entry because the
# round loop above has just counted it down to zero.  Afterwards the scratch
# area is overwritten with two 32-byte stores of $a_ (the broadcast .Lsigma
# row), presumably so that no keystream is left behind on the stack.
$code.=<<___;
	vpaddd	$a_,$a,$a
	vpaddd	$b_,$b,$b
	vpaddd	$c_,$c,$c
	vpaddd	$d_,$d,$d

	sub	\$64,$len
	jb	.Ltail64_avx512vl

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512vl

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512vl

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	jnz	.Loop_outer_avx512vl

	jmp	.Ldone_avx512vl

.align	16
.Ltail64_avx512vl:
	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)
	add	\$64,$len
	jmp	.Loop_tail_avx512vl

.align	16
.Ltail_avx512vl:
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	add	\$64,$len

.Loop_tail_avx512vl:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512vl

	vmovdqu32	$a_,0x00(%rsp)
	vmovdqu32	$a_,0x20(%rsp)

.Ldone_avx512vl:
	vzeroall
___
# Win64 only: reload the registers spilled in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
# Epilogue: restore %rsp from the frame register and return.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
___
}
if ($avx>2) {
# This one handles longer inputs...
# Register allocation for the 16x code path:
#   %zmm0..%zmm15   the sixteen state words, named row (a..d) x column (0..3);
#                   @xx lists them in index order 0..15
#   %zmm16..%zmm31  @key, a second bank of sixteen registers
#   $xt0..$xt3      scratch names that alias @key[0..3]
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

# AVX512_lane_ROUND(a0, b0, c0, d0)
#
# Arguments are four indices into @xx naming the registers of the first
# quarter-round (Q1).  The indices for Q2..Q4 are derived by keeping each
# index's group of four (index & ~3) and advancing its position inside the
# group by one, modulo four.
#
# Returns a list of strings.  Each string is the Perl source of one
# instruction-emitting call, e.g. &vpaddd ("%zmm0","%zmm0","%zmm4"); nothing
# is emitted here -- the caller is expected to eval the strings in order.
# The four quarter-rounds are interleaved: every step (add, xor, rotate) is
# listed for Q1..Q4 before the next step begins.
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Wrap every register name in double quotes so that it becomes a string
# literal once the generated source is eval'ed.
my @x=map("\"$_\"",@xx);

	(
	# a += b; d ^= a; d <<<= 16
	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
	"&vprold (@x[$d0],@x[$d0],16)",
	"&vprold (@x[$d1],@x[$d1],16)",
	"&vprold (@x[$d2],@x[$d2],16)",
	"&vprold (@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12
	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
	"&vprold (@x[$b0],@x[$b0],12)",
	"&vprold (@x[$b1],@x[$b1],12)",
	"&vprold (@x[$b2],@x[$b2],12)",
	"&vprold (@x[$b3],@x[$b3],12)",

	# a += b; d ^= a; d <<<= 8
	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
	"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
	"&vprold (@x[$d0],@x[$d0],8)",
	"&vprold (@x[$d1],@x[$d1],8)",
	"&vprold (@x[$d2],@x[$d2],8)",
	"&vprold (@x[$d3],@x[$d3],8)",

	# c += d; b ^= c; b <<<= 7
	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
	"&vprold (@x[$b0],@x[$b0],7)",
	"&vprold (@x[$b1],@x[$b1],7)",
	"&vprold (@x[$b2],@x[$b2],7)",
	"&vprold (@x[$b3],@x[$b3],7)"
	);
}

# Size of the register save area below the saved stack pointer: 0xa8 bytes
# on Win64 (the lowest spill slot used below is -0xa8(%r9)), 8 otherwise.
my $xframe = $win64 ? 0xa8 : 8;

# ChaCha20_16x prologue: keep the caller's %rsp in %r9, reserve 64 bytes of
# scratch plus the save area, and round %rsp down to a 64-byte boundary.
$code.=<<___;
.type	ChaCha20_16x,\@function,5
.align	32
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
	mov	%rsp,%r9	# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
	and	\$-64,%rsp
___
# Win64 only: spill %xmm6-%xmm15 just below the saved stack pointer
# (presumably because the Win64 ABI treats them as callee-saved).
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L16x_body:
___
# Load the four 16-byte state rows, then start splatting individual dwords
# across whole registers.  %r10 keeps the address of .Lsigma.
$code.=<<___;
	vzeroupper

	lea	.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3	# key[0]
	vbroadcasti32x4	($key),$xb3	# key[1]
	vbroadcasti32x4	16($key),$xc3	# key[2]
	vbroadcasti32x4	($counter),$xd3	# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
2993e1051a39Sopenharmony_ci vpshufd \$0x55,$xa3,$xa1 2994e1051a39Sopenharmony_ci vpshufd \$0xaa,$xa3,$xa2 2995e1051a39Sopenharmony_ci vpshufd \$0xff,$xa3,$xa3 2996e1051a39Sopenharmony_ci vmovdqa64 $xa0,@key[0] 2997e1051a39Sopenharmony_ci vmovdqa64 $xa1,@key[1] 2998e1051a39Sopenharmony_ci vmovdqa64 $xa2,@key[2] 2999e1051a39Sopenharmony_ci vmovdqa64 $xa3,@key[3] 3000e1051a39Sopenharmony_ci 3001e1051a39Sopenharmony_ci vpshufd \$0x00,$xb3,$xb0 3002e1051a39Sopenharmony_ci vpshufd \$0x55,$xb3,$xb1 3003e1051a39Sopenharmony_ci vpshufd \$0xaa,$xb3,$xb2 3004e1051a39Sopenharmony_ci vpshufd \$0xff,$xb3,$xb3 3005e1051a39Sopenharmony_ci vmovdqa64 $xb0,@key[4] 3006e1051a39Sopenharmony_ci vmovdqa64 $xb1,@key[5] 3007e1051a39Sopenharmony_ci vmovdqa64 $xb2,@key[6] 3008e1051a39Sopenharmony_ci vmovdqa64 $xb3,@key[7] 3009e1051a39Sopenharmony_ci 3010e1051a39Sopenharmony_ci vpshufd \$0x00,$xc3,$xc0 3011e1051a39Sopenharmony_ci vpshufd \$0x55,$xc3,$xc1 3012e1051a39Sopenharmony_ci vpshufd \$0xaa,$xc3,$xc2 3013e1051a39Sopenharmony_ci vpshufd \$0xff,$xc3,$xc3 3014e1051a39Sopenharmony_ci vmovdqa64 $xc0,@key[8] 3015e1051a39Sopenharmony_ci vmovdqa64 $xc1,@key[9] 3016e1051a39Sopenharmony_ci vmovdqa64 $xc2,@key[10] 3017e1051a39Sopenharmony_ci vmovdqa64 $xc3,@key[11] 3018e1051a39Sopenharmony_ci 3019e1051a39Sopenharmony_ci vpshufd \$0x00,$xd3,$xd0 3020e1051a39Sopenharmony_ci vpshufd \$0x55,$xd3,$xd1 3021e1051a39Sopenharmony_ci vpshufd \$0xaa,$xd3,$xd2 3022e1051a39Sopenharmony_ci vpshufd \$0xff,$xd3,$xd3 3023e1051a39Sopenharmony_ci vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet 3024e1051a39Sopenharmony_ci vmovdqa64 $xd0,@key[12] 3025e1051a39Sopenharmony_ci vmovdqa64 $xd1,@key[13] 3026e1051a39Sopenharmony_ci vmovdqa64 $xd2,@key[14] 3027e1051a39Sopenharmony_ci vmovdqa64 $xd3,@key[15] 3028e1051a39Sopenharmony_ci 3029e1051a39Sopenharmony_ci mov \$10,%eax 3030e1051a39Sopenharmony_ci jmp .Loop16x 3031e1051a39Sopenharmony_ci 3032e1051a39Sopenharmony_ci.align 32 
3033e1051a39Sopenharmony_ci.Loop_outer16x: 3034e1051a39Sopenharmony_ci vpbroadcastd 0(%r10),$xa0 # reload key 3035e1051a39Sopenharmony_ci vpbroadcastd 4(%r10),$xa1 3036e1051a39Sopenharmony_ci vpbroadcastd 8(%r10),$xa2 3037e1051a39Sopenharmony_ci vpbroadcastd 12(%r10),$xa3 3038e1051a39Sopenharmony_ci vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters 3039e1051a39Sopenharmony_ci vmovdqa64 @key[4],$xb0 3040e1051a39Sopenharmony_ci vmovdqa64 @key[5],$xb1 3041e1051a39Sopenharmony_ci vmovdqa64 @key[6],$xb2 3042e1051a39Sopenharmony_ci vmovdqa64 @key[7],$xb3 3043e1051a39Sopenharmony_ci vmovdqa64 @key[8],$xc0 3044e1051a39Sopenharmony_ci vmovdqa64 @key[9],$xc1 3045e1051a39Sopenharmony_ci vmovdqa64 @key[10],$xc2 3046e1051a39Sopenharmony_ci vmovdqa64 @key[11],$xc3 3047e1051a39Sopenharmony_ci vmovdqa64 @key[12],$xd0 3048e1051a39Sopenharmony_ci vmovdqa64 @key[13],$xd1 3049e1051a39Sopenharmony_ci vmovdqa64 @key[14],$xd2 3050e1051a39Sopenharmony_ci vmovdqa64 @key[15],$xd3 3051e1051a39Sopenharmony_ci 3052e1051a39Sopenharmony_ci vmovdqa64 $xa0,@key[0] 3053e1051a39Sopenharmony_ci vmovdqa64 $xa1,@key[1] 3054e1051a39Sopenharmony_ci vmovdqa64 $xa2,@key[2] 3055e1051a39Sopenharmony_ci vmovdqa64 $xa3,@key[3] 3056e1051a39Sopenharmony_ci 3057e1051a39Sopenharmony_ci mov \$10,%eax 3058e1051a39Sopenharmony_ci jmp .Loop16x 3059e1051a39Sopenharmony_ci 3060e1051a39Sopenharmony_ci.align 32 3061e1051a39Sopenharmony_ci.Loop16x: 3062e1051a39Sopenharmony_ci___ 3063e1051a39Sopenharmony_ci foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3064e1051a39Sopenharmony_ci foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3065e1051a39Sopenharmony_ci$code.=<<___; 3066e1051a39Sopenharmony_ci dec %eax 3067e1051a39Sopenharmony_ci jnz .Loop16x 3068e1051a39Sopenharmony_ci 3069e1051a39Sopenharmony_ci vpaddd @key[0],$xa0,$xa0 # accumulate key 3070e1051a39Sopenharmony_ci vpaddd @key[1],$xa1,$xa1 3071e1051a39Sopenharmony_ci vpaddd @key[2],$xa2,$xa2 3072e1051a39Sopenharmony_ci vpaddd @key[3],$xa3,$xa3 
3073e1051a39Sopenharmony_ci 3074e1051a39Sopenharmony_ci vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3075e1051a39Sopenharmony_ci vpunpckldq $xa3,$xa2,$xt3 3076e1051a39Sopenharmony_ci vpunpckhdq $xa1,$xa0,$xa0 3077e1051a39Sopenharmony_ci vpunpckhdq $xa3,$xa2,$xa2 3078e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3079e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3080e1051a39Sopenharmony_ci vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3081e1051a39Sopenharmony_ci vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3082e1051a39Sopenharmony_ci___ 3083e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3084e1051a39Sopenharmony_ci$code.=<<___; 3085e1051a39Sopenharmony_ci vpaddd @key[4],$xb0,$xb0 3086e1051a39Sopenharmony_ci vpaddd @key[5],$xb1,$xb1 3087e1051a39Sopenharmony_ci vpaddd @key[6],$xb2,$xb2 3088e1051a39Sopenharmony_ci vpaddd @key[7],$xb3,$xb3 3089e1051a39Sopenharmony_ci 3090e1051a39Sopenharmony_ci vpunpckldq $xb1,$xb0,$xt2 3091e1051a39Sopenharmony_ci vpunpckldq $xb3,$xb2,$xt3 3092e1051a39Sopenharmony_ci vpunpckhdq $xb1,$xb0,$xb0 3093e1051a39Sopenharmony_ci vpunpckhdq $xb3,$xb2,$xb2 3094e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3095e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3096e1051a39Sopenharmony_ci vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3097e1051a39Sopenharmony_ci vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3098e1051a39Sopenharmony_ci___ 3099e1051a39Sopenharmony_ci ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3100e1051a39Sopenharmony_ci$code.=<<___; 3101e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further 3102e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xb0,$xa0,$xb0 3103e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xb1,$xa1,$xa0 3104e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xb1,$xa1,$xb1 3105e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xb2,$xa2,$xa1 3106e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xb2,$xa2,$xb2 3107e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xb3,$xa3,$xa2 
3108e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xb3,$xa3,$xb3 3109e1051a39Sopenharmony_ci___ 3110e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3111e1051a39Sopenharmony_ci$code.=<<___; 3112e1051a39Sopenharmony_ci vpaddd @key[8],$xc0,$xc0 3113e1051a39Sopenharmony_ci vpaddd @key[9],$xc1,$xc1 3114e1051a39Sopenharmony_ci vpaddd @key[10],$xc2,$xc2 3115e1051a39Sopenharmony_ci vpaddd @key[11],$xc3,$xc3 3116e1051a39Sopenharmony_ci 3117e1051a39Sopenharmony_ci vpunpckldq $xc1,$xc0,$xt2 3118e1051a39Sopenharmony_ci vpunpckldq $xc3,$xc2,$xt3 3119e1051a39Sopenharmony_ci vpunpckhdq $xc1,$xc0,$xc0 3120e1051a39Sopenharmony_ci vpunpckhdq $xc3,$xc2,$xc2 3121e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3122e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3123e1051a39Sopenharmony_ci vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3124e1051a39Sopenharmony_ci vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3125e1051a39Sopenharmony_ci___ 3126e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3127e1051a39Sopenharmony_ci$code.=<<___; 3128e1051a39Sopenharmony_ci vpaddd @key[12],$xd0,$xd0 3129e1051a39Sopenharmony_ci vpaddd @key[13],$xd1,$xd1 3130e1051a39Sopenharmony_ci vpaddd @key[14],$xd2,$xd2 3131e1051a39Sopenharmony_ci vpaddd @key[15],$xd3,$xd3 3132e1051a39Sopenharmony_ci 3133e1051a39Sopenharmony_ci vpunpckldq $xd1,$xd0,$xt2 3134e1051a39Sopenharmony_ci vpunpckldq $xd3,$xd2,$xt3 3135e1051a39Sopenharmony_ci vpunpckhdq $xd1,$xd0,$xd0 3136e1051a39Sopenharmony_ci vpunpckhdq $xd3,$xd2,$xd2 3137e1051a39Sopenharmony_ci vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3138e1051a39Sopenharmony_ci vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3139e1051a39Sopenharmony_ci vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3140e1051a39Sopenharmony_ci vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3141e1051a39Sopenharmony_ci___ 3142e1051a39Sopenharmony_ci ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3143e1051a39Sopenharmony_ci$code.=<<___; 3144e1051a39Sopenharmony_ci vshufi32x4 
\$0x44,$xd0,$xc0,$xt3 # "de-interlace" further 3145e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xd0,$xc0,$xd0 3146e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xd1,$xc1,$xc0 3147e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xd1,$xc1,$xd1 3148e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xd2,$xc2,$xc1 3149e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xd2,$xc2,$xd2 3150e1051a39Sopenharmony_ci vshufi32x4 \$0x44,$xd3,$xc3,$xc2 3151e1051a39Sopenharmony_ci vshufi32x4 \$0xee,$xd3,$xc3,$xd3 3152e1051a39Sopenharmony_ci___ 3153e1051a39Sopenharmony_ci ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3154e1051a39Sopenharmony_ci$code.=<<___; 3155e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further 3156e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 3157e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xd0,$xb0,$xc0 3158e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 3159e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xc1,$xa1,$xt1 3160e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 3161e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xd1,$xb1,$xc1 3162e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 3163e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xc2,$xa2,$xt2 3164e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 3165e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xd2,$xb2,$xc2 3166e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 3167e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xc3,$xa3,$xt3 3168e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 3169e1051a39Sopenharmony_ci vshufi32x4 \$0x88,$xd3,$xb3,$xc3 3170e1051a39Sopenharmony_ci vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 3171e1051a39Sopenharmony_ci___ 3172e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= 3173e1051a39Sopenharmony_ci ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); 3174e1051a39Sopenharmony_ci 3175e1051a39Sopenharmony_ci ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, 3176e1051a39Sopenharmony_ci $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = 
3177e1051a39Sopenharmony_ci ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3178e1051a39Sopenharmony_ci $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3179e1051a39Sopenharmony_ci$code.=<<___; 3180e1051a39Sopenharmony_ci cmp \$64*16,$len 3181e1051a39Sopenharmony_ci jb .Ltail16x 3182e1051a39Sopenharmony_ci 3183e1051a39Sopenharmony_ci vpxord 0x00($inp),$xa0,$xa0 # xor with input 3184e1051a39Sopenharmony_ci vpxord 0x40($inp),$xb0,$xb0 3185e1051a39Sopenharmony_ci vpxord 0x80($inp),$xc0,$xc0 3186e1051a39Sopenharmony_ci vpxord 0xc0($inp),$xd0,$xd0 3187e1051a39Sopenharmony_ci vmovdqu32 $xa0,0x00($out) 3188e1051a39Sopenharmony_ci vmovdqu32 $xb0,0x40($out) 3189e1051a39Sopenharmony_ci vmovdqu32 $xc0,0x80($out) 3190e1051a39Sopenharmony_ci vmovdqu32 $xd0,0xc0($out) 3191e1051a39Sopenharmony_ci 3192e1051a39Sopenharmony_ci vpxord 0x100($inp),$xa1,$xa1 3193e1051a39Sopenharmony_ci vpxord 0x140($inp),$xb1,$xb1 3194e1051a39Sopenharmony_ci vpxord 0x180($inp),$xc1,$xc1 3195e1051a39Sopenharmony_ci vpxord 0x1c0($inp),$xd1,$xd1 3196e1051a39Sopenharmony_ci vmovdqu32 $xa1,0x100($out) 3197e1051a39Sopenharmony_ci vmovdqu32 $xb1,0x140($out) 3198e1051a39Sopenharmony_ci vmovdqu32 $xc1,0x180($out) 3199e1051a39Sopenharmony_ci vmovdqu32 $xd1,0x1c0($out) 3200e1051a39Sopenharmony_ci 3201e1051a39Sopenharmony_ci vpxord 0x200($inp),$xa2,$xa2 3202e1051a39Sopenharmony_ci vpxord 0x240($inp),$xb2,$xb2 3203e1051a39Sopenharmony_ci vpxord 0x280($inp),$xc2,$xc2 3204e1051a39Sopenharmony_ci vpxord 0x2c0($inp),$xd2,$xd2 3205e1051a39Sopenharmony_ci vmovdqu32 $xa2,0x200($out) 3206e1051a39Sopenharmony_ci vmovdqu32 $xb2,0x240($out) 3207e1051a39Sopenharmony_ci vmovdqu32 $xc2,0x280($out) 3208e1051a39Sopenharmony_ci vmovdqu32 $xd2,0x2c0($out) 3209e1051a39Sopenharmony_ci 3210e1051a39Sopenharmony_ci vpxord 0x300($inp),$xa3,$xa3 3211e1051a39Sopenharmony_ci vpxord 0x340($inp),$xb3,$xb3 3212e1051a39Sopenharmony_ci vpxord 0x380($inp),$xc3,$xc3 3213e1051a39Sopenharmony_ci vpxord 0x3c0($inp),$xd3,$xd3 3214e1051a39Sopenharmony_ci lea 
0x400($inp),$inp 3215e1051a39Sopenharmony_ci vmovdqu32 $xa3,0x300($out) 3216e1051a39Sopenharmony_ci vmovdqu32 $xb3,0x340($out) 3217e1051a39Sopenharmony_ci vmovdqu32 $xc3,0x380($out) 3218e1051a39Sopenharmony_ci vmovdqu32 $xd3,0x3c0($out) 3219e1051a39Sopenharmony_ci lea 0x400($out),$out 3220e1051a39Sopenharmony_ci 3221e1051a39Sopenharmony_ci sub \$64*16,$len 3222e1051a39Sopenharmony_ci jnz .Loop_outer16x 3223e1051a39Sopenharmony_ci 3224e1051a39Sopenharmony_ci jmp .Ldone16x 3225e1051a39Sopenharmony_ci 3226e1051a39Sopenharmony_ci.align 32 3227e1051a39Sopenharmony_ci.Ltail16x: 3228e1051a39Sopenharmony_ci xor %r10,%r10 3229e1051a39Sopenharmony_ci sub $inp,$out 3230e1051a39Sopenharmony_ci cmp \$64*1,$len 3231e1051a39Sopenharmony_ci jb .Less_than_64_16x 3232e1051a39Sopenharmony_ci vpxord ($inp),$xa0,$xa0 # xor with input 3233e1051a39Sopenharmony_ci vmovdqu32 $xa0,($out,$inp) 3234e1051a39Sopenharmony_ci je .Ldone16x 3235e1051a39Sopenharmony_ci vmovdqa32 $xb0,$xa0 3236e1051a39Sopenharmony_ci lea 64($inp),$inp 3237e1051a39Sopenharmony_ci 3238e1051a39Sopenharmony_ci cmp \$64*2,$len 3239e1051a39Sopenharmony_ci jb .Less_than_64_16x 3240e1051a39Sopenharmony_ci vpxord ($inp),$xb0,$xb0 3241e1051a39Sopenharmony_ci vmovdqu32 $xb0,($out,$inp) 3242e1051a39Sopenharmony_ci je .Ldone16x 3243e1051a39Sopenharmony_ci vmovdqa32 $xc0,$xa0 3244e1051a39Sopenharmony_ci lea 64($inp),$inp 3245e1051a39Sopenharmony_ci 3246e1051a39Sopenharmony_ci cmp \$64*3,$len 3247e1051a39Sopenharmony_ci jb .Less_than_64_16x 3248e1051a39Sopenharmony_ci vpxord ($inp),$xc0,$xc0 3249e1051a39Sopenharmony_ci vmovdqu32 $xc0,($out,$inp) 3250e1051a39Sopenharmony_ci je .Ldone16x 3251e1051a39Sopenharmony_ci vmovdqa32 $xd0,$xa0 3252e1051a39Sopenharmony_ci lea 64($inp),$inp 3253e1051a39Sopenharmony_ci 3254e1051a39Sopenharmony_ci cmp \$64*4,$len 3255e1051a39Sopenharmony_ci jb .Less_than_64_16x 3256e1051a39Sopenharmony_ci vpxord ($inp),$xd0,$xd0 3257e1051a39Sopenharmony_ci vmovdqu32 $xd0,($out,$inp) 3258e1051a39Sopenharmony_ci je 
.Ldone16x 3259e1051a39Sopenharmony_ci vmovdqa32 $xa1,$xa0 3260e1051a39Sopenharmony_ci lea 64($inp),$inp 3261e1051a39Sopenharmony_ci 3262e1051a39Sopenharmony_ci cmp \$64*5,$len 3263e1051a39Sopenharmony_ci jb .Less_than_64_16x 3264e1051a39Sopenharmony_ci vpxord ($inp),$xa1,$xa1 3265e1051a39Sopenharmony_ci vmovdqu32 $xa1,($out,$inp) 3266e1051a39Sopenharmony_ci je .Ldone16x 3267e1051a39Sopenharmony_ci vmovdqa32 $xb1,$xa0 3268e1051a39Sopenharmony_ci lea 64($inp),$inp 3269e1051a39Sopenharmony_ci 3270e1051a39Sopenharmony_ci cmp \$64*6,$len 3271e1051a39Sopenharmony_ci jb .Less_than_64_16x 3272e1051a39Sopenharmony_ci vpxord ($inp),$xb1,$xb1 3273e1051a39Sopenharmony_ci vmovdqu32 $xb1,($out,$inp) 3274e1051a39Sopenharmony_ci je .Ldone16x 3275e1051a39Sopenharmony_ci vmovdqa32 $xc1,$xa0 3276e1051a39Sopenharmony_ci lea 64($inp),$inp 3277e1051a39Sopenharmony_ci 3278e1051a39Sopenharmony_ci cmp \$64*7,$len 3279e1051a39Sopenharmony_ci jb .Less_than_64_16x 3280e1051a39Sopenharmony_ci vpxord ($inp),$xc1,$xc1 3281e1051a39Sopenharmony_ci vmovdqu32 $xc1,($out,$inp) 3282e1051a39Sopenharmony_ci je .Ldone16x 3283e1051a39Sopenharmony_ci vmovdqa32 $xd1,$xa0 3284e1051a39Sopenharmony_ci lea 64($inp),$inp 3285e1051a39Sopenharmony_ci 3286e1051a39Sopenharmony_ci cmp \$64*8,$len 3287e1051a39Sopenharmony_ci jb .Less_than_64_16x 3288e1051a39Sopenharmony_ci vpxord ($inp),$xd1,$xd1 3289e1051a39Sopenharmony_ci vmovdqu32 $xd1,($out,$inp) 3290e1051a39Sopenharmony_ci je .Ldone16x 3291e1051a39Sopenharmony_ci vmovdqa32 $xa2,$xa0 3292e1051a39Sopenharmony_ci lea 64($inp),$inp 3293e1051a39Sopenharmony_ci 3294e1051a39Sopenharmony_ci cmp \$64*9,$len 3295e1051a39Sopenharmony_ci jb .Less_than_64_16x 3296e1051a39Sopenharmony_ci vpxord ($inp),$xa2,$xa2 3297e1051a39Sopenharmony_ci vmovdqu32 $xa2,($out,$inp) 3298e1051a39Sopenharmony_ci je .Ldone16x 3299e1051a39Sopenharmony_ci vmovdqa32 $xb2,$xa0 3300e1051a39Sopenharmony_ci lea 64($inp),$inp 3301e1051a39Sopenharmony_ci 3302e1051a39Sopenharmony_ci cmp \$64*10,$len 
3303e1051a39Sopenharmony_ci jb .Less_than_64_16x 3304e1051a39Sopenharmony_ci vpxord ($inp),$xb2,$xb2 3305e1051a39Sopenharmony_ci vmovdqu32 $xb2,($out,$inp) 3306e1051a39Sopenharmony_ci je .Ldone16x 3307e1051a39Sopenharmony_ci vmovdqa32 $xc2,$xa0 3308e1051a39Sopenharmony_ci lea 64($inp),$inp 3309e1051a39Sopenharmony_ci 3310e1051a39Sopenharmony_ci cmp \$64*11,$len 3311e1051a39Sopenharmony_ci jb .Less_than_64_16x 3312e1051a39Sopenharmony_ci vpxord ($inp),$xc2,$xc2 3313e1051a39Sopenharmony_ci vmovdqu32 $xc2,($out,$inp) 3314e1051a39Sopenharmony_ci je .Ldone16x 3315e1051a39Sopenharmony_ci vmovdqa32 $xd2,$xa0 3316e1051a39Sopenharmony_ci lea 64($inp),$inp 3317e1051a39Sopenharmony_ci 3318e1051a39Sopenharmony_ci cmp \$64*12,$len 3319e1051a39Sopenharmony_ci jb .Less_than_64_16x 3320e1051a39Sopenharmony_ci vpxord ($inp),$xd2,$xd2 3321e1051a39Sopenharmony_ci vmovdqu32 $xd2,($out,$inp) 3322e1051a39Sopenharmony_ci je .Ldone16x 3323e1051a39Sopenharmony_ci vmovdqa32 $xa3,$xa0 3324e1051a39Sopenharmony_ci lea 64($inp),$inp 3325e1051a39Sopenharmony_ci 3326e1051a39Sopenharmony_ci cmp \$64*13,$len 3327e1051a39Sopenharmony_ci jb .Less_than_64_16x 3328e1051a39Sopenharmony_ci vpxord ($inp),$xa3,$xa3 3329e1051a39Sopenharmony_ci vmovdqu32 $xa3,($out,$inp) 3330e1051a39Sopenharmony_ci je .Ldone16x 3331e1051a39Sopenharmony_ci vmovdqa32 $xb3,$xa0 3332e1051a39Sopenharmony_ci lea 64($inp),$inp 3333e1051a39Sopenharmony_ci 3334e1051a39Sopenharmony_ci cmp \$64*14,$len 3335e1051a39Sopenharmony_ci jb .Less_than_64_16x 3336e1051a39Sopenharmony_ci vpxord ($inp),$xb3,$xb3 3337e1051a39Sopenharmony_ci vmovdqu32 $xb3,($out,$inp) 3338e1051a39Sopenharmony_ci je .Ldone16x 3339e1051a39Sopenharmony_ci vmovdqa32 $xc3,$xa0 3340e1051a39Sopenharmony_ci lea 64($inp),$inp 3341e1051a39Sopenharmony_ci 3342e1051a39Sopenharmony_ci cmp \$64*15,$len 3343e1051a39Sopenharmony_ci jb .Less_than_64_16x 3344e1051a39Sopenharmony_ci vpxord ($inp),$xc3,$xc3 3345e1051a39Sopenharmony_ci vmovdqu32 $xc3,($out,$inp) 
3346e1051a39Sopenharmony_ci je .Ldone16x 3347e1051a39Sopenharmony_ci vmovdqa32 $xd3,$xa0 3348e1051a39Sopenharmony_ci lea 64($inp),$inp 3349e1051a39Sopenharmony_ci 3350e1051a39Sopenharmony_ci.Less_than_64_16x: 3351e1051a39Sopenharmony_ci vmovdqa32 $xa0,0x00(%rsp) 3352e1051a39Sopenharmony_ci lea ($out,$inp),$out 3353e1051a39Sopenharmony_ci and \$63,$len 3354e1051a39Sopenharmony_ci 3355e1051a39Sopenharmony_ci.Loop_tail16x: 3356e1051a39Sopenharmony_ci movzb ($inp,%r10),%eax 3357e1051a39Sopenharmony_ci movzb (%rsp,%r10),%ecx 3358e1051a39Sopenharmony_ci lea 1(%r10),%r10 3359e1051a39Sopenharmony_ci xor %ecx,%eax 3360e1051a39Sopenharmony_ci mov %al,-1($out,%r10) 3361e1051a39Sopenharmony_ci dec $len 3362e1051a39Sopenharmony_ci jnz .Loop_tail16x 3363e1051a39Sopenharmony_ci 3364e1051a39Sopenharmony_ci vpxord $xa0,$xa0,$xa0 3365e1051a39Sopenharmony_ci vmovdqa32 $xa0,0(%rsp) 3366e1051a39Sopenharmony_ci 3367e1051a39Sopenharmony_ci.Ldone16x: 3368e1051a39Sopenharmony_ci vzeroall 3369e1051a39Sopenharmony_ci___ 3370e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 3371e1051a39Sopenharmony_ci movaps -0xa8(%r9),%xmm6 3372e1051a39Sopenharmony_ci movaps -0x98(%r9),%xmm7 3373e1051a39Sopenharmony_ci movaps -0x88(%r9),%xmm8 3374e1051a39Sopenharmony_ci movaps -0x78(%r9),%xmm9 3375e1051a39Sopenharmony_ci movaps -0x68(%r9),%xmm10 3376e1051a39Sopenharmony_ci movaps -0x58(%r9),%xmm11 3377e1051a39Sopenharmony_ci movaps -0x48(%r9),%xmm12 3378e1051a39Sopenharmony_ci movaps -0x38(%r9),%xmm13 3379e1051a39Sopenharmony_ci movaps -0x28(%r9),%xmm14 3380e1051a39Sopenharmony_ci movaps -0x18(%r9),%xmm15 3381e1051a39Sopenharmony_ci___ 3382e1051a39Sopenharmony_ci$code.=<<___; 3383e1051a39Sopenharmony_ci lea (%r9),%rsp 3384e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 3385e1051a39Sopenharmony_ci.L16x_epilogue: 3386e1051a39Sopenharmony_ci ret 3387e1051a39Sopenharmony_ci.cfi_endproc 3388e1051a39Sopenharmony_ci.size ChaCha20_16x,.-ChaCha20_16x 3389e1051a39Sopenharmony_ci___ 3390e1051a39Sopenharmony_ci 
# switch to %ymm domain
#
# ChaCha20_8xvl is the 256-bit (%ymm) counterpart of ChaCha20_16x.  The
# working state is held in %ymm0-15 and the broadcast key material in
# %ymm16-31 (@key); the upper sixteen registers are reachable only
# through EVEX encoding, hence the AVX512VL requirement.  One pass of
# .Loop_outer8xvl yields 64*8 bytes of key stream; anything shorter is
# finished at .Ltail8xvl.
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];	# temporaries alias %ymm16-19

# Prologue: %r9 keeps the caller's %rsp, the new frame is 64-byte aligned.
$code.=<<___;
.type	ChaCha20_8xvl,\@function,5
.align	32
ChaCha20_8xvl:
.cfi_startproc
.LChaCha20_8xvl:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub		\$64+$xframe,%rsp
	and		\$-64,%rsp
___
# Win64 only: xmm6-xmm15 are saved just below the frame register.  The
# .L8xvl_body label is what the SEH tables refer to as end of prologue.
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L8xvl_body:
___
# Load sigma/key/counter, splat every 32-bit word across a register and
# keep a copy of each in @key[0..15] for the final accumulation.
# .Loop_outer8xvl re-creates the working state from those copies and
# advances the per-lane counters in @key[12] by .Leight.
$code.=<<___;
	vzeroupper

	lea		.Lsigma(%rip),%r10
	vbroadcasti128	(%r10),$xa3		# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xc3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vpshufd		\$0xaa,$xa3,$xa2
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vpshufd		\$0xaa,$xb3,$xb2
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd		\$0x00,$xc3,$xc0
	vpshufd		\$0x55,$xc3,$xc1
	vpshufd		\$0xaa,$xc3,$xc2
	vpshufd		\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpshufd		\$0xaa,$xd3,$xd2
	vpshufd		\$0xff,$xd3,$xd3
	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop_outer8xvl:
	#vpbroadcastd	0(%r10),$xa0		# reload key
	#vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd		.Leight(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop8xvl:
___
	# AVX512_lane_ROUND returns a list of Perl statements which are
	# eval'ed in order.  The two calls form the body of .Loop8xvl,
	# which runs ten times (%eax is preset to 10 above).
	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec		%eax
	jnz		.Loop8xvl

	vpaddd		@key[0],$xa0,$xa0	# accumulate key
	vpaddd		@key[1],$xa1,$xa1
	vpaddd		@key[2],$xa2,$xa2
	vpaddd		@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	# The transpose leaves each row in a register other than the one
	# its name suggests (see the quoted names above); rebind the Perl
	# variables instead of emitting moves.  The same trick is applied
	# after every de-interlace step below.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$3,$xb0,$xa0,$xb0
	vshufi32x4	\$0,$xb1,$xa1,$xa0
	vshufi32x4	\$3,$xb1,$xa1,$xb1
	vshufi32x4	\$0,$xb2,$xa2,$xa1
	vshufi32x4	\$3,$xb2,$xa2,$xb2
	vshufi32x4	\$0,$xb3,$xa3,$xa2
	vshufi32x4	\$3,$xb3,$xa3,$xb3
___
	# After this rebinding $xa0 is %ymm19 and $xa2 is %ymm18 (the
	# former temporaries).  Those registers have no VEX encoding, so
	# loads/stores of $xa0 and $xa2 below use vpxord/vmovdqu32 while
	# all other rows use plain vpxor/vmovdqu.
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	# Swap the "b" and "c" names so that the a,b,c,d rows of the same
	# index cover 0x80 consecutive output bytes.
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
# Full 512-byte stride: xor four 0x80-byte groups with the input.  The
# first two state rows are reloaded into %ymm0/%ymm1 by register number
# because $xa0/$xa1 no longer name them after the rebinding above.
$code.=<<___;
	cmp		\$64*8,$len
	jb		.Ltail8xvl

	mov		\$0x80,%eax		# size optimization
	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vpxor		0x40($inp),$xc1,$xc1
	vpxor		0x60($inp),$xd1,$xd1
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa1,0x00($out)
	vmovdqu		$xb1,0x20($out)
	vmovdqu		$xc1,0x40($out)
	vmovdqu		$xd1,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxord		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vpxor		0x40($inp),$xc2,$xc2
	vpxor		0x60($inp),$xd2,$xd2
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa2,0x00($out)
	vmovdqu		$xb2,0x20($out)
	vmovdqu		$xc2,0x40($out)
	vmovdqu		$xd2,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vpxor		0x40($inp),$xc3,$xc3
	vpxor		0x60($inp),$xd3,$xd3
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa3,0x00($out)
	vmovdqu		$xb3,0x20($out)
	vmovdqu		$xc3,0x40($out)
	vmovdqu		$xd3,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpbroadcastd	0(%r10),%ymm0		# reload key
	vpbroadcastd	4(%r10),%ymm1

	sub		\$64*8,$len
	jnz		.Loop_outer8xvl

	jmp		.Ldone8xvl

.align	32
.Ltail8xvl:
	vmovdqa64	$xa0,%ymm8		# size optimization
___
# From here on $xa0 names %ymm8 (unused at this point), so the tail can
# use the VEX forms for it.  The tail consumes whole 64-byte blocks one
# at a time, always staging the next key-stream block in $xa0/$xb0; a
# final partial block is spilled to the stack and xor'ed byte by byte,
# after which the stack copy is wiped.
$xa0 = "%ymm8";
$code.=<<___;
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vmovdqu		$xa0,0x00($out,$inp)
	vmovdqu		$xb0,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc0,$xa0
	vmovdqa		$xd0,$xb0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc0,$xc0
	vpxor		0x20($inp),$xd0,$xd0
	vmovdqu		$xc0,0x00($out,$inp)
	vmovdqu		$xd0,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xa1,$xa0
	vmovdqa		$xb1,$xb0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vmovdqu		$xa1,0x00($out,$inp)
	vmovdqu		$xb1,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc1,$xa0
	vmovdqa		$xd1,$xb0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc1,$xc1
	vpxor		0x20($inp),$xd1,$xd1
	vmovdqu		$xc1,0x00($out,$inp)
	vmovdqu		$xd1,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa32	$xa2,$xa0
	vmovdqa		$xb2,$xb0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_8xvl
	vpxord		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vmovdqu32	$xa2,0x00($out,$inp)
	vmovdqu		$xb2,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc2,$xa0
	vmovdqa		$xd2,$xb0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc2,$xc2
	vpxor		0x20($inp),$xd2,$xd2
	vmovdqu		$xc2,0x00($out,$inp)
	vmovdqu		$xd2,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xa3,$xa0
	vmovdqa		$xb3,$xb0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vmovdqu		$xa3,0x00($out,$inp)
	vmovdqu		$xb3,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc3,$xa0
	vmovdqa		$xd3,$xb0
	lea		64($inp),$inp

.Less_than_64_8xvl:
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xb0,0x20(%rsp)
	lea		($out,$inp),$out
	and		\$63,$len

.Loop_tail8xvl:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail8xvl

	vpxor		$xa0,$xa0,$xa0
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xa0,0x20(%rsp)

.Ldone8xvl:
	vzeroall
___
# Win64 only: reload the xmm registers saved in the prologue.
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8xvl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8xvl,.-ChaCha20_8xvl
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handlers.
#
# se_handler serves ChaCha20_ctr32: if the faulting Rip lies between
# .Lctr32_body and .Lno_data it reloads rbx, rbp and r12-r15 from the
# routine's frame and stores them into CONTEXT.
#
# simd_handler serves the SIMD routines.  Its HandlerData[] holds the
# RVAs of the body and epilogue labels followed by the byte size of the
# xmm save area, which it copies into CONTEXT starting at Xmm6 (the
# frame pointer is taken from context->R9).
#
# Both end up in .Lcommon_seh_tail, which copies CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lno_data
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8, quadwords to copy
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.type	simd_handler,\@abi-omnipotent
.align	16
simd_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	neg	%rcx
	lea	-8(%rax,%rcx),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	neg	%ecx
	shr	\$3,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	simd_handler,.-simd_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_ChaCha20_8x 3945e1051a39Sopenharmony_ci .rva .LSEH_end_ChaCha20_8x 3946e1051a39Sopenharmony_ci .rva .LSEH_info_ChaCha20_8x 3947e1051a39Sopenharmony_ci___ 3948e1051a39Sopenharmony_ci$code.=<<___ if ($avx>2); 3949e1051a39Sopenharmony_ci .rva .LSEH_begin_ChaCha20_avx512 3950e1051a39Sopenharmony_ci .rva .LSEH_end_ChaCha20_avx512 3951e1051a39Sopenharmony_ci .rva .LSEH_info_ChaCha20_avx512 3952e1051a39Sopenharmony_ci 3953e1051a39Sopenharmony_ci .rva .LSEH_begin_ChaCha20_avx512vl 3954e1051a39Sopenharmony_ci .rva .LSEH_end_ChaCha20_avx512vl 3955e1051a39Sopenharmony_ci .rva .LSEH_info_ChaCha20_avx512vl 3956e1051a39Sopenharmony_ci 3957e1051a39Sopenharmony_ci .rva .LSEH_begin_ChaCha20_16x 3958e1051a39Sopenharmony_ci .rva .LSEH_end_ChaCha20_16x 3959e1051a39Sopenharmony_ci .rva .LSEH_info_ChaCha20_16x 3960e1051a39Sopenharmony_ci 3961e1051a39Sopenharmony_ci .rva .LSEH_begin_ChaCha20_8xvl 3962e1051a39Sopenharmony_ci .rva .LSEH_end_ChaCha20_8xvl 3963e1051a39Sopenharmony_ci .rva .LSEH_info_ChaCha20_8xvl 3964e1051a39Sopenharmony_ci___ 3965e1051a39Sopenharmony_ci$code.=<<___; 3966e1051a39Sopenharmony_ci.section .xdata 3967e1051a39Sopenharmony_ci.align 8 3968e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_ctr32: 3969e1051a39Sopenharmony_ci .byte 9,0,0,0 3970e1051a39Sopenharmony_ci .rva se_handler 3971e1051a39Sopenharmony_ci 3972e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_ssse3: 3973e1051a39Sopenharmony_ci .byte 9,0,0,0 3974e1051a39Sopenharmony_ci .rva simd_handler 3975e1051a39Sopenharmony_ci .rva .Lssse3_body,.Lssse3_epilogue 3976e1051a39Sopenharmony_ci .long 0x20,0 3977e1051a39Sopenharmony_ci 3978e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_128: 3979e1051a39Sopenharmony_ci .byte 9,0,0,0 3980e1051a39Sopenharmony_ci .rva simd_handler 3981e1051a39Sopenharmony_ci .rva .L128_body,.L128_epilogue 3982e1051a39Sopenharmony_ci .long 0x60,0 3983e1051a39Sopenharmony_ci 3984e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_4x: 3985e1051a39Sopenharmony_ci .byte 9,0,0,0 
3986e1051a39Sopenharmony_ci .rva simd_handler 3987e1051a39Sopenharmony_ci .rva .L4x_body,.L4x_epilogue 3988e1051a39Sopenharmony_ci .long 0xa0,0 3989e1051a39Sopenharmony_ci___ 3990e1051a39Sopenharmony_ci$code.=<<___ if ($avx); 3991e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_4xop: 3992e1051a39Sopenharmony_ci .byte 9,0,0,0 3993e1051a39Sopenharmony_ci .rva simd_handler 3994e1051a39Sopenharmony_ci .rva .L4xop_body,.L4xop_epilogue # HandlerData[] 3995e1051a39Sopenharmony_ci .long 0xa0,0 3996e1051a39Sopenharmony_ci___ 3997e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 3998e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_8x: 3999e1051a39Sopenharmony_ci .byte 9,0,0,0 4000e1051a39Sopenharmony_ci .rva simd_handler 4001e1051a39Sopenharmony_ci .rva .L8x_body,.L8x_epilogue # HandlerData[] 4002e1051a39Sopenharmony_ci .long 0xa0,0 4003e1051a39Sopenharmony_ci___ 4004e1051a39Sopenharmony_ci$code.=<<___ if ($avx>2); 4005e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_avx512: 4006e1051a39Sopenharmony_ci .byte 9,0,0,0 4007e1051a39Sopenharmony_ci .rva simd_handler 4008e1051a39Sopenharmony_ci .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] 4009e1051a39Sopenharmony_ci .long 0x20,0 4010e1051a39Sopenharmony_ci 4011e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_avx512vl: 4012e1051a39Sopenharmony_ci .byte 9,0,0,0 4013e1051a39Sopenharmony_ci .rva simd_handler 4014e1051a39Sopenharmony_ci .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] 4015e1051a39Sopenharmony_ci .long 0x20,0 4016e1051a39Sopenharmony_ci 4017e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_16x: 4018e1051a39Sopenharmony_ci .byte 9,0,0,0 4019e1051a39Sopenharmony_ci .rva simd_handler 4020e1051a39Sopenharmony_ci .rva .L16x_body,.L16x_epilogue # HandlerData[] 4021e1051a39Sopenharmony_ci .long 0xa0,0 4022e1051a39Sopenharmony_ci 4023e1051a39Sopenharmony_ci.LSEH_info_ChaCha20_8xvl: 4024e1051a39Sopenharmony_ci .byte 9,0,0,0 4025e1051a39Sopenharmony_ci .rva simd_handler 4026e1051a39Sopenharmony_ci .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] 
.long 0xa0,0
___
}

# Final pass: walk the accumulated assembly text in $code line by line,
# expand the embedded compile-time expressions, and write each line to
# STDOUT.  $_ is kept as the loop topic so that evaluated snippets see the
# same environment as before.
my @asm_lines = split(/\n/, $code);
for (@asm_lines) {
    # Anything between backticks is Perl: evaluate it and splice the
    # resulting value in place of the back-quoted span.
    s{\`([^\`]*)\`}{eval $1}ge;

    # "down-shift": collapse the literal marker `%x#%y` or `%x#%z` to
    # `%x`, so that e.g. "%x#%ymm0" is emitted as "%xmm0".
    s{%x#%[yz]}{%x}g;

    print $_, "\n";
}

# Buffered write errors only surface on close, hence the explicit check.
close(STDOUT) or die "error closing STDOUT: $!";