#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-4.8(*)  AVX(**)  AVX2   AVX-512
# P4            4.46/+120%       -
# Core 2        2.41/+90%        -
# Westmere      1.88/+120%       -
# Sandy Bridge  1.39/+140%       1.10
# Haswell       1.14/+175%       1.11     0.65
# Skylake[-X]   1.13/+120%       0.96     0.51   [0.35]
# Silvermont    2.83/+95%        -
# Knights L     3.60/?           1.65     1.10   0.41(***)
# Goldmont      1.70/+180%       -
# VIA Nano      1.82/+150%       -
# Sledgehammer  1.38/+160%       -
# Bulldozer     2.30/+130%       0.97
# Ryzen         1.15/+200%       1.08     1.18
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
# __int128 code;
# (**) SSE2 implementation was attempted, but among non-AVX processors
# it was faster than integer-only code only on older Intel P4 and
# Core processors, 50-30%, less newer processor is, but slower on
# contemporary ones, for example almost 2x slower on Atom, and as
# former are naturally disappearing, SSE2 is deemed unnecessary;
# (***) strangely enough performance seems to vary from core to core,
# listed result is best case;

# Command line: [flavour] [output]. A first argument that contains a dot is
# taken to be the output file name, i.e. no flavour was given.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# With neither flavour nor output the script runs in "kernel" mode: nothing is
# piped through x86_64-xlate.pl and the assembler is not probed.
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Look for the translator next to this script, then in ../../perlasm.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything printed to STDOUT from here on is fed to the translator.
	# Fail loudly if the pipe cannot be set up instead of silently
	# generating nothing.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# Each probe below adds one to $avx for every version threshold the
	# tool meets; later probes only run while $avx is still zero.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	# Accept multi-digit major versions and vendor-prefixed banners
	# ("Apple clang version 11.0.0"); a single-digit [3-9] pattern anchored
	# at line start leaves $avx at zero for clang 10 and later.
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

# declare_function($name, $align, $nargs)
#
# Appends the opening of an assembly function called $name to $code.
# In kernel mode this is SYM_FUNC_START($name) followed by a local ".L$name"
# label, and $align/$nargs are not used. Otherwise it is the
# .globl/.type/.align directives followed by the "$name:" label.
#
# No prototype is declared on purpose: an empty "()" prototype contradicts the
# three arguments and turns any call written without a leading "&" into a
# compile-time error. Existing "&declare_function(...)" calls are unaffected.
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

# end_function($name)
#
# Appends the closing of the assembly function $name to $code:
# SYM_FUNC_END($name) in kernel mode, a ".size" directive otherwise.
# Declared without a prototype for the same reason as declare_function.
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

# Constant pool used by the vector code paths; only emitted when some AVX
# level is enabled. Kernel builds place it in .rodata.
if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64

.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___

# Register assignments shared by the scalar routines. Note that $d3 and $ctx
# are both %rdi, and that $mac/$nonce reuse the $inp/$len registers.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len); # *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# poly1305_iteration()
#
# Appends one multiply-and-reduce step to $code.
# The emitted code also reads $s1 and overwrites %rax, %rdx and $d1-$d3.
# Because $d3 is %rdi, the same register as $ctx, code that still needs $ctx
# afterwards has to save it around this sequence.
sub poly1305_iteration {
# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output: $h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq $h0 # h0*r1
	mov %rax,$d2
	mov $r0,%rax
	mov %rdx,$d3

	mulq $h0 # h0*r0
	mov %rax,$h0 # future $h0
	mov $r0,%rax
	mov %rdx,$d1

	mulq $h1 # h1*r0
	add %rax,$d2
	mov $s1,%rax
	adc %rdx,$d3

	mulq $h1 # h1*s1
	mov $h2,$h1 # borrow $h1
	add %rax,$h0
	adc %rdx,$d1

	imulq $s1,$h1 # h2*s1
	add $h1,$d2
	mov $d1,$h1
	adc \$0,$d3

	imulq $r0,$h2 # h2*r0
	add $d2,$h1
	mov \$-4,%rax # mask value
	adc $h2,$d3

	and $d3,%rax # last reduction step
	mov $d3,$h2
	shr \$2,$d3
	and \$3,$h2
	add $d3,%rax
	add %rax,$h0
	adc \$0,$h1
	adc \$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
# unsigned __int64 h[3]; # current hash value base 2^64
# unsigned __int64 r[2]; # key value base 2^64

$code.=<<___;
.text
___
# Standalone (non-kernel) builds reference the capability vector and declare
# the three scalar entry points as hidden globals.
$code.=<<___ if (!$kernel);
.extern OPENSSL_ia32cap_P

.globl poly1305_init_x86_64
.hidden poly1305_init_x86_64
.globl poly1305_blocks_x86_64
.hidden poly1305_blocks_x86_64
.globl poly1305_emit_x86_64
.hidden poly1305_emit_x86_64
___
# poly1305_init_x86_64: zeroes the three hash words at $ctx+0/8/16. If the key
# pointer ($inp) is NULL it returns with %eax still 0. Otherwise it stores the
# two masked key words at $ctx+24/32 and returns 1 in %eax.
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor %eax,%eax
	mov %rax,0($ctx) # initialize hash value
	mov %rax,8($ctx)
	mov %rax,16($ctx)

	test $inp,$inp
	je .Lno_key
___
# Standalone builds additionally pick the blocks/emit implementations at run
# time: start from the scalar routines, then let capability bits override
# them with the AVX and AVX2 variants when those were assembled in.
$code.=<<___ if (!$kernel);
	lea poly1305_blocks_x86_64(%rip),%r10
	lea poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov OPENSSL_ia32cap_P+4(%rip),%r9
	lea poly1305_blocks_avx(%rip),%rax
	lea poly1305_emit_avx(%rip),%rcx
	bt \$`60-32`,%r9 # AVX?
	cmovc %rax,%r10
	cmovc %rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea poly1305_blocks_avx2(%rip),%rax
	bt \$`5+32`,%r9 # AVX2?
	cmovc %rax,%r10
___
# With $avx>3, branch to .Linit_base2_44 (defined outside this section) when
# bits 16, 21 and 31 of the upper capability word are all set.
# NOTE(review): presumably the AVX512 IFMA feature set - confirm against the
# capability vector layout.
$code.=<<___ if (!$kernel && $avx>3);
	mov \$`(1<<31|1<<21|1<<16)`,%rax
	shr \$32,%r9
	and %rax,%r9
	cmp %rax,%r9
	je .Linit_base2_44
___
# Mask (clamp) the two 64-bit key words and store them as r[0], r[1].
$code.=<<___;
	mov \$0x0ffffffc0fffffff,%rax
	mov \$0x0ffffffc0ffffffc,%rcx
	and 0($inp),%rax
	and 8($inp),%rcx
	mov %rax,24($ctx)
	mov %rcx,32($ctx)
___
# Standalone builds write the chosen blocks/emit pointers through %rdx:
# 8-byte slots normally, 4-byte slots for elf32 flavours.
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov %r10,0(%rdx)
	mov %r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov %r10d,0(%rdx)
	mov %r11d,4(%rdx)
___
$code.=<<___;
	mov \$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_init_x86_64");

# poly1305_blocks_x86_64: processes $len/16 whole blocks from $inp and returns
# immediately when that count is zero. $ctx is pushed together with the saved
# registers because poly1305_iteration() uses %rdi as scratch ($d3); it is
# reloaded from the stack before the hash is written back.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr \$4,$len
	jz .Lno_data # too short

	push %rbx
.cfi_push %rbx
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
	push $ctx
.cfi_push $ctx
.Lblocks_body:

	mov $len,%r15 # reassign $len

	mov 24($ctx),$r0 # load r
	mov 32($ctx),$s1

	mov 0($ctx),$h0 # load hash value
	mov 8($ctx),$h1
	mov 16($ctx),$h2

	mov $s1,$r1
	shr \$2,$s1
	mov $r1,%rax
	add $r1,$s1 # s1 = r1 + (r1 >> 2)
	jmp .Loop

.align 32
.Loop:
	add 0($inp),$h0 # accumulate input
	adc 8($inp),$h1
	lea 16($inp),$inp
	adc $padbit,$h2
___

	# Loop body: h = (h + block) * r, with %rax preloaded with $r1.
	&poly1305_iteration();

$code.=<<___;
	mov $r1,%rax
	dec %r15 # len-=16
	jnz .Loop

	mov 0(%rsp),$ctx
.cfi_restore $ctx

	mov $h0,0($ctx) # store hash value
	mov $h1,8($ctx)
	mov $h2,16($ctx)

	mov 8(%rsp),%r15
.cfi_restore %r15
	mov 16(%rsp),%r14
.cfi_restore %r14
	mov 24(%rsp),%r13
.cfi_restore %r13
	mov 32(%rsp),%r12
.cfi_restore %r12
	mov 40(%rsp),%rbx
.cfi_restore %rbx
	lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

# poly1305_emit_x86_64: loads the hash, computes hash+5 and keeps that sum
# when it carries past bit 130 (otherwise keeps the hash as loaded). It then
# adds the 128-bit value at $nonce and writes the 16-byte result to $mac.
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov 0($ctx),%r8 # load hash value
	mov 8($ctx),%r9
	mov 16($ctx),%r10

	mov %r8,%rax
	add \$5,%r8 # compare to modulus
	mov %r9,%rcx
	adc \$0,%r9
	adc \$0,%r10
	shr \$2,%r10 # did 130-bit value overflow?
	cmovnz %r8,%rax
	cmovnz %r9,%rcx

	add 0($nonce),%rax # accumulate nonce
	adc 8($nonce),%rcx
	mov %rax,0($mac) # write result
	mov %rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
# Everything from here on is generated only when an AVX level is enabled.
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
# unsigned __int32 h[5]; # current hash value base 2^26
# unsigned __int32 is_base2_26;
# unsigned __int64 r[2]; # key value base 2^64
# unsigned __int64 pad;
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

# Symbolic names for %xmm0..%xmm15, in the order listed.
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

# Two local helpers are emitted by the statements below.
#
# __poly1305_block: poly1305_iteration() wrapped in push/pop of $ctx, because
# the iteration uses %rdi ($d3) as scratch.
#
# __poly1305_init_avx: fills the table of key powers in the opaque area.
# It reads $r0, $r1 and $s1 without loading them, so the caller must have set
# them up. Starting from h = r it calls __poly1305_block three times to obtain
# r^2, r^3 and r^4. After each step the value is split into five 26-bit digits
# and stored, with 5x multiples of digits 1..4, into the 16-byte table rows.
# Column offsets are +0 for r^2, +4 for r, +12 for r^3 and +8 for r^4.
# $ctx is advanced by 48+64 for the duration (rows are addressed as
# 16*row+column-64) and moved back before returning.
$code.=<<___;
.type __poly1305_block,\@abi-omnipotent
.align 32
__poly1305_block:
	push $ctx
___
	&poly1305_iteration();
$code.=<<___;
	pop $ctx
	RET
.size __poly1305_block,.-__poly1305_block

.type __poly1305_init_avx,\@abi-omnipotent
.align 32
__poly1305_init_avx:
	push %rbp
	mov %rsp,%rbp
	mov $r0,$h0
	mov $r1,$h1
	xor $h2,$h2

	lea 48+64($ctx),$ctx # size optimization

	mov $r1,%rax
	call __poly1305_block # r^2

	mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
	mov \$0x3ffffff,%edx
	mov $h0,$d1
	and $h0#d,%eax
	mov $r0,$d2
	and $r0#d,%edx
	mov %eax,`16*0+0-64`($ctx)
	shr \$26,$d1
	mov %edx,`16*0+4-64`($ctx)
	shr \$26,$d2

	mov \$0x3ffffff,%eax
	mov \$0x3ffffff,%edx
	and $d1#d,%eax
	and $d2#d,%edx
	mov %eax,`16*1+0-64`($ctx)
	lea (%rax,%rax,4),%eax # *5
	mov %edx,`16*1+4-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	mov %eax,`16*2+0-64`($ctx)
	shr \$26,$d1
	mov %edx,`16*2+4-64`($ctx)
	shr \$26,$d2

	mov $h1,%rax
	mov $r1,%rdx
	shl \$12,%rax
	shl \$12,%rdx
	or $d1,%rax
	or $d2,%rdx
	and \$0x3ffffff,%eax
	and \$0x3ffffff,%edx
	mov %eax,`16*3+0-64`($ctx)
	lea (%rax,%rax,4),%eax # *5
	mov %edx,`16*3+4-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	mov %eax,`16*4+0-64`($ctx)
	mov $h1,$d1
	mov %edx,`16*4+4-64`($ctx)
	mov $r1,$d2

	mov \$0x3ffffff,%eax
	mov \$0x3ffffff,%edx
	shr \$14,$d1
	shr \$14,$d2
	and $d1#d,%eax
	and $d2#d,%edx
	mov %eax,`16*5+0-64`($ctx)
	lea (%rax,%rax,4),%eax # *5
	mov %edx,`16*5+4-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	mov %eax,`16*6+0-64`($ctx)
	shr \$26,$d1
	mov %edx,`16*6+4-64`($ctx)
	shr \$26,$d2

	mov $h2,%rax
	shl \$24,%rax
	or %rax,$d1
	mov $d1#d,`16*7+0-64`($ctx)
	lea ($d1,$d1,4),$d1 # *5
	mov $d2#d,`16*7+4-64`($ctx)
	lea ($d2,$d2,4),$d2 # *5
	mov $d1#d,`16*8+0-64`($ctx)
	mov $d2#d,`16*8+4-64`($ctx)

	mov $r1,%rax
	call __poly1305_block # r^3

	mov \$0x3ffffff,%eax # save r^3 base 2^26
	mov $h0,$d1
	and $h0#d,%eax
	shr \$26,$d1
	mov %eax,`16*0+12-64`($ctx)

	mov \$0x3ffffff,%edx
	and $d1#d,%edx
	mov %edx,`16*1+12-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	shr \$26,$d1
	mov %edx,`16*2+12-64`($ctx)

	mov $h1,%rax
	shl \$12,%rax
	or $d1,%rax
	and \$0x3ffffff,%eax
	mov %eax,`16*3+12-64`($ctx)
	lea (%rax,%rax,4),%eax # *5
	mov $h1,$d1
	mov %eax,`16*4+12-64`($ctx)

	mov \$0x3ffffff,%edx
	shr \$14,$d1
	and $d1#d,%edx
	mov %edx,`16*5+12-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	shr \$26,$d1
	mov %edx,`16*6+12-64`($ctx)

	mov $h2,%rax
	shl \$24,%rax
	or %rax,$d1
	mov $d1#d,`16*7+12-64`($ctx)
	lea ($d1,$d1,4),$d1 # *5
	mov $d1#d,`16*8+12-64`($ctx)

	mov $r1,%rax
	call __poly1305_block # r^4

	mov \$0x3ffffff,%eax # save r^4 base 2^26
	mov $h0,$d1
	and $h0#d,%eax
	shr \$26,$d1
	mov %eax,`16*0+8-64`($ctx)

	mov \$0x3ffffff,%edx
	and $d1#d,%edx
	mov %edx,`16*1+8-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	shr \$26,$d1
	mov %edx,`16*2+8-64`($ctx)

	mov $h1,%rax
	shl \$12,%rax
	or $d1,%rax
	and \$0x3ffffff,%eax
	mov %eax,`16*3+8-64`($ctx)
	lea (%rax,%rax,4),%eax # *5
	mov $h1,$d1
	mov %eax,`16*4+8-64`($ctx)

	mov \$0x3ffffff,%edx
	shr \$14,$d1
	and $d1#d,%edx
	mov %edx,`16*5+8-64`($ctx)
	lea (%rdx,%rdx,4),%edx # *5
	shr \$26,$d1
	mov %edx,`16*6+8-64`($ctx)

	mov $h2,%rax
	shl \$24,%rax
	or %rax,$d1
	mov $d1#d,`16*7+8-64`($ctx)
	lea ($d1,$d1,4),$d1 # *5
	mov $d1#d,`16*8+8-64`($ctx)

	lea -48-64($ctx),$ctx # size [de-]optimization
	pop %rbp
	RET
.size __poly1305_init_avx,.-__poly1305_init_avx
___

# poly1305_blocks_avx: inputs shorter than 128 bytes go to the scalar .Lblocks
# path while the hash is not yet in base 2^26 (is_base2_26 == 0).
# NOTE(review): the body of this routine continues beyond this section and is
# not described here.
&declare_function("poly1305_blocks_avx", 32, 4);
$code.=<<___;
.cfi_startproc
	mov 20($ctx),%r8d # is_base2_26
	cmp \$128,$len
	jae .Lblocks_avx
	test %r8d,%r8d
	jz .Lblocks

.Lblocks_avx:
	and \$-16,$len
	jz .Lno_data_avx

	vzeroupper

	test %r8d,%r8d
	jz .Lbase2_64_avx

	test \$31,$len
	jz .Leven_avx

	push %rbp
.cfi_push %rbp
	mov %rsp,%rbp
	push %rbx
.cfi_push %rbx
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
.Lblocks_avx_body:

	mov $len,%r15 # reassign $len

	mov 0($ctx),$d1 # load hash value
	mov 8($ctx),$d2
	mov 16($ctx),$h2#d

	mov 24($ctx),$r0 # load r
64362306a36Sopenharmony_ci mov 32($ctx),$s1 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ci ################################# base 2^26 -> base 2^64 64662306a36Sopenharmony_ci mov $d1#d,$h0#d 64762306a36Sopenharmony_ci and \$`-1*(1<<31)`,$d1 64862306a36Sopenharmony_ci mov $d2,$r1 # borrow $r1 64962306a36Sopenharmony_ci mov $d2#d,$h1#d 65062306a36Sopenharmony_ci and \$`-1*(1<<31)`,$d2 65162306a36Sopenharmony_ci 65262306a36Sopenharmony_ci shr \$6,$d1 65362306a36Sopenharmony_ci shl \$52,$r1 65462306a36Sopenharmony_ci add $d1,$h0 65562306a36Sopenharmony_ci shr \$12,$h1 65662306a36Sopenharmony_ci shr \$18,$d2 65762306a36Sopenharmony_ci add $r1,$h0 65862306a36Sopenharmony_ci adc $d2,$h1 65962306a36Sopenharmony_ci 66062306a36Sopenharmony_ci mov $h2,$d1 66162306a36Sopenharmony_ci shl \$40,$d1 66262306a36Sopenharmony_ci shr \$24,$h2 66362306a36Sopenharmony_ci add $d1,$h1 66462306a36Sopenharmony_ci adc \$0,$h2 # can be partially reduced... 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci mov \$-4,$d2 # ... 
so reduce 66762306a36Sopenharmony_ci mov $h2,$d1 66862306a36Sopenharmony_ci and $h2,$d2 66962306a36Sopenharmony_ci shr \$2,$d1 67062306a36Sopenharmony_ci and \$3,$h2 67162306a36Sopenharmony_ci add $d2,$d1 # =*5 67262306a36Sopenharmony_ci add $d1,$h0 67362306a36Sopenharmony_ci adc \$0,$h1 67462306a36Sopenharmony_ci adc \$0,$h2 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci mov $s1,$r1 67762306a36Sopenharmony_ci mov $s1,%rax 67862306a36Sopenharmony_ci shr \$2,$s1 67962306a36Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci add 0($inp),$h0 # accumulate input 68262306a36Sopenharmony_ci adc 8($inp),$h1 68362306a36Sopenharmony_ci lea 16($inp),$inp 68462306a36Sopenharmony_ci adc $padbit,$h2 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci call __poly1305_block 68762306a36Sopenharmony_ci 68862306a36Sopenharmony_ci test $padbit,$padbit # if $padbit is zero, 68962306a36Sopenharmony_ci jz .Lstore_base2_64_avx # store hash in base 2^64 format 69062306a36Sopenharmony_ci 69162306a36Sopenharmony_ci ################################# base 2^64 -> base 2^26 69262306a36Sopenharmony_ci mov $h0,%rax 69362306a36Sopenharmony_ci mov $h0,%rdx 69462306a36Sopenharmony_ci shr \$52,$h0 69562306a36Sopenharmony_ci mov $h1,$r0 69662306a36Sopenharmony_ci mov $h1,$r1 69762306a36Sopenharmony_ci shr \$26,%rdx 69862306a36Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 69962306a36Sopenharmony_ci shl \$12,$r0 70062306a36Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 70162306a36Sopenharmony_ci shr \$14,$h1 70262306a36Sopenharmony_ci or $r0,$h0 70362306a36Sopenharmony_ci shl \$24,$h2 70462306a36Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 70562306a36Sopenharmony_ci shr \$40,$r1 70662306a36Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 70762306a36Sopenharmony_ci or $r1,$h2 # h[4] 70862306a36Sopenharmony_ci 70962306a36Sopenharmony_ci sub \$16,%r15 71062306a36Sopenharmony_ci jz .Lstore_base2_26_avx 71162306a36Sopenharmony_ci 71262306a36Sopenharmony_ci 
vmovd %rax#d,$H0 71362306a36Sopenharmony_ci vmovd %rdx#d,$H1 71462306a36Sopenharmony_ci vmovd $h0#d,$H2 71562306a36Sopenharmony_ci vmovd $h1#d,$H3 71662306a36Sopenharmony_ci vmovd $h2#d,$H4 71762306a36Sopenharmony_ci jmp .Lproceed_avx 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_ci.align 32 72062306a36Sopenharmony_ci.Lstore_base2_64_avx: 72162306a36Sopenharmony_ci mov $h0,0($ctx) 72262306a36Sopenharmony_ci mov $h1,8($ctx) 72362306a36Sopenharmony_ci mov $h2,16($ctx) # note that is_base2_26 is zeroed 72462306a36Sopenharmony_ci jmp .Ldone_avx 72562306a36Sopenharmony_ci 72662306a36Sopenharmony_ci.align 16 72762306a36Sopenharmony_ci.Lstore_base2_26_avx: 72862306a36Sopenharmony_ci mov %rax#d,0($ctx) # store hash value base 2^26 72962306a36Sopenharmony_ci mov %rdx#d,4($ctx) 73062306a36Sopenharmony_ci mov $h0#d,8($ctx) 73162306a36Sopenharmony_ci mov $h1#d,12($ctx) 73262306a36Sopenharmony_ci mov $h2#d,16($ctx) 73362306a36Sopenharmony_ci.align 16 73462306a36Sopenharmony_ci.Ldone_avx: 73562306a36Sopenharmony_ci pop %r15 73662306a36Sopenharmony_ci.cfi_restore %r15 73762306a36Sopenharmony_ci pop %r14 73862306a36Sopenharmony_ci.cfi_restore %r14 73962306a36Sopenharmony_ci pop %r13 74062306a36Sopenharmony_ci.cfi_restore %r13 74162306a36Sopenharmony_ci pop %r12 74262306a36Sopenharmony_ci.cfi_restore %r12 74362306a36Sopenharmony_ci pop %rbx 74462306a36Sopenharmony_ci.cfi_restore %rbx 74562306a36Sopenharmony_ci pop %rbp 74662306a36Sopenharmony_ci.cfi_restore %rbp 74762306a36Sopenharmony_ci.Lno_data_avx: 74862306a36Sopenharmony_ci.Lblocks_avx_epilogue: 74962306a36Sopenharmony_ci RET 75062306a36Sopenharmony_ci.cfi_endproc 75162306a36Sopenharmony_ci 75262306a36Sopenharmony_ci.align 32 75362306a36Sopenharmony_ci.Lbase2_64_avx: 75462306a36Sopenharmony_ci.cfi_startproc 75562306a36Sopenharmony_ci push %rbp 75662306a36Sopenharmony_ci.cfi_push %rbp 75762306a36Sopenharmony_ci mov %rsp,%rbp 75862306a36Sopenharmony_ci push %rbx 75962306a36Sopenharmony_ci.cfi_push %rbx 
76062306a36Sopenharmony_ci push %r12 76162306a36Sopenharmony_ci.cfi_push %r12 76262306a36Sopenharmony_ci push %r13 76362306a36Sopenharmony_ci.cfi_push %r13 76462306a36Sopenharmony_ci push %r14 76562306a36Sopenharmony_ci.cfi_push %r14 76662306a36Sopenharmony_ci push %r15 76762306a36Sopenharmony_ci.cfi_push %r15 76862306a36Sopenharmony_ci.Lbase2_64_avx_body: 76962306a36Sopenharmony_ci 77062306a36Sopenharmony_ci mov $len,%r15 # reassign $len 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci mov 24($ctx),$r0 # load r 77362306a36Sopenharmony_ci mov 32($ctx),$s1 77462306a36Sopenharmony_ci 77562306a36Sopenharmony_ci mov 0($ctx),$h0 # load hash value 77662306a36Sopenharmony_ci mov 8($ctx),$h1 77762306a36Sopenharmony_ci mov 16($ctx),$h2#d 77862306a36Sopenharmony_ci 77962306a36Sopenharmony_ci mov $s1,$r1 78062306a36Sopenharmony_ci mov $s1,%rax 78162306a36Sopenharmony_ci shr \$2,$s1 78262306a36Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 78362306a36Sopenharmony_ci 78462306a36Sopenharmony_ci test \$31,$len 78562306a36Sopenharmony_ci jz .Linit_avx 78662306a36Sopenharmony_ci 78762306a36Sopenharmony_ci add 0($inp),$h0 # accumulate input 78862306a36Sopenharmony_ci adc 8($inp),$h1 78962306a36Sopenharmony_ci lea 16($inp),$inp 79062306a36Sopenharmony_ci adc $padbit,$h2 79162306a36Sopenharmony_ci sub \$16,%r15 79262306a36Sopenharmony_ci 79362306a36Sopenharmony_ci call __poly1305_block 79462306a36Sopenharmony_ci 79562306a36Sopenharmony_ci.Linit_avx: 79662306a36Sopenharmony_ci ################################# base 2^64 -> base 2^26 79762306a36Sopenharmony_ci mov $h0,%rax 79862306a36Sopenharmony_ci mov $h0,%rdx 79962306a36Sopenharmony_ci shr \$52,$h0 80062306a36Sopenharmony_ci mov $h1,$d1 80162306a36Sopenharmony_ci mov $h1,$d2 80262306a36Sopenharmony_ci shr \$26,%rdx 80362306a36Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 80462306a36Sopenharmony_ci shl \$12,$d1 80562306a36Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 80662306a36Sopenharmony_ci shr \$14,$h1 
80762306a36Sopenharmony_ci or $d1,$h0 80862306a36Sopenharmony_ci shl \$24,$h2 80962306a36Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 81062306a36Sopenharmony_ci shr \$40,$d2 81162306a36Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 81262306a36Sopenharmony_ci or $d2,$h2 # h[4] 81362306a36Sopenharmony_ci 81462306a36Sopenharmony_ci vmovd %rax#d,$H0 81562306a36Sopenharmony_ci vmovd %rdx#d,$H1 81662306a36Sopenharmony_ci vmovd $h0#d,$H2 81762306a36Sopenharmony_ci vmovd $h1#d,$H3 81862306a36Sopenharmony_ci vmovd $h2#d,$H4 81962306a36Sopenharmony_ci movl \$1,20($ctx) # set is_base2_26 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci call __poly1305_init_avx 82262306a36Sopenharmony_ci 82362306a36Sopenharmony_ci.Lproceed_avx: 82462306a36Sopenharmony_ci mov %r15,$len 82562306a36Sopenharmony_ci pop %r15 82662306a36Sopenharmony_ci.cfi_restore %r15 82762306a36Sopenharmony_ci pop %r14 82862306a36Sopenharmony_ci.cfi_restore %r14 82962306a36Sopenharmony_ci pop %r13 83062306a36Sopenharmony_ci.cfi_restore %r13 83162306a36Sopenharmony_ci pop %r12 83262306a36Sopenharmony_ci.cfi_restore %r12 83362306a36Sopenharmony_ci pop %rbx 83462306a36Sopenharmony_ci.cfi_restore %rbx 83562306a36Sopenharmony_ci pop %rbp 83662306a36Sopenharmony_ci.cfi_restore %rbp 83762306a36Sopenharmony_ci.Lbase2_64_avx_epilogue: 83862306a36Sopenharmony_ci jmp .Ldo_avx 83962306a36Sopenharmony_ci.cfi_endproc 84062306a36Sopenharmony_ci 84162306a36Sopenharmony_ci.align 32 84262306a36Sopenharmony_ci.Leven_avx: 84362306a36Sopenharmony_ci.cfi_startproc 84462306a36Sopenharmony_ci vmovd 4*0($ctx),$H0 # load hash value 84562306a36Sopenharmony_ci vmovd 4*1($ctx),$H1 84662306a36Sopenharmony_ci vmovd 4*2($ctx),$H2 84762306a36Sopenharmony_ci vmovd 4*3($ctx),$H3 84862306a36Sopenharmony_ci vmovd 4*4($ctx),$H4 84962306a36Sopenharmony_ci 85062306a36Sopenharmony_ci.Ldo_avx: 85162306a36Sopenharmony_ci___ 85262306a36Sopenharmony_ci$code.=<<___ if (!$win64); 85362306a36Sopenharmony_ci lea 8(%rsp),%r10 
85462306a36Sopenharmony_ci.cfi_def_cfa_register %r10 85562306a36Sopenharmony_ci and \$-32,%rsp 85662306a36Sopenharmony_ci sub \$-8,%rsp 85762306a36Sopenharmony_ci lea -0x58(%rsp),%r11 85862306a36Sopenharmony_ci sub \$0x178,%rsp 85962306a36Sopenharmony_ci___ 86062306a36Sopenharmony_ci$code.=<<___ if ($win64); 86162306a36Sopenharmony_ci lea -0xf8(%rsp),%r11 86262306a36Sopenharmony_ci sub \$0x218,%rsp 86362306a36Sopenharmony_ci vmovdqa %xmm6,0x50(%r11) 86462306a36Sopenharmony_ci vmovdqa %xmm7,0x60(%r11) 86562306a36Sopenharmony_ci vmovdqa %xmm8,0x70(%r11) 86662306a36Sopenharmony_ci vmovdqa %xmm9,0x80(%r11) 86762306a36Sopenharmony_ci vmovdqa %xmm10,0x90(%r11) 86862306a36Sopenharmony_ci vmovdqa %xmm11,0xa0(%r11) 86962306a36Sopenharmony_ci vmovdqa %xmm12,0xb0(%r11) 87062306a36Sopenharmony_ci vmovdqa %xmm13,0xc0(%r11) 87162306a36Sopenharmony_ci vmovdqa %xmm14,0xd0(%r11) 87262306a36Sopenharmony_ci vmovdqa %xmm15,0xe0(%r11) 87362306a36Sopenharmony_ci.Ldo_avx_body: 87462306a36Sopenharmony_ci___ 87562306a36Sopenharmony_ci$code.=<<___; 87662306a36Sopenharmony_ci sub \$64,$len 87762306a36Sopenharmony_ci lea -32($inp),%rax 87862306a36Sopenharmony_ci cmovc %rax,$inp 87962306a36Sopenharmony_ci 88062306a36Sopenharmony_ci vmovdqu `16*3`($ctx),$D4 # preload r0^2 88162306a36Sopenharmony_ci lea `16*3+64`($ctx),$ctx # size optimization 88262306a36Sopenharmony_ci lea .Lconst(%rip),%rcx 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci ################################################################ 88562306a36Sopenharmony_ci # load input 88662306a36Sopenharmony_ci vmovdqu 16*2($inp),$T0 88762306a36Sopenharmony_ci vmovdqu 16*3($inp),$T1 88862306a36Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 88962306a36Sopenharmony_ci 89062306a36Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 89162306a36Sopenharmony_ci vpsrldq \$6,$T1,$T3 89262306a36Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 89362306a36Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 89462306a36Sopenharmony_ci vpunpcklqdq 
$T3,$T2,$T3 # 2:3 89562306a36Sopenharmony_ci 89662306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 89762306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 89862306a36Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 89962306a36Sopenharmony_ci vpsrlq \$4,$T3,$T2 90062306a36Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 90162306a36Sopenharmony_ci vpsrlq \$30,$T3,$T3 90262306a36Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 90362306a36Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 90462306a36Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci jbe .Lskip_loop_avx 90762306a36Sopenharmony_ci 90862306a36Sopenharmony_ci # expand and copy pre-calculated table to stack 90962306a36Sopenharmony_ci vmovdqu `16*1-64`($ctx),$D1 91062306a36Sopenharmony_ci vmovdqu `16*2-64`($ctx),$D2 91162306a36Sopenharmony_ci vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 91262306a36Sopenharmony_ci vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 91362306a36Sopenharmony_ci vmovdqa $D3,-0x90(%r11) 91462306a36Sopenharmony_ci vmovdqa $D0,0x00(%rsp) 91562306a36Sopenharmony_ci vpshufd \$0xEE,$D1,$D4 91662306a36Sopenharmony_ci vmovdqu `16*3-64`($ctx),$D0 91762306a36Sopenharmony_ci vpshufd \$0x44,$D1,$D1 91862306a36Sopenharmony_ci vmovdqa $D4,-0x80(%r11) 91962306a36Sopenharmony_ci vmovdqa $D1,0x10(%rsp) 92062306a36Sopenharmony_ci vpshufd \$0xEE,$D2,$D3 92162306a36Sopenharmony_ci vmovdqu `16*4-64`($ctx),$D1 92262306a36Sopenharmony_ci vpshufd \$0x44,$D2,$D2 92362306a36Sopenharmony_ci vmovdqa $D3,-0x70(%r11) 92462306a36Sopenharmony_ci vmovdqa $D2,0x20(%rsp) 92562306a36Sopenharmony_ci vpshufd \$0xEE,$D0,$D4 92662306a36Sopenharmony_ci vmovdqu `16*5-64`($ctx),$D2 92762306a36Sopenharmony_ci vpshufd \$0x44,$D0,$D0 92862306a36Sopenharmony_ci vmovdqa $D4,-0x60(%r11) 92962306a36Sopenharmony_ci vmovdqa $D0,0x30(%rsp) 93062306a36Sopenharmony_ci vpshufd \$0xEE,$D1,$D3 93162306a36Sopenharmony_ci vmovdqu `16*6-64`($ctx),$D0 93262306a36Sopenharmony_ci vpshufd \$0x44,$D1,$D1 93362306a36Sopenharmony_ci vmovdqa 
$D3,-0x50(%r11) 93462306a36Sopenharmony_ci vmovdqa $D1,0x40(%rsp) 93562306a36Sopenharmony_ci vpshufd \$0xEE,$D2,$D4 93662306a36Sopenharmony_ci vmovdqu `16*7-64`($ctx),$D1 93762306a36Sopenharmony_ci vpshufd \$0x44,$D2,$D2 93862306a36Sopenharmony_ci vmovdqa $D4,-0x40(%r11) 93962306a36Sopenharmony_ci vmovdqa $D2,0x50(%rsp) 94062306a36Sopenharmony_ci vpshufd \$0xEE,$D0,$D3 94162306a36Sopenharmony_ci vmovdqu `16*8-64`($ctx),$D2 94262306a36Sopenharmony_ci vpshufd \$0x44,$D0,$D0 94362306a36Sopenharmony_ci vmovdqa $D3,-0x30(%r11) 94462306a36Sopenharmony_ci vmovdqa $D0,0x60(%rsp) 94562306a36Sopenharmony_ci vpshufd \$0xEE,$D1,$D4 94662306a36Sopenharmony_ci vpshufd \$0x44,$D1,$D1 94762306a36Sopenharmony_ci vmovdqa $D4,-0x20(%r11) 94862306a36Sopenharmony_ci vmovdqa $D1,0x70(%rsp) 94962306a36Sopenharmony_ci vpshufd \$0xEE,$D2,$D3 95062306a36Sopenharmony_ci vmovdqa 0x00(%rsp),$D4 # preload r0^2 95162306a36Sopenharmony_ci vpshufd \$0x44,$D2,$D2 95262306a36Sopenharmony_ci vmovdqa $D3,-0x10(%r11) 95362306a36Sopenharmony_ci vmovdqa $D2,0x80(%rsp) 95462306a36Sopenharmony_ci 95562306a36Sopenharmony_ci jmp .Loop_avx 95662306a36Sopenharmony_ci 95762306a36Sopenharmony_ci.align 32 95862306a36Sopenharmony_ci.Loop_avx: 95962306a36Sopenharmony_ci ################################################################ 96062306a36Sopenharmony_ci # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 96162306a36Sopenharmony_ci # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 96262306a36Sopenharmony_ci # \___________________/ 96362306a36Sopenharmony_ci # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 96462306a36Sopenharmony_ci # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 96562306a36Sopenharmony_ci # \___________________/ \____________________/ 96662306a36Sopenharmony_ci # 96762306a36Sopenharmony_ci # Note that we start with inp[2:3]*r^2. This is because it 96862306a36Sopenharmony_ci # doesn't depend on reduction in previous iteration. 
96962306a36Sopenharmony_ci ################################################################ 97062306a36Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 97162306a36Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 97262306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 97362306a36Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 97462306a36Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 97562306a36Sopenharmony_ci # 97662306a36Sopenharmony_ci # though note that $Tx and $Hx are "reversed" in this section, 97762306a36Sopenharmony_ci # and $D4 is preloaded with r0^2... 97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci vpmuludq $T0,$D4,$D0 # d0 = h0*r0 98062306a36Sopenharmony_ci vpmuludq $T1,$D4,$D1 # d1 = h1*r0 98162306a36Sopenharmony_ci vmovdqa $H2,0x20(%r11) # offload hash 98262306a36Sopenharmony_ci vpmuludq $T2,$D4,$D2 # d2 = h2*r0 98362306a36Sopenharmony_ci vmovdqa 0x10(%rsp),$H2 # r1^2 98462306a36Sopenharmony_ci vpmuludq $T3,$D4,$D3 # d3 = h3*r0 98562306a36Sopenharmony_ci vpmuludq $T4,$D4,$D4 # d4 = h4*r0 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci vmovdqa $H0,0x00(%r11) # 98862306a36Sopenharmony_ci vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 98962306a36Sopenharmony_ci vmovdqa $H1,0x10(%r11) # 99062306a36Sopenharmony_ci vpmuludq $T3,$H2,$H1 # h3*r1 99162306a36Sopenharmony_ci vpaddq $H0,$D0,$D0 # d0 += h4*s1 99262306a36Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h3*r1 99362306a36Sopenharmony_ci vmovdqa $H3,0x30(%r11) # 99462306a36Sopenharmony_ci vpmuludq $T2,$H2,$H0 # h2*r1 99562306a36Sopenharmony_ci vpmuludq $T1,$H2,$H1 # h1*r1 99662306a36Sopenharmony_ci vpaddq $H0,$D3,$D3 # d3 += h2*r1 99762306a36Sopenharmony_ci vmovdqa 0x30(%rsp),$H3 # r2^2 99862306a36Sopenharmony_ci vpaddq $H1,$D2,$D2 # d2 += h1*r1 99962306a36Sopenharmony_ci vmovdqa $H4,0x40(%r11) # 100062306a36Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r1 100162306a36Sopenharmony_ci vpmuludq $T2,$H3,$H0 # h2*r2 
100262306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h0*r1 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci vmovdqa 0x40(%rsp),$H4 # s2^2 100562306a36Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h2*r2 100662306a36Sopenharmony_ci vpmuludq $T1,$H3,$H1 # h1*r2 100762306a36Sopenharmony_ci vpmuludq $T0,$H3,$H3 # h0*r2 100862306a36Sopenharmony_ci vpaddq $H1,$D3,$D3 # d3 += h1*r2 100962306a36Sopenharmony_ci vmovdqa 0x50(%rsp),$H2 # r3^2 101062306a36Sopenharmony_ci vpaddq $H3,$D2,$D2 # d2 += h0*r2 101162306a36Sopenharmony_ci vpmuludq $T4,$H4,$H0 # h4*s2 101262306a36Sopenharmony_ci vpmuludq $T3,$H4,$H4 # h3*s2 101362306a36Sopenharmony_ci vpaddq $H0,$D1,$D1 # d1 += h4*s2 101462306a36Sopenharmony_ci vmovdqa 0x60(%rsp),$H3 # s3^2 101562306a36Sopenharmony_ci vpaddq $H4,$D0,$D0 # d0 += h3*s2 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci vmovdqa 0x80(%rsp),$H4 # s4^2 101862306a36Sopenharmony_ci vpmuludq $T1,$H2,$H1 # h1*r3 101962306a36Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r3 102062306a36Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h1*r3 102162306a36Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 102262306a36Sopenharmony_ci vpmuludq $T4,$H3,$H0 # h4*s3 102362306a36Sopenharmony_ci vpmuludq $T3,$H3,$H1 # h3*s3 102462306a36Sopenharmony_ci vpaddq $H0,$D2,$D2 # d2 += h4*s3 102562306a36Sopenharmony_ci vmovdqu 16*0($inp),$H0 # load input 102662306a36Sopenharmony_ci vpaddq $H1,$D1,$D1 # d1 += h3*s3 102762306a36Sopenharmony_ci vpmuludq $T2,$H3,$H3 # h2*s3 102862306a36Sopenharmony_ci vpmuludq $T2,$H4,$T2 # h2*s4 102962306a36Sopenharmony_ci vpaddq $H3,$D0,$D0 # d0 += h2*s3 103062306a36Sopenharmony_ci 103162306a36Sopenharmony_ci vmovdqu 16*1($inp),$H1 # 103262306a36Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h2*s4 103362306a36Sopenharmony_ci vpmuludq $T3,$H4,$T3 # h3*s4 103462306a36Sopenharmony_ci vpmuludq $T4,$H4,$T4 # h4*s4 103562306a36Sopenharmony_ci vpsrldq \$6,$H0,$H2 # splat input 103662306a36Sopenharmony_ci vpaddq $T3,$D2,$D2 # d2 += h3*s4 103762306a36Sopenharmony_ci 
vpaddq $T4,$D3,$D3 # d3 += h4*s4 103862306a36Sopenharmony_ci vpsrldq \$6,$H1,$H3 # 103962306a36Sopenharmony_ci vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 104062306a36Sopenharmony_ci vpmuludq $T1,$H4,$T0 # h1*s4 104162306a36Sopenharmony_ci vpunpckhqdq $H1,$H0,$H4 # 4 104262306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h0*r4 104362306a36Sopenharmony_ci vmovdqa -0x90(%r11),$T4 # r0^4 104462306a36Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h1*s4 104562306a36Sopenharmony_ci 104662306a36Sopenharmony_ci vpunpcklqdq $H1,$H0,$H0 # 0:1 104762306a36Sopenharmony_ci vpunpcklqdq $H3,$H2,$H3 # 2:3 104862306a36Sopenharmony_ci 104962306a36Sopenharmony_ci #vpsrlq \$40,$H4,$H4 # 4 105062306a36Sopenharmony_ci vpsrldq \$`40/8`,$H4,$H4 # 4 105162306a36Sopenharmony_ci vpsrlq \$26,$H0,$H1 105262306a36Sopenharmony_ci vpand $MASK,$H0,$H0 # 0 105362306a36Sopenharmony_ci vpsrlq \$4,$H3,$H2 105462306a36Sopenharmony_ci vpand $MASK,$H1,$H1 # 1 105562306a36Sopenharmony_ci vpand 0(%rcx),$H4,$H4 # .Lmask24 105662306a36Sopenharmony_ci vpsrlq \$30,$H3,$H3 105762306a36Sopenharmony_ci vpand $MASK,$H2,$H2 # 2 105862306a36Sopenharmony_ci vpand $MASK,$H3,$H3 # 3 105962306a36Sopenharmony_ci vpor 32(%rcx),$H4,$H4 # padbit, yes, always 106062306a36Sopenharmony_ci 106162306a36Sopenharmony_ci vpaddq 0x00(%r11),$H0,$H0 # add hash value 106262306a36Sopenharmony_ci vpaddq 0x10(%r11),$H1,$H1 106362306a36Sopenharmony_ci vpaddq 0x20(%r11),$H2,$H2 106462306a36Sopenharmony_ci vpaddq 0x30(%r11),$H3,$H3 106562306a36Sopenharmony_ci vpaddq 0x40(%r11),$H4,$H4 106662306a36Sopenharmony_ci 106762306a36Sopenharmony_ci lea 16*2($inp),%rax 106862306a36Sopenharmony_ci lea 16*4($inp),$inp 106962306a36Sopenharmony_ci sub \$64,$len 107062306a36Sopenharmony_ci cmovc %rax,$inp 107162306a36Sopenharmony_ci 107262306a36Sopenharmony_ci ################################################################ 107362306a36Sopenharmony_ci # Now we accumulate (inp[0:1]+hash)*r^4 107462306a36Sopenharmony_ci 
################################################################ 107562306a36Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 107662306a36Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 107762306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 107862306a36Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 107962306a36Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 108062306a36Sopenharmony_ci 108162306a36Sopenharmony_ci vpmuludq $H0,$T4,$T0 # h0*r0 108262306a36Sopenharmony_ci vpmuludq $H1,$T4,$T1 # h1*r0 108362306a36Sopenharmony_ci vpaddq $T0,$D0,$D0 108462306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 108562306a36Sopenharmony_ci vmovdqa -0x80(%r11),$T2 # r1^4 108662306a36Sopenharmony_ci vpmuludq $H2,$T4,$T0 # h2*r0 108762306a36Sopenharmony_ci vpmuludq $H3,$T4,$T1 # h3*r0 108862306a36Sopenharmony_ci vpaddq $T0,$D2,$D2 108962306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 109062306a36Sopenharmony_ci vpmuludq $H4,$T4,$T4 # h4*r0 109162306a36Sopenharmony_ci vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 109262306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 109362306a36Sopenharmony_ci 109462306a36Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h4*s1 109562306a36Sopenharmony_ci vpmuludq $H2,$T2,$T1 # h2*r1 109662306a36Sopenharmony_ci vpmuludq $H3,$T2,$T0 # h3*r1 109762306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h2*r1 109862306a36Sopenharmony_ci vmovdqa -0x60(%r11),$T3 # r2^4 109962306a36Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h3*r1 110062306a36Sopenharmony_ci vpmuludq $H1,$T2,$T1 # h1*r1 110162306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r1 110262306a36Sopenharmony_ci vpaddq $T1,$D2,$D2 # d2 += h1*r1 110362306a36Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h0*r1 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci vmovdqa -0x50(%r11),$T4 # s2^4 110662306a36Sopenharmony_ci vpmuludq $H2,$T3,$T0 # h2*r2 110762306a36Sopenharmony_ci vpmuludq $H1,$T3,$T1 # h1*r2 110862306a36Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += 
h2*r2 110962306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h1*r2 111062306a36Sopenharmony_ci vmovdqa -0x40(%r11),$T2 # r3^4 111162306a36Sopenharmony_ci vpmuludq $H0,$T3,$T3 # h0*r2 111262306a36Sopenharmony_ci vpmuludq $H4,$T4,$T0 # h4*s2 111362306a36Sopenharmony_ci vpaddq $T3,$D2,$D2 # d2 += h0*r2 111462306a36Sopenharmony_ci vpaddq $T0,$D1,$D1 # d1 += h4*s2 111562306a36Sopenharmony_ci vmovdqa -0x30(%r11),$T3 # s3^4 111662306a36Sopenharmony_ci vpmuludq $H3,$T4,$T4 # h3*s2 111762306a36Sopenharmony_ci vpmuludq $H1,$T2,$T1 # h1*r3 111862306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 111962306a36Sopenharmony_ci 112062306a36Sopenharmony_ci vmovdqa -0x10(%r11),$T4 # s4^4 112162306a36Sopenharmony_ci vpaddq $T1,$D4,$D4 # d4 += h1*r3 112262306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r3 112362306a36Sopenharmony_ci vpmuludq $H4,$T3,$T0 # h4*s3 112462306a36Sopenharmony_ci vpaddq $T2,$D3,$D3 # d3 += h0*r3 112562306a36Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h4*s3 112662306a36Sopenharmony_ci vmovdqu 16*2($inp),$T0 # load input 112762306a36Sopenharmony_ci vpmuludq $H3,$T3,$T2 # h3*s3 112862306a36Sopenharmony_ci vpmuludq $H2,$T3,$T3 # h2*s3 112962306a36Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h3*s3 113062306a36Sopenharmony_ci vmovdqu 16*3($inp),$T1 # 113162306a36Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h2*s3 113262306a36Sopenharmony_ci 113362306a36Sopenharmony_ci vpmuludq $H2,$T4,$H2 # h2*s4 113462306a36Sopenharmony_ci vpmuludq $H3,$T4,$H3 # h3*s4 113562306a36Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 113662306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h2*s4 113762306a36Sopenharmony_ci vpmuludq $H4,$T4,$H4 # h4*s4 113862306a36Sopenharmony_ci vpsrldq \$6,$T1,$T3 # 113962306a36Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 114062306a36Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 114162306a36Sopenharmony_ci vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 114262306a36Sopenharmony_ci vpmuludq $H1,$T4,$H0 114362306a36Sopenharmony_ci vpunpckhqdq 
$T1,$T0,$T4 # 4 114462306a36Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 114562306a36Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 114862306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T3 # 2:3 114962306a36Sopenharmony_ci 115062306a36Sopenharmony_ci #vpsrlq \$40,$T4,$T4 # 4 115162306a36Sopenharmony_ci vpsrldq \$`40/8`,$T4,$T4 # 4 115262306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 115362306a36Sopenharmony_ci vmovdqa 0x00(%rsp),$D4 # preload r0^2 115462306a36Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 115562306a36Sopenharmony_ci vpsrlq \$4,$T3,$T2 115662306a36Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 115762306a36Sopenharmony_ci vpand 0(%rcx),$T4,$T4 # .Lmask24 115862306a36Sopenharmony_ci vpsrlq \$30,$T3,$T3 115962306a36Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 116062306a36Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 116162306a36Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 116262306a36Sopenharmony_ci 116362306a36Sopenharmony_ci ################################################################ 116462306a36Sopenharmony_ci # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 116562306a36Sopenharmony_ci # and P. 
Schwabe 116662306a36Sopenharmony_ci 116762306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 116862306a36Sopenharmony_ci vpand $MASK,$H3,$H3 116962306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 117262306a36Sopenharmony_ci vpand $MASK,$H0,$H0 117362306a36Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 117462306a36Sopenharmony_ci 117562306a36Sopenharmony_ci vpsrlq \$26,$H4,$D0 117662306a36Sopenharmony_ci vpand $MASK,$H4,$H4 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci vpsrlq \$26,$H1,$D1 117962306a36Sopenharmony_ci vpand $MASK,$H1,$H1 118062306a36Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 118162306a36Sopenharmony_ci 118262306a36Sopenharmony_ci vpaddq $D0,$H0,$H0 118362306a36Sopenharmony_ci vpsllq \$2,$D0,$D0 118462306a36Sopenharmony_ci vpaddq $D0,$H0,$H0 # h4 -> h0 118562306a36Sopenharmony_ci 118662306a36Sopenharmony_ci vpsrlq \$26,$H2,$D2 118762306a36Sopenharmony_ci vpand $MASK,$H2,$H2 118862306a36Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 118962306a36Sopenharmony_ci 119062306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 119162306a36Sopenharmony_ci vpand $MASK,$H0,$H0 119262306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 119362306a36Sopenharmony_ci 119462306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 119562306a36Sopenharmony_ci vpand $MASK,$H3,$H3 119662306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 119762306a36Sopenharmony_ci 119862306a36Sopenharmony_ci ja .Loop_avx 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci.Lskip_loop_avx: 120162306a36Sopenharmony_ci ################################################################ 120262306a36Sopenharmony_ci # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 120562306a36Sopenharmony_ci add \$32,$len 120662306a36Sopenharmony_ci jnz .Long_tail_avx 120762306a36Sopenharmony_ci 120862306a36Sopenharmony_ci vpaddq $H2,$T2,$T2 
120962306a36Sopenharmony_ci vpaddq $H0,$T0,$T0 121062306a36Sopenharmony_ci vpaddq $H1,$T1,$T1 121162306a36Sopenharmony_ci vpaddq $H3,$T3,$T3 121262306a36Sopenharmony_ci vpaddq $H4,$T4,$T4 121362306a36Sopenharmony_ci 121462306a36Sopenharmony_ci.Long_tail_avx: 121562306a36Sopenharmony_ci vmovdqa $H2,0x20(%r11) 121662306a36Sopenharmony_ci vmovdqa $H0,0x00(%r11) 121762306a36Sopenharmony_ci vmovdqa $H1,0x10(%r11) 121862306a36Sopenharmony_ci vmovdqa $H3,0x30(%r11) 121962306a36Sopenharmony_ci vmovdqa $H4,0x40(%r11) 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 122262306a36Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 122362306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 122462306a36Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 122562306a36Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 122662306a36Sopenharmony_ci 122762306a36Sopenharmony_ci vpmuludq $T2,$D4,$D2 # d2 = h2*r0 122862306a36Sopenharmony_ci vpmuludq $T0,$D4,$D0 # d0 = h0*r0 122962306a36Sopenharmony_ci vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 123062306a36Sopenharmony_ci vpmuludq $T1,$D4,$D1 # d1 = h1*r0 123162306a36Sopenharmony_ci vpmuludq $T3,$D4,$D3 # d3 = h3*r0 123262306a36Sopenharmony_ci vpmuludq $T4,$D4,$D4 # d4 = h4*r0 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_ci vpmuludq $T3,$H2,$H0 # h3*r1 123562306a36Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h3*r1 123662306a36Sopenharmony_ci vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 123762306a36Sopenharmony_ci vpmuludq $T2,$H2,$H1 # h2*r1 123862306a36Sopenharmony_ci vpaddq $H1,$D3,$D3 # d3 += h2*r1 123962306a36Sopenharmony_ci vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 124062306a36Sopenharmony_ci vpmuludq $T1,$H2,$H0 # h1*r1 124162306a36Sopenharmony_ci vpaddq $H0,$D2,$D2 # d2 += h1*r1 124262306a36Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r1 124362306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h0*r1 
124462306a36Sopenharmony_ci vpmuludq $T4,$H3,$H3 # h4*s1 124562306a36Sopenharmony_ci vpaddq $H3,$D0,$D0 # d0 += h4*s1 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_ci vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 124862306a36Sopenharmony_ci vpmuludq $T2,$H4,$H1 # h2*r2 124962306a36Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h2*r2 125062306a36Sopenharmony_ci vpmuludq $T1,$H4,$H0 # h1*r2 125162306a36Sopenharmony_ci vpaddq $H0,$D3,$D3 # d3 += h1*r2 125262306a36Sopenharmony_ci vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 125362306a36Sopenharmony_ci vpmuludq $T0,$H4,$H4 # h0*r2 125462306a36Sopenharmony_ci vpaddq $H4,$D2,$D2 # d2 += h0*r2 125562306a36Sopenharmony_ci vpmuludq $T4,$H2,$H1 # h4*s2 125662306a36Sopenharmony_ci vpaddq $H1,$D1,$D1 # d1 += h4*s2 125762306a36Sopenharmony_ci vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 125862306a36Sopenharmony_ci vpmuludq $T3,$H2,$H2 # h3*s2 125962306a36Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h3*s2 126062306a36Sopenharmony_ci 126162306a36Sopenharmony_ci vpmuludq $T1,$H3,$H0 # h1*r3 126262306a36Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h1*r3 126362306a36Sopenharmony_ci vpmuludq $T0,$H3,$H3 # h0*r3 126462306a36Sopenharmony_ci vpaddq $H3,$D3,$D3 # d3 += h0*r3 126562306a36Sopenharmony_ci vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 126662306a36Sopenharmony_ci vpmuludq $T4,$H4,$H1 # h4*s3 126762306a36Sopenharmony_ci vpaddq $H1,$D2,$D2 # d2 += h4*s3 126862306a36Sopenharmony_ci vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 126962306a36Sopenharmony_ci vpmuludq $T3,$H4,$H0 # h3*s3 127062306a36Sopenharmony_ci vpaddq $H0,$D1,$D1 # d1 += h3*s3 127162306a36Sopenharmony_ci vpmuludq $T2,$H4,$H4 # h2*s3 127262306a36Sopenharmony_ci vpaddq $H4,$D0,$D0 # d0 += h2*s3 127362306a36Sopenharmony_ci 127462306a36Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r4 127562306a36Sopenharmony_ci vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 127662306a36Sopenharmony_ci vpmuludq $T4,$H3,$H1 # h4*s4 127762306a36Sopenharmony_ci vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 
127862306a36Sopenharmony_ci vpmuludq $T3,$H3,$H0 # h3*s4 127962306a36Sopenharmony_ci vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 128062306a36Sopenharmony_ci vpmuludq $T2,$H3,$H1 # h2*s4 128162306a36Sopenharmony_ci vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 128262306a36Sopenharmony_ci vpmuludq $T1,$H3,$H3 # h1*s4 128362306a36Sopenharmony_ci vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 128462306a36Sopenharmony_ci 128562306a36Sopenharmony_ci jz .Lshort_tail_avx 128662306a36Sopenharmony_ci 128762306a36Sopenharmony_ci vmovdqu 16*0($inp),$H0 # load input 128862306a36Sopenharmony_ci vmovdqu 16*1($inp),$H1 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci vpsrldq \$6,$H0,$H2 # splat input 129162306a36Sopenharmony_ci vpsrldq \$6,$H1,$H3 129262306a36Sopenharmony_ci vpunpckhqdq $H1,$H0,$H4 # 4 129362306a36Sopenharmony_ci vpunpcklqdq $H1,$H0,$H0 # 0:1 129462306a36Sopenharmony_ci vpunpcklqdq $H3,$H2,$H3 # 2:3 129562306a36Sopenharmony_ci 129662306a36Sopenharmony_ci vpsrlq \$40,$H4,$H4 # 4 129762306a36Sopenharmony_ci vpsrlq \$26,$H0,$H1 129862306a36Sopenharmony_ci vpand $MASK,$H0,$H0 # 0 129962306a36Sopenharmony_ci vpsrlq \$4,$H3,$H2 130062306a36Sopenharmony_ci vpand $MASK,$H1,$H1 # 1 130162306a36Sopenharmony_ci vpsrlq \$30,$H3,$H3 130262306a36Sopenharmony_ci vpand $MASK,$H2,$H2 # 2 130362306a36Sopenharmony_ci vpand $MASK,$H3,$H3 # 3 130462306a36Sopenharmony_ci vpor 32(%rcx),$H4,$H4 # padbit, yes, always 130562306a36Sopenharmony_ci 130662306a36Sopenharmony_ci vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 130762306a36Sopenharmony_ci vpaddq 0x00(%r11),$H0,$H0 130862306a36Sopenharmony_ci vpaddq 0x10(%r11),$H1,$H1 130962306a36Sopenharmony_ci vpaddq 0x20(%r11),$H2,$H2 131062306a36Sopenharmony_ci vpaddq 0x30(%r11),$H3,$H3 131162306a36Sopenharmony_ci vpaddq 0x40(%r11),$H4,$H4 131262306a36Sopenharmony_ci 131362306a36Sopenharmony_ci ################################################################ 131462306a36Sopenharmony_ci # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 
131562306a36Sopenharmony_ci 131662306a36Sopenharmony_ci vpmuludq $H0,$T4,$T0 # h0*r0 131762306a36Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h0*r0 131862306a36Sopenharmony_ci vpmuludq $H1,$T4,$T1 # h1*r0 131962306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h1*r0 132062306a36Sopenharmony_ci vpmuludq $H2,$T4,$T0 # h2*r0 132162306a36Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h2*r0 132262306a36Sopenharmony_ci vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 132362306a36Sopenharmony_ci vpmuludq $H3,$T4,$T1 # h3*r0 132462306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h3*r0 132562306a36Sopenharmony_ci vpmuludq $H4,$T4,$T4 # h4*r0 132662306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h4*r0 132762306a36Sopenharmony_ci 132862306a36Sopenharmony_ci vpmuludq $H3,$T2,$T0 # h3*r1 132962306a36Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h3*r1 133062306a36Sopenharmony_ci vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 133162306a36Sopenharmony_ci vpmuludq $H2,$T2,$T1 # h2*r1 133262306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h2*r1 133362306a36Sopenharmony_ci vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 133462306a36Sopenharmony_ci vpmuludq $H1,$T2,$T0 # h1*r1 133562306a36Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h1*r1 133662306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r1 133762306a36Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h0*r1 133862306a36Sopenharmony_ci vpmuludq $H4,$T3,$T3 # h4*s1 133962306a36Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h4*s1 134062306a36Sopenharmony_ci 134162306a36Sopenharmony_ci vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 134262306a36Sopenharmony_ci vpmuludq $H2,$T4,$T1 # h2*r2 134362306a36Sopenharmony_ci vpaddq $T1,$D4,$D4 # d4 += h2*r2 134462306a36Sopenharmony_ci vpmuludq $H1,$T4,$T0 # h1*r2 134562306a36Sopenharmony_ci vpaddq $T0,$D3,$D3 # d3 += h1*r2 134662306a36Sopenharmony_ci vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 134762306a36Sopenharmony_ci vpmuludq $H0,$T4,$T4 # h0*r2 134862306a36Sopenharmony_ci vpaddq $T4,$D2,$D2 # d2 += h0*r2 134962306a36Sopenharmony_ci vpmuludq 
$H4,$T2,$T1 # h4*s2 135062306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h4*s2 135162306a36Sopenharmony_ci vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 135262306a36Sopenharmony_ci vpmuludq $H3,$T2,$T2 # h3*s2 135362306a36Sopenharmony_ci vpaddq $T2,$D0,$D0 # d0 += h3*s2 135462306a36Sopenharmony_ci 135562306a36Sopenharmony_ci vpmuludq $H1,$T3,$T0 # h1*r3 135662306a36Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h1*r3 135762306a36Sopenharmony_ci vpmuludq $H0,$T3,$T3 # h0*r3 135862306a36Sopenharmony_ci vpaddq $T3,$D3,$D3 # d3 += h0*r3 135962306a36Sopenharmony_ci vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 136062306a36Sopenharmony_ci vpmuludq $H4,$T4,$T1 # h4*s3 136162306a36Sopenharmony_ci vpaddq $T1,$D2,$D2 # d2 += h4*s3 136262306a36Sopenharmony_ci vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 136362306a36Sopenharmony_ci vpmuludq $H3,$T4,$T0 # h3*s3 136462306a36Sopenharmony_ci vpaddq $T0,$D1,$D1 # d1 += h3*s3 136562306a36Sopenharmony_ci vpmuludq $H2,$T4,$T4 # h2*s3 136662306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h2*s3 136762306a36Sopenharmony_ci 136862306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r4 136962306a36Sopenharmony_ci vpaddq $T2,$D4,$D4 # d4 += h0*r4 137062306a36Sopenharmony_ci vpmuludq $H4,$T3,$T1 # h4*s4 137162306a36Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h4*s4 137262306a36Sopenharmony_ci vpmuludq $H3,$T3,$T0 # h3*s4 137362306a36Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h3*s4 137462306a36Sopenharmony_ci vpmuludq $H2,$T3,$T1 # h2*s4 137562306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h2*s4 137662306a36Sopenharmony_ci vpmuludq $H1,$T3,$T3 # h1*s4 137762306a36Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h1*s4 137862306a36Sopenharmony_ci 137962306a36Sopenharmony_ci.Lshort_tail_avx: 138062306a36Sopenharmony_ci ################################################################ 138162306a36Sopenharmony_ci # horizontal addition 138262306a36Sopenharmony_ci 138362306a36Sopenharmony_ci vpsrldq \$8,$D4,$T4 138462306a36Sopenharmony_ci vpsrldq \$8,$D3,$T3 
138562306a36Sopenharmony_ci vpsrldq \$8,$D1,$T1 138662306a36Sopenharmony_ci vpsrldq \$8,$D0,$T0 138762306a36Sopenharmony_ci vpsrldq \$8,$D2,$T2 138862306a36Sopenharmony_ci vpaddq $T3,$D3,$D3 138962306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 139062306a36Sopenharmony_ci vpaddq $T0,$D0,$D0 139162306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 139262306a36Sopenharmony_ci vpaddq $T2,$D2,$D2 139362306a36Sopenharmony_ci 139462306a36Sopenharmony_ci ################################################################ 139562306a36Sopenharmony_ci # lazy reduction 139662306a36Sopenharmony_ci 139762306a36Sopenharmony_ci vpsrlq \$26,$D3,$H3 139862306a36Sopenharmony_ci vpand $MASK,$D3,$D3 139962306a36Sopenharmony_ci vpaddq $H3,$D4,$D4 # h3 -> h4 140062306a36Sopenharmony_ci 140162306a36Sopenharmony_ci vpsrlq \$26,$D0,$H0 140262306a36Sopenharmony_ci vpand $MASK,$D0,$D0 140362306a36Sopenharmony_ci vpaddq $H0,$D1,$D1 # h0 -> h1 140462306a36Sopenharmony_ci 140562306a36Sopenharmony_ci vpsrlq \$26,$D4,$H4 140662306a36Sopenharmony_ci vpand $MASK,$D4,$D4 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci vpsrlq \$26,$D1,$H1 140962306a36Sopenharmony_ci vpand $MASK,$D1,$D1 141062306a36Sopenharmony_ci vpaddq $H1,$D2,$D2 # h1 -> h2 141162306a36Sopenharmony_ci 141262306a36Sopenharmony_ci vpaddq $H4,$D0,$D0 141362306a36Sopenharmony_ci vpsllq \$2,$H4,$H4 141462306a36Sopenharmony_ci vpaddq $H4,$D0,$D0 # h4 -> h0 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci vpsrlq \$26,$D2,$H2 141762306a36Sopenharmony_ci vpand $MASK,$D2,$D2 141862306a36Sopenharmony_ci vpaddq $H2,$D3,$D3 # h2 -> h3 141962306a36Sopenharmony_ci 142062306a36Sopenharmony_ci vpsrlq \$26,$D0,$H0 142162306a36Sopenharmony_ci vpand $MASK,$D0,$D0 142262306a36Sopenharmony_ci vpaddq $H0,$D1,$D1 # h0 -> h1 142362306a36Sopenharmony_ci 142462306a36Sopenharmony_ci vpsrlq \$26,$D3,$H3 142562306a36Sopenharmony_ci vpand $MASK,$D3,$D3 142662306a36Sopenharmony_ci vpaddq $H3,$D4,$D4 # h3 -> h4 142762306a36Sopenharmony_ci 142862306a36Sopenharmony_ci 
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 142962306a36Sopenharmony_ci vmovd $D1,`4*1-48-64`($ctx) 143062306a36Sopenharmony_ci vmovd $D2,`4*2-48-64`($ctx) 143162306a36Sopenharmony_ci vmovd $D3,`4*3-48-64`($ctx) 143262306a36Sopenharmony_ci vmovd $D4,`4*4-48-64`($ctx) 143362306a36Sopenharmony_ci___ 143462306a36Sopenharmony_ci$code.=<<___ if ($win64); 143562306a36Sopenharmony_ci vmovdqa 0x50(%r11),%xmm6 143662306a36Sopenharmony_ci vmovdqa 0x60(%r11),%xmm7 143762306a36Sopenharmony_ci vmovdqa 0x70(%r11),%xmm8 143862306a36Sopenharmony_ci vmovdqa 0x80(%r11),%xmm9 143962306a36Sopenharmony_ci vmovdqa 0x90(%r11),%xmm10 144062306a36Sopenharmony_ci vmovdqa 0xa0(%r11),%xmm11 144162306a36Sopenharmony_ci vmovdqa 0xb0(%r11),%xmm12 144262306a36Sopenharmony_ci vmovdqa 0xc0(%r11),%xmm13 144362306a36Sopenharmony_ci vmovdqa 0xd0(%r11),%xmm14 144462306a36Sopenharmony_ci vmovdqa 0xe0(%r11),%xmm15 144562306a36Sopenharmony_ci lea 0xf8(%r11),%rsp 144662306a36Sopenharmony_ci.Ldo_avx_epilogue: 144762306a36Sopenharmony_ci___ 144862306a36Sopenharmony_ci$code.=<<___ if (!$win64); 144962306a36Sopenharmony_ci lea -8(%r10),%rsp 145062306a36Sopenharmony_ci.cfi_def_cfa_register %rsp 145162306a36Sopenharmony_ci___ 145262306a36Sopenharmony_ci$code.=<<___; 145362306a36Sopenharmony_ci vzeroupper 145462306a36Sopenharmony_ci RET 145562306a36Sopenharmony_ci.cfi_endproc 145662306a36Sopenharmony_ci___ 145762306a36Sopenharmony_ci&end_function("poly1305_blocks_avx"); 145862306a36Sopenharmony_ci 145962306a36Sopenharmony_ci&declare_function("poly1305_emit_avx", 32, 3); 146062306a36Sopenharmony_ci$code.=<<___; 146162306a36Sopenharmony_ci cmpl \$0,20($ctx) # is_base2_26? 
146262306a36Sopenharmony_ci je .Lemit 146362306a36Sopenharmony_ci 146462306a36Sopenharmony_ci mov 0($ctx),%eax # load hash value base 2^26 146562306a36Sopenharmony_ci mov 4($ctx),%ecx 146662306a36Sopenharmony_ci mov 8($ctx),%r8d 146762306a36Sopenharmony_ci mov 12($ctx),%r11d 146862306a36Sopenharmony_ci mov 16($ctx),%r10d 146962306a36Sopenharmony_ci 147062306a36Sopenharmony_ci shl \$26,%rcx # base 2^26 -> base 2^64 147162306a36Sopenharmony_ci mov %r8,%r9 147262306a36Sopenharmony_ci shl \$52,%r8 147362306a36Sopenharmony_ci add %rcx,%rax 147462306a36Sopenharmony_ci shr \$12,%r9 147562306a36Sopenharmony_ci add %rax,%r8 # h0 147662306a36Sopenharmony_ci adc \$0,%r9 147762306a36Sopenharmony_ci 147862306a36Sopenharmony_ci shl \$14,%r11 147962306a36Sopenharmony_ci mov %r10,%rax 148062306a36Sopenharmony_ci shr \$24,%r10 148162306a36Sopenharmony_ci add %r11,%r9 148262306a36Sopenharmony_ci shl \$40,%rax 148362306a36Sopenharmony_ci add %rax,%r9 # h1 148462306a36Sopenharmony_ci adc \$0,%r10 # h2 148562306a36Sopenharmony_ci 148662306a36Sopenharmony_ci mov %r10,%rax # could be partially reduced, so reduce 148762306a36Sopenharmony_ci mov %r10,%rcx 148862306a36Sopenharmony_ci and \$3,%r10 148962306a36Sopenharmony_ci shr \$2,%rax 149062306a36Sopenharmony_ci and \$-4,%rcx 149162306a36Sopenharmony_ci add %rcx,%rax 149262306a36Sopenharmony_ci add %rax,%r8 149362306a36Sopenharmony_ci adc \$0,%r9 149462306a36Sopenharmony_ci adc \$0,%r10 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci mov %r8,%rax 149762306a36Sopenharmony_ci add \$5,%r8 # compare to modulus 149862306a36Sopenharmony_ci mov %r9,%rcx 149962306a36Sopenharmony_ci adc \$0,%r9 150062306a36Sopenharmony_ci adc \$0,%r10 150162306a36Sopenharmony_ci shr \$2,%r10 # did 130-bit value overflow? 
150262306a36Sopenharmony_ci cmovnz %r8,%rax 150362306a36Sopenharmony_ci cmovnz %r9,%rcx 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci add 0($nonce),%rax # accumulate nonce 150662306a36Sopenharmony_ci adc 8($nonce),%rcx 150762306a36Sopenharmony_ci mov %rax,0($mac) # write result 150862306a36Sopenharmony_ci mov %rcx,8($mac) 150962306a36Sopenharmony_ci 151062306a36Sopenharmony_ci RET 151162306a36Sopenharmony_ci___ 151262306a36Sopenharmony_ci&end_function("poly1305_emit_avx"); 151362306a36Sopenharmony_ci 151462306a36Sopenharmony_ciif ($avx>1) { 151562306a36Sopenharmony_ci 151662306a36Sopenharmony_cimy ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 151762306a36Sopenharmony_ci map("%ymm$_",(0..15)); 151862306a36Sopenharmony_cimy $S4=$MASK; 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_cisub poly1305_blocks_avxN { 152162306a36Sopenharmony_ci my ($avx512) = @_; 152262306a36Sopenharmony_ci my $suffix = $avx512 ? "_avx512" : ""; 152362306a36Sopenharmony_ci$code.=<<___; 152462306a36Sopenharmony_ci.cfi_startproc 152562306a36Sopenharmony_ci mov 20($ctx),%r8d # is_base2_26 152662306a36Sopenharmony_ci cmp \$128,$len 152762306a36Sopenharmony_ci jae .Lblocks_avx2$suffix 152862306a36Sopenharmony_ci test %r8d,%r8d 152962306a36Sopenharmony_ci jz .Lblocks 153062306a36Sopenharmony_ci 153162306a36Sopenharmony_ci.Lblocks_avx2$suffix: 153262306a36Sopenharmony_ci and \$-16,$len 153362306a36Sopenharmony_ci jz .Lno_data_avx2$suffix 153462306a36Sopenharmony_ci 153562306a36Sopenharmony_ci vzeroupper 153662306a36Sopenharmony_ci 153762306a36Sopenharmony_ci test %r8d,%r8d 153862306a36Sopenharmony_ci jz .Lbase2_64_avx2$suffix 153962306a36Sopenharmony_ci 154062306a36Sopenharmony_ci test \$63,$len 154162306a36Sopenharmony_ci jz .Leven_avx2$suffix 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci push %rbp 154462306a36Sopenharmony_ci.cfi_push %rbp 154562306a36Sopenharmony_ci mov %rsp,%rbp 154662306a36Sopenharmony_ci push %rbx 
154762306a36Sopenharmony_ci.cfi_push %rbx 154862306a36Sopenharmony_ci push %r12 154962306a36Sopenharmony_ci.cfi_push %r12 155062306a36Sopenharmony_ci push %r13 155162306a36Sopenharmony_ci.cfi_push %r13 155262306a36Sopenharmony_ci push %r14 155362306a36Sopenharmony_ci.cfi_push %r14 155462306a36Sopenharmony_ci push %r15 155562306a36Sopenharmony_ci.cfi_push %r15 155662306a36Sopenharmony_ci.Lblocks_avx2_body$suffix: 155762306a36Sopenharmony_ci 155862306a36Sopenharmony_ci mov $len,%r15 # reassign $len 155962306a36Sopenharmony_ci 156062306a36Sopenharmony_ci mov 0($ctx),$d1 # load hash value 156162306a36Sopenharmony_ci mov 8($ctx),$d2 156262306a36Sopenharmony_ci mov 16($ctx),$h2#d 156362306a36Sopenharmony_ci 156462306a36Sopenharmony_ci mov 24($ctx),$r0 # load r 156562306a36Sopenharmony_ci mov 32($ctx),$s1 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_ci ################################# base 2^26 -> base 2^64 156862306a36Sopenharmony_ci mov $d1#d,$h0#d 156962306a36Sopenharmony_ci and \$`-1*(1<<31)`,$d1 157062306a36Sopenharmony_ci mov $d2,$r1 # borrow $r1 157162306a36Sopenharmony_ci mov $d2#d,$h1#d 157262306a36Sopenharmony_ci and \$`-1*(1<<31)`,$d2 157362306a36Sopenharmony_ci 157462306a36Sopenharmony_ci shr \$6,$d1 157562306a36Sopenharmony_ci shl \$52,$r1 157662306a36Sopenharmony_ci add $d1,$h0 157762306a36Sopenharmony_ci shr \$12,$h1 157862306a36Sopenharmony_ci shr \$18,$d2 157962306a36Sopenharmony_ci add $r1,$h0 158062306a36Sopenharmony_ci adc $d2,$h1 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci mov $h2,$d1 158362306a36Sopenharmony_ci shl \$40,$d1 158462306a36Sopenharmony_ci shr \$24,$h2 158562306a36Sopenharmony_ci add $d1,$h1 158662306a36Sopenharmony_ci adc \$0,$h2 # can be partially reduced... 158762306a36Sopenharmony_ci 158862306a36Sopenharmony_ci mov \$-4,$d2 # ... 
so reduce 158962306a36Sopenharmony_ci mov $h2,$d1 159062306a36Sopenharmony_ci and $h2,$d2 159162306a36Sopenharmony_ci shr \$2,$d1 159262306a36Sopenharmony_ci and \$3,$h2 159362306a36Sopenharmony_ci add $d2,$d1 # =*5 159462306a36Sopenharmony_ci add $d1,$h0 159562306a36Sopenharmony_ci adc \$0,$h1 159662306a36Sopenharmony_ci adc \$0,$h2 159762306a36Sopenharmony_ci 159862306a36Sopenharmony_ci mov $s1,$r1 159962306a36Sopenharmony_ci mov $s1,%rax 160062306a36Sopenharmony_ci shr \$2,$s1 160162306a36Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 160262306a36Sopenharmony_ci 160362306a36Sopenharmony_ci.Lbase2_26_pre_avx2$suffix: 160462306a36Sopenharmony_ci add 0($inp),$h0 # accumulate input 160562306a36Sopenharmony_ci adc 8($inp),$h1 160662306a36Sopenharmony_ci lea 16($inp),$inp 160762306a36Sopenharmony_ci adc $padbit,$h2 160862306a36Sopenharmony_ci sub \$16,%r15 160962306a36Sopenharmony_ci 161062306a36Sopenharmony_ci call __poly1305_block 161162306a36Sopenharmony_ci mov $r1,%rax 161262306a36Sopenharmony_ci 161362306a36Sopenharmony_ci test \$63,%r15 161462306a36Sopenharmony_ci jnz .Lbase2_26_pre_avx2$suffix 161562306a36Sopenharmony_ci 161662306a36Sopenharmony_ci test $padbit,$padbit # if $padbit is zero, 161762306a36Sopenharmony_ci jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 161862306a36Sopenharmony_ci 161962306a36Sopenharmony_ci ################################# base 2^64 -> base 2^26 162062306a36Sopenharmony_ci mov $h0,%rax 162162306a36Sopenharmony_ci mov $h0,%rdx 162262306a36Sopenharmony_ci shr \$52,$h0 162362306a36Sopenharmony_ci mov $h1,$r0 162462306a36Sopenharmony_ci mov $h1,$r1 162562306a36Sopenharmony_ci shr \$26,%rdx 162662306a36Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 162762306a36Sopenharmony_ci shl \$12,$r0 162862306a36Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 162962306a36Sopenharmony_ci shr \$14,$h1 163062306a36Sopenharmony_ci or $r0,$h0 163162306a36Sopenharmony_ci shl \$24,$h2 163262306a36Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 
163362306a36Sopenharmony_ci shr \$40,$r1 163462306a36Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 163562306a36Sopenharmony_ci or $r1,$h2 # h[4] 163662306a36Sopenharmony_ci 163762306a36Sopenharmony_ci test %r15,%r15 163862306a36Sopenharmony_ci jz .Lstore_base2_26_avx2$suffix 163962306a36Sopenharmony_ci 164062306a36Sopenharmony_ci vmovd %rax#d,%x#$H0 164162306a36Sopenharmony_ci vmovd %rdx#d,%x#$H1 164262306a36Sopenharmony_ci vmovd $h0#d,%x#$H2 164362306a36Sopenharmony_ci vmovd $h1#d,%x#$H3 164462306a36Sopenharmony_ci vmovd $h2#d,%x#$H4 164562306a36Sopenharmony_ci jmp .Lproceed_avx2$suffix 164662306a36Sopenharmony_ci 164762306a36Sopenharmony_ci.align 32 164862306a36Sopenharmony_ci.Lstore_base2_64_avx2$suffix: 164962306a36Sopenharmony_ci mov $h0,0($ctx) 165062306a36Sopenharmony_ci mov $h1,8($ctx) 165162306a36Sopenharmony_ci mov $h2,16($ctx) # note that is_base2_26 is zeroed 165262306a36Sopenharmony_ci jmp .Ldone_avx2$suffix 165362306a36Sopenharmony_ci 165462306a36Sopenharmony_ci.align 16 165562306a36Sopenharmony_ci.Lstore_base2_26_avx2$suffix: 165662306a36Sopenharmony_ci mov %rax#d,0($ctx) # store hash value base 2^26 165762306a36Sopenharmony_ci mov %rdx#d,4($ctx) 165862306a36Sopenharmony_ci mov $h0#d,8($ctx) 165962306a36Sopenharmony_ci mov $h1#d,12($ctx) 166062306a36Sopenharmony_ci mov $h2#d,16($ctx) 166162306a36Sopenharmony_ci.align 16 166262306a36Sopenharmony_ci.Ldone_avx2$suffix: 166362306a36Sopenharmony_ci pop %r15 166462306a36Sopenharmony_ci.cfi_restore %r15 166562306a36Sopenharmony_ci pop %r14 166662306a36Sopenharmony_ci.cfi_restore %r14 166762306a36Sopenharmony_ci pop %r13 166862306a36Sopenharmony_ci.cfi_restore %r13 166962306a36Sopenharmony_ci pop %r12 167062306a36Sopenharmony_ci.cfi_restore %r12 167162306a36Sopenharmony_ci pop %rbx 167262306a36Sopenharmony_ci.cfi_restore %rbx 167362306a36Sopenharmony_ci pop %rbp 167462306a36Sopenharmony_ci.cfi_restore %rbp 167562306a36Sopenharmony_ci.Lno_data_avx2$suffix: 
167662306a36Sopenharmony_ci.Lblocks_avx2_epilogue$suffix: 167762306a36Sopenharmony_ci RET 167862306a36Sopenharmony_ci.cfi_endproc 167962306a36Sopenharmony_ci 168062306a36Sopenharmony_ci.align 32 168162306a36Sopenharmony_ci.Lbase2_64_avx2$suffix: 168262306a36Sopenharmony_ci.cfi_startproc 168362306a36Sopenharmony_ci push %rbp 168462306a36Sopenharmony_ci.cfi_push %rbp 168562306a36Sopenharmony_ci mov %rsp,%rbp 168662306a36Sopenharmony_ci push %rbx 168762306a36Sopenharmony_ci.cfi_push %rbx 168862306a36Sopenharmony_ci push %r12 168962306a36Sopenharmony_ci.cfi_push %r12 169062306a36Sopenharmony_ci push %r13 169162306a36Sopenharmony_ci.cfi_push %r13 169262306a36Sopenharmony_ci push %r14 169362306a36Sopenharmony_ci.cfi_push %r14 169462306a36Sopenharmony_ci push %r15 169562306a36Sopenharmony_ci.cfi_push %r15 169662306a36Sopenharmony_ci.Lbase2_64_avx2_body$suffix: 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_ci mov $len,%r15 # reassign $len 169962306a36Sopenharmony_ci 170062306a36Sopenharmony_ci mov 24($ctx),$r0 # load r 170162306a36Sopenharmony_ci mov 32($ctx),$s1 170262306a36Sopenharmony_ci 170362306a36Sopenharmony_ci mov 0($ctx),$h0 # load hash value 170462306a36Sopenharmony_ci mov 8($ctx),$h1 170562306a36Sopenharmony_ci mov 16($ctx),$h2#d 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_ci mov $s1,$r1 170862306a36Sopenharmony_ci mov $s1,%rax 170962306a36Sopenharmony_ci shr \$2,$s1 171062306a36Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 171162306a36Sopenharmony_ci 171262306a36Sopenharmony_ci test \$63,$len 171362306a36Sopenharmony_ci jz .Linit_avx2$suffix 171462306a36Sopenharmony_ci 171562306a36Sopenharmony_ci.Lbase2_64_pre_avx2$suffix: 171662306a36Sopenharmony_ci add 0($inp),$h0 # accumulate input 171762306a36Sopenharmony_ci adc 8($inp),$h1 171862306a36Sopenharmony_ci lea 16($inp),$inp 171962306a36Sopenharmony_ci adc $padbit,$h2 172062306a36Sopenharmony_ci sub \$16,%r15 172162306a36Sopenharmony_ci 172262306a36Sopenharmony_ci call __poly1305_block 
172362306a36Sopenharmony_ci mov $r1,%rax 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_ci test \$63,%r15 172662306a36Sopenharmony_ci jnz .Lbase2_64_pre_avx2$suffix 172762306a36Sopenharmony_ci 172862306a36Sopenharmony_ci.Linit_avx2$suffix: 172962306a36Sopenharmony_ci ################################# base 2^64 -> base 2^26 173062306a36Sopenharmony_ci mov $h0,%rax 173162306a36Sopenharmony_ci mov $h0,%rdx 173262306a36Sopenharmony_ci shr \$52,$h0 173362306a36Sopenharmony_ci mov $h1,$d1 173462306a36Sopenharmony_ci mov $h1,$d2 173562306a36Sopenharmony_ci shr \$26,%rdx 173662306a36Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 173762306a36Sopenharmony_ci shl \$12,$d1 173862306a36Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 173962306a36Sopenharmony_ci shr \$14,$h1 174062306a36Sopenharmony_ci or $d1,$h0 174162306a36Sopenharmony_ci shl \$24,$h2 174262306a36Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 174362306a36Sopenharmony_ci shr \$40,$d2 174462306a36Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 174562306a36Sopenharmony_ci or $d2,$h2 # h[4] 174662306a36Sopenharmony_ci 174762306a36Sopenharmony_ci vmovd %rax#d,%x#$H0 174862306a36Sopenharmony_ci vmovd %rdx#d,%x#$H1 174962306a36Sopenharmony_ci vmovd $h0#d,%x#$H2 175062306a36Sopenharmony_ci vmovd $h1#d,%x#$H3 175162306a36Sopenharmony_ci vmovd $h2#d,%x#$H4 175262306a36Sopenharmony_ci movl \$1,20($ctx) # set is_base2_26 175362306a36Sopenharmony_ci 175462306a36Sopenharmony_ci call __poly1305_init_avx 175562306a36Sopenharmony_ci 175662306a36Sopenharmony_ci.Lproceed_avx2$suffix: 175762306a36Sopenharmony_ci mov %r15,$len # restore $len 175862306a36Sopenharmony_ci___ 175962306a36Sopenharmony_ci$code.=<<___ if (!$kernel); 176062306a36Sopenharmony_ci mov OPENSSL_ia32cap_P+8(%rip),%r9d 176162306a36Sopenharmony_ci mov \$`(1<<31|1<<30|1<<16)`,%r11d 176262306a36Sopenharmony_ci___ 176362306a36Sopenharmony_ci$code.=<<___; 176462306a36Sopenharmony_ci pop %r15 176562306a36Sopenharmony_ci.cfi_restore %r15 176662306a36Sopenharmony_ci pop %r14 
176762306a36Sopenharmony_ci.cfi_restore %r14 176862306a36Sopenharmony_ci pop %r13 176962306a36Sopenharmony_ci.cfi_restore %r13 177062306a36Sopenharmony_ci pop %r12 177162306a36Sopenharmony_ci.cfi_restore %r12 177262306a36Sopenharmony_ci pop %rbx 177362306a36Sopenharmony_ci.cfi_restore %rbx 177462306a36Sopenharmony_ci pop %rbp 177562306a36Sopenharmony_ci.cfi_restore %rbp 177662306a36Sopenharmony_ci.Lbase2_64_avx2_epilogue$suffix: 177762306a36Sopenharmony_ci jmp .Ldo_avx2$suffix 177862306a36Sopenharmony_ci.cfi_endproc 177962306a36Sopenharmony_ci 178062306a36Sopenharmony_ci.align 32 178162306a36Sopenharmony_ci.Leven_avx2$suffix: 178262306a36Sopenharmony_ci.cfi_startproc 178362306a36Sopenharmony_ci___ 178462306a36Sopenharmony_ci$code.=<<___ if (!$kernel); 178562306a36Sopenharmony_ci mov OPENSSL_ia32cap_P+8(%rip),%r9d 178662306a36Sopenharmony_ci___ 178762306a36Sopenharmony_ci$code.=<<___; 178862306a36Sopenharmony_ci vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 178962306a36Sopenharmony_ci vmovd 4*1($ctx),%x#$H1 179062306a36Sopenharmony_ci vmovd 4*2($ctx),%x#$H2 179162306a36Sopenharmony_ci vmovd 4*3($ctx),%x#$H3 179262306a36Sopenharmony_ci vmovd 4*4($ctx),%x#$H4 179362306a36Sopenharmony_ci 179462306a36Sopenharmony_ci.Ldo_avx2$suffix: 179562306a36Sopenharmony_ci___ 179662306a36Sopenharmony_ci$code.=<<___ if (!$kernel && $avx>2); 179762306a36Sopenharmony_ci cmp \$512,$len 179862306a36Sopenharmony_ci jb .Lskip_avx512 179962306a36Sopenharmony_ci and %r11d,%r9d 180062306a36Sopenharmony_ci test \$`1<<16`,%r9d # check for AVX512F 180162306a36Sopenharmony_ci jnz .Lblocks_avx512 180262306a36Sopenharmony_ci.Lskip_avx512$suffix: 180362306a36Sopenharmony_ci___ 180462306a36Sopenharmony_ci$code.=<<___ if ($avx > 2 && $avx512 && $kernel); 180562306a36Sopenharmony_ci cmp \$512,$len 180662306a36Sopenharmony_ci jae .Lblocks_avx512 180762306a36Sopenharmony_ci___ 180862306a36Sopenharmony_ci$code.=<<___ if (!$win64); 180962306a36Sopenharmony_ci lea 8(%rsp),%r10 
181062306a36Sopenharmony_ci.cfi_def_cfa_register %r10 181162306a36Sopenharmony_ci sub \$0x128,%rsp 181262306a36Sopenharmony_ci___ 181362306a36Sopenharmony_ci$code.=<<___ if ($win64); 181462306a36Sopenharmony_ci lea 8(%rsp),%r10 181562306a36Sopenharmony_ci sub \$0x1c8,%rsp 181662306a36Sopenharmony_ci vmovdqa %xmm6,-0xb0(%r10) 181762306a36Sopenharmony_ci vmovdqa %xmm7,-0xa0(%r10) 181862306a36Sopenharmony_ci vmovdqa %xmm8,-0x90(%r10) 181962306a36Sopenharmony_ci vmovdqa %xmm9,-0x80(%r10) 182062306a36Sopenharmony_ci vmovdqa %xmm10,-0x70(%r10) 182162306a36Sopenharmony_ci vmovdqa %xmm11,-0x60(%r10) 182262306a36Sopenharmony_ci vmovdqa %xmm12,-0x50(%r10) 182362306a36Sopenharmony_ci vmovdqa %xmm13,-0x40(%r10) 182462306a36Sopenharmony_ci vmovdqa %xmm14,-0x30(%r10) 182562306a36Sopenharmony_ci vmovdqa %xmm15,-0x20(%r10) 182662306a36Sopenharmony_ci.Ldo_avx2_body$suffix: 182762306a36Sopenharmony_ci___ 182862306a36Sopenharmony_ci$code.=<<___; 182962306a36Sopenharmony_ci lea .Lconst(%rip),%rcx 183062306a36Sopenharmony_ci lea 48+64($ctx),$ctx # size optimization 183162306a36Sopenharmony_ci vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci # expand and copy pre-calculated table to stack 183462306a36Sopenharmony_ci vmovdqu `16*0-64`($ctx),%x#$T2 183562306a36Sopenharmony_ci and \$-512,%rsp 183662306a36Sopenharmony_ci vmovdqu `16*1-64`($ctx),%x#$T3 183762306a36Sopenharmony_ci vmovdqu `16*2-64`($ctx),%x#$T4 183862306a36Sopenharmony_ci vmovdqu `16*3-64`($ctx),%x#$D0 183962306a36Sopenharmony_ci vmovdqu `16*4-64`($ctx),%x#$D1 184062306a36Sopenharmony_ci vmovdqu `16*5-64`($ctx),%x#$D2 184162306a36Sopenharmony_ci lea 0x90(%rsp),%rax # size optimization 184262306a36Sopenharmony_ci vmovdqu `16*6-64`($ctx),%x#$D3 184362306a36Sopenharmony_ci vpermd $T2,$T0,$T2 # 00003412 -> 14243444 184462306a36Sopenharmony_ci vmovdqu `16*7-64`($ctx),%x#$D4 184562306a36Sopenharmony_ci vpermd $T3,$T0,$T3 184662306a36Sopenharmony_ci vmovdqu `16*8-64`($ctx),%x#$MASK 
184762306a36Sopenharmony_ci vpermd $T4,$T0,$T4 184862306a36Sopenharmony_ci vmovdqa $T2,0x00(%rsp) 184962306a36Sopenharmony_ci vpermd $D0,$T0,$D0 185062306a36Sopenharmony_ci vmovdqa $T3,0x20-0x90(%rax) 185162306a36Sopenharmony_ci vpermd $D1,$T0,$D1 185262306a36Sopenharmony_ci vmovdqa $T4,0x40-0x90(%rax) 185362306a36Sopenharmony_ci vpermd $D2,$T0,$D2 185462306a36Sopenharmony_ci vmovdqa $D0,0x60-0x90(%rax) 185562306a36Sopenharmony_ci vpermd $D3,$T0,$D3 185662306a36Sopenharmony_ci vmovdqa $D1,0x80-0x90(%rax) 185762306a36Sopenharmony_ci vpermd $D4,$T0,$D4 185862306a36Sopenharmony_ci vmovdqa $D2,0xa0-0x90(%rax) 185962306a36Sopenharmony_ci vpermd $MASK,$T0,$MASK 186062306a36Sopenharmony_ci vmovdqa $D3,0xc0-0x90(%rax) 186162306a36Sopenharmony_ci vmovdqa $D4,0xe0-0x90(%rax) 186262306a36Sopenharmony_ci vmovdqa $MASK,0x100-0x90(%rax) 186362306a36Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 186462306a36Sopenharmony_ci 186562306a36Sopenharmony_ci ################################################################ 186662306a36Sopenharmony_ci # load input 186762306a36Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 186862306a36Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 186962306a36Sopenharmony_ci vinserti128 \$1,16*2($inp),$T0,$T0 187062306a36Sopenharmony_ci vinserti128 \$1,16*3($inp),$T1,$T1 187162306a36Sopenharmony_ci lea 16*4($inp),$inp 187262306a36Sopenharmony_ci 187362306a36Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 187462306a36Sopenharmony_ci vpsrldq \$6,$T1,$T3 187562306a36Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 187662306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T2 # 2:3 187762306a36Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 187862306a36Sopenharmony_ci 187962306a36Sopenharmony_ci vpsrlq \$30,$T2,$T3 188062306a36Sopenharmony_ci vpsrlq \$4,$T2,$T2 188162306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 188262306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 188362306a36Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 188462306a36Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 
188562306a36Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 188662306a36Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 188762306a36Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 188862306a36Sopenharmony_ci 188962306a36Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input 189062306a36Sopenharmony_ci sub \$64,$len 189162306a36Sopenharmony_ci jz .Ltail_avx2$suffix 189262306a36Sopenharmony_ci jmp .Loop_avx2$suffix 189362306a36Sopenharmony_ci 189462306a36Sopenharmony_ci.align 32 189562306a36Sopenharmony_ci.Loop_avx2$suffix: 189662306a36Sopenharmony_ci ################################################################ 189762306a36Sopenharmony_ci # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 189862306a36Sopenharmony_ci # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 189962306a36Sopenharmony_ci # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 190062306a36Sopenharmony_ci # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 190162306a36Sopenharmony_ci # \________/\__________/ 190262306a36Sopenharmony_ci ################################################################ 190362306a36Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 190462306a36Sopenharmony_ci vpaddq $H0,$T0,$H0 190562306a36Sopenharmony_ci vmovdqa `32*0`(%rsp),$T0 # r0^4 190662306a36Sopenharmony_ci vpaddq $H1,$T1,$H1 190762306a36Sopenharmony_ci vmovdqa `32*1`(%rsp),$T1 # r1^4 190862306a36Sopenharmony_ci vpaddq $H3,$T3,$H3 190962306a36Sopenharmony_ci vmovdqa `32*3`(%rsp),$T2 # r2^4 191062306a36Sopenharmony_ci vpaddq $H4,$T4,$H4 191162306a36Sopenharmony_ci vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 191262306a36Sopenharmony_ci vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 191362306a36Sopenharmony_ci 191462306a36Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 191562306a36Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 191662306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 191762306a36Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 191862306a36Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + 
h2*5*r3 + h1*5*r4 191962306a36Sopenharmony_ci # 192062306a36Sopenharmony_ci # however, as h2 is "chronologically" first one available pull 192162306a36Sopenharmony_ci # corresponding operations up, so it's 192262306a36Sopenharmony_ci # 192362306a36Sopenharmony_ci # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 192462306a36Sopenharmony_ci # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 192562306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 192662306a36Sopenharmony_ci # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 192762306a36Sopenharmony_ci # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 192862306a36Sopenharmony_ci 192962306a36Sopenharmony_ci vpmuludq $H2,$T0,$D2 # d2 = h2*r0 193062306a36Sopenharmony_ci vpmuludq $H2,$T1,$D3 # d3 = h2*r1 193162306a36Sopenharmony_ci vpmuludq $H2,$T2,$D4 # d4 = h2*r2 193262306a36Sopenharmony_ci vpmuludq $H2,$T3,$D0 # d0 = h2*s3 193362306a36Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 193462306a36Sopenharmony_ci 193562306a36Sopenharmony_ci vpmuludq $H0,$T1,$T4 # h0*r1 193662306a36Sopenharmony_ci vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 193762306a36Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h0*r1 193862306a36Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h1*r1 193962306a36Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*r1 194062306a36Sopenharmony_ci vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 194162306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h3*r1 194262306a36Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h4*s1 194362306a36Sopenharmony_ci vmovdqa `32*4-0x90`(%rax),$T1 # s2 194462306a36Sopenharmony_ci 194562306a36Sopenharmony_ci vpmuludq $H0,$T0,$T4 # h0*r0 194662306a36Sopenharmony_ci vpmuludq $H1,$T0,$H2 # h1*r0 194762306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h0*r0 194862306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h1*r0 194962306a36Sopenharmony_ci vpmuludq $H3,$T0,$T4 # h3*r0 195062306a36Sopenharmony_ci vpmuludq $H4,$T0,$H2 # h4*r0 195162306a36Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 # load input 
195262306a36Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h3*r0 195362306a36Sopenharmony_ci vpaddq $H2,$D4,$D4 # d4 += h4*r0 195462306a36Sopenharmony_ci vinserti128 \$1,16*2($inp),$T0,$T0 195562306a36Sopenharmony_ci 195662306a36Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*s2 195762306a36Sopenharmony_ci vpmuludq $H4,$T1,$H2 # h4*s2 195862306a36Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 195962306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 196062306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h4*s2 196162306a36Sopenharmony_ci vmovdqa `32*5-0x90`(%rax),$H2 # r3 196262306a36Sopenharmony_ci vpmuludq $H1,$T2,$T4 # h1*r2 196362306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r2 196462306a36Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h1*r2 196562306a36Sopenharmony_ci vpaddq $T2,$D2,$D2 # d2 += h0*r2 196662306a36Sopenharmony_ci vinserti128 \$1,16*3($inp),$T1,$T1 196762306a36Sopenharmony_ci lea 16*4($inp),$inp 196862306a36Sopenharmony_ci 196962306a36Sopenharmony_ci vpmuludq $H1,$H2,$T4 # h1*r3 197062306a36Sopenharmony_ci vpmuludq $H0,$H2,$H2 # h0*r3 197162306a36Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 197262306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h1*r3 197362306a36Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 197462306a36Sopenharmony_ci vpmuludq $H3,$T3,$T4 # h3*s3 197562306a36Sopenharmony_ci vpmuludq $H4,$T3,$H2 # h4*s3 197662306a36Sopenharmony_ci vpsrldq \$6,$T1,$T3 197762306a36Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h3*s3 197862306a36Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h4*s3 197962306a36Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 198062306a36Sopenharmony_ci 198162306a36Sopenharmony_ci vpmuludq $H3,$S4,$H3 # h3*s4 198262306a36Sopenharmony_ci vpmuludq $H4,$S4,$H4 # h4*s4 198362306a36Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 198462306a36Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 198562306a36Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 198662306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T3 # 2:3 198762306a36Sopenharmony_ci 
vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 198862306a36Sopenharmony_ci vpmuludq $H1,$S4,$H0 # h1*s4 198962306a36Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 199062306a36Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 199162306a36Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 199262306a36Sopenharmony_ci 199362306a36Sopenharmony_ci ################################################################ 199462306a36Sopenharmony_ci # lazy reduction (interleaved with tail of input splat) 199562306a36Sopenharmony_ci 199662306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 199762306a36Sopenharmony_ci vpand $MASK,$H3,$H3 199862306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 199962306a36Sopenharmony_ci 200062306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 200162306a36Sopenharmony_ci vpand $MASK,$H0,$H0 200262306a36Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 200362306a36Sopenharmony_ci 200462306a36Sopenharmony_ci vpsrlq \$26,$H4,$D4 200562306a36Sopenharmony_ci vpand $MASK,$H4,$H4 200662306a36Sopenharmony_ci 200762306a36Sopenharmony_ci vpsrlq \$4,$T3,$T2 200862306a36Sopenharmony_ci 200962306a36Sopenharmony_ci vpsrlq \$26,$H1,$D1 201062306a36Sopenharmony_ci vpand $MASK,$H1,$H1 201162306a36Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 201262306a36Sopenharmony_ci 201362306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 201462306a36Sopenharmony_ci vpsllq \$2,$D4,$D4 201562306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 201662306a36Sopenharmony_ci 201762306a36Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 201862306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 201962306a36Sopenharmony_ci 202062306a36Sopenharmony_ci vpsrlq \$26,$H2,$D2 202162306a36Sopenharmony_ci vpand $MASK,$H2,$H2 202262306a36Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 202362306a36Sopenharmony_ci 202462306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 # modulo-scheduled 202562306a36Sopenharmony_ci vpsrlq \$30,$T3,$T3 202662306a36Sopenharmony_ci 202762306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 202862306a36Sopenharmony_ci vpand $MASK,$H0,$H0 
202962306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 203262306a36Sopenharmony_ci 203362306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 203462306a36Sopenharmony_ci vpand $MASK,$H3,$H3 203562306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 203662306a36Sopenharmony_ci 203762306a36Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 203862306a36Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 203962306a36Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 204062306a36Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 204162306a36Sopenharmony_ci 204262306a36Sopenharmony_ci sub \$64,$len 204362306a36Sopenharmony_ci jnz .Loop_avx2$suffix 204462306a36Sopenharmony_ci 204562306a36Sopenharmony_ci .byte 0x66,0x90 204662306a36Sopenharmony_ci.Ltail_avx2$suffix: 204762306a36Sopenharmony_ci ################################################################ 204862306a36Sopenharmony_ci # while above multiplications were by r^4 in all lanes, in last 204962306a36Sopenharmony_ci # iteration we multiply least significant lane by r^4 and most 205062306a36Sopenharmony_ci # significant one by r, so copy of above except that references 205162306a36Sopenharmony_ci # to the precomputed table are displaced by 4... 
205262306a36Sopenharmony_ci 205362306a36Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 205462306a36Sopenharmony_ci vpaddq $H0,$T0,$H0 205562306a36Sopenharmony_ci vmovdqu `32*0+4`(%rsp),$T0 # r0^4 205662306a36Sopenharmony_ci vpaddq $H1,$T1,$H1 205762306a36Sopenharmony_ci vmovdqu `32*1+4`(%rsp),$T1 # r1^4 205862306a36Sopenharmony_ci vpaddq $H3,$T3,$H3 205962306a36Sopenharmony_ci vmovdqu `32*3+4`(%rsp),$T2 # r2^4 206062306a36Sopenharmony_ci vpaddq $H4,$T4,$H4 206162306a36Sopenharmony_ci vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 206262306a36Sopenharmony_ci vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 206362306a36Sopenharmony_ci 206462306a36Sopenharmony_ci vpmuludq $H2,$T0,$D2 # d2 = h2*r0 206562306a36Sopenharmony_ci vpmuludq $H2,$T1,$D3 # d3 = h2*r1 206662306a36Sopenharmony_ci vpmuludq $H2,$T2,$D4 # d4 = h2*r2 206762306a36Sopenharmony_ci vpmuludq $H2,$T3,$D0 # d0 = h2*s3 206862306a36Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 206962306a36Sopenharmony_ci 207062306a36Sopenharmony_ci vpmuludq $H0,$T1,$T4 # h0*r1 207162306a36Sopenharmony_ci vpmuludq $H1,$T1,$H2 # h1*r1 207262306a36Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h0*r1 207362306a36Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h1*r1 207462306a36Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*r1 207562306a36Sopenharmony_ci vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 207662306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h3*r1 207762306a36Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h4*s1 207862306a36Sopenharmony_ci 207962306a36Sopenharmony_ci vpmuludq $H0,$T0,$T4 # h0*r0 208062306a36Sopenharmony_ci vpmuludq $H1,$T0,$H2 # h1*r0 208162306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h0*r0 208262306a36Sopenharmony_ci vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 208362306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h1*r0 208462306a36Sopenharmony_ci vpmuludq $H3,$T0,$T4 # h3*r0 208562306a36Sopenharmony_ci vpmuludq $H4,$T0,$H2 # h4*r0 208662306a36Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h3*r0 208762306a36Sopenharmony_ci vpaddq 
$H2,$D4,$D4 # d4 += h4*r0 208862306a36Sopenharmony_ci 208962306a36Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*s2 209062306a36Sopenharmony_ci vpmuludq $H4,$T1,$H2 # h4*s2 209162306a36Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 209262306a36Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h4*s2 209362306a36Sopenharmony_ci vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 209462306a36Sopenharmony_ci vpmuludq $H1,$T2,$T4 # h1*r2 209562306a36Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r2 209662306a36Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h1*r2 209762306a36Sopenharmony_ci vpaddq $T2,$D2,$D2 # d2 += h0*r2 209862306a36Sopenharmony_ci 209962306a36Sopenharmony_ci vpmuludq $H1,$H2,$T4 # h1*r3 210062306a36Sopenharmony_ci vpmuludq $H0,$H2,$H2 # h0*r3 210162306a36Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h1*r3 210262306a36Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 210362306a36Sopenharmony_ci vpmuludq $H3,$T3,$T4 # h3*s3 210462306a36Sopenharmony_ci vpmuludq $H4,$T3,$H2 # h4*s3 210562306a36Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h3*s3 210662306a36Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h4*s3 210762306a36Sopenharmony_ci 210862306a36Sopenharmony_ci vpmuludq $H3,$S4,$H3 # h3*s4 210962306a36Sopenharmony_ci vpmuludq $H4,$S4,$H4 # h4*s4 211062306a36Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 211162306a36Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 211262306a36Sopenharmony_ci vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 211362306a36Sopenharmony_ci vpmuludq $H1,$S4,$H0 # h1*s4 211462306a36Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 211562306a36Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 211662306a36Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 211762306a36Sopenharmony_ci 211862306a36Sopenharmony_ci ################################################################ 211962306a36Sopenharmony_ci # horizontal addition 212062306a36Sopenharmony_ci 212162306a36Sopenharmony_ci vpsrldq \$8,$D1,$T1 212262306a36Sopenharmony_ci vpsrldq \$8,$H2,$T2
212362306a36Sopenharmony_ci vpsrldq \$8,$H3,$T3 212462306a36Sopenharmony_ci vpsrldq \$8,$H4,$T4 212562306a36Sopenharmony_ci vpsrldq \$8,$H0,$T0 212662306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 212762306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 212862306a36Sopenharmony_ci vpaddq $T3,$H3,$H3 212962306a36Sopenharmony_ci vpaddq $T4,$H4,$H4 213062306a36Sopenharmony_ci vpaddq $T0,$H0,$H0 213162306a36Sopenharmony_ci 213262306a36Sopenharmony_ci vpermq \$0x2,$H3,$T3 213362306a36Sopenharmony_ci vpermq \$0x2,$H4,$T4 213462306a36Sopenharmony_ci vpermq \$0x2,$H0,$T0 213562306a36Sopenharmony_ci vpermq \$0x2,$D1,$T1 213662306a36Sopenharmony_ci vpermq \$0x2,$H2,$T2 213762306a36Sopenharmony_ci vpaddq $T3,$H3,$H3 213862306a36Sopenharmony_ci vpaddq $T4,$H4,$H4 213962306a36Sopenharmony_ci vpaddq $T0,$H0,$H0 214062306a36Sopenharmony_ci vpaddq $T1,$D1,$D1 214162306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 214262306a36Sopenharmony_ci 214362306a36Sopenharmony_ci ################################################################ 214462306a36Sopenharmony_ci # lazy reduction 214562306a36Sopenharmony_ci 214662306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 214762306a36Sopenharmony_ci vpand $MASK,$H3,$H3 214862306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 214962306a36Sopenharmony_ci 215062306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 215162306a36Sopenharmony_ci vpand $MASK,$H0,$H0 215262306a36Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 215362306a36Sopenharmony_ci 215462306a36Sopenharmony_ci vpsrlq \$26,$H4,$D4 215562306a36Sopenharmony_ci vpand $MASK,$H4,$H4 215662306a36Sopenharmony_ci 215762306a36Sopenharmony_ci vpsrlq \$26,$H1,$D1 215862306a36Sopenharmony_ci vpand $MASK,$H1,$H1 215962306a36Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 216062306a36Sopenharmony_ci 216162306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 216262306a36Sopenharmony_ci vpsllq \$2,$D4,$D4 216362306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 216462306a36Sopenharmony_ci 216562306a36Sopenharmony_ci vpsrlq \$26,$H2,$D2 
216662306a36Sopenharmony_ci vpand $MASK,$H2,$H2 216762306a36Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 216862306a36Sopenharmony_ci 216962306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 217062306a36Sopenharmony_ci vpand $MASK,$H0,$H0 217162306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 217262306a36Sopenharmony_ci 217362306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 217462306a36Sopenharmony_ci vpand $MASK,$H3,$H3 217562306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 217662306a36Sopenharmony_ci 217762306a36Sopenharmony_ci vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 217862306a36Sopenharmony_ci vmovd %x#$H1,`4*1-48-64`($ctx) 217962306a36Sopenharmony_ci vmovd %x#$H2,`4*2-48-64`($ctx) 218062306a36Sopenharmony_ci vmovd %x#$H3,`4*3-48-64`($ctx) 218162306a36Sopenharmony_ci vmovd %x#$H4,`4*4-48-64`($ctx) 218262306a36Sopenharmony_ci___ 218362306a36Sopenharmony_ci$code.=<<___ if ($win64); 218462306a36Sopenharmony_ci vmovdqa -0xb0(%r10),%xmm6 218562306a36Sopenharmony_ci vmovdqa -0xa0(%r10),%xmm7 218662306a36Sopenharmony_ci vmovdqa -0x90(%r10),%xmm8 218762306a36Sopenharmony_ci vmovdqa -0x80(%r10),%xmm9 218862306a36Sopenharmony_ci vmovdqa -0x70(%r10),%xmm10 218962306a36Sopenharmony_ci vmovdqa -0x60(%r10),%xmm11 219062306a36Sopenharmony_ci vmovdqa -0x50(%r10),%xmm12 219162306a36Sopenharmony_ci vmovdqa -0x40(%r10),%xmm13 219262306a36Sopenharmony_ci vmovdqa -0x30(%r10),%xmm14 219362306a36Sopenharmony_ci vmovdqa -0x20(%r10),%xmm15 219462306a36Sopenharmony_ci lea -8(%r10),%rsp 219562306a36Sopenharmony_ci.Ldo_avx2_epilogue$suffix: 219662306a36Sopenharmony_ci___ 219762306a36Sopenharmony_ci$code.=<<___ if (!$win64); 219862306a36Sopenharmony_ci lea -8(%r10),%rsp 219962306a36Sopenharmony_ci.cfi_def_cfa_register %rsp 220062306a36Sopenharmony_ci___ 220162306a36Sopenharmony_ci$code.=<<___; 220262306a36Sopenharmony_ci vzeroupper 220362306a36Sopenharmony_ci RET 220462306a36Sopenharmony_ci.cfi_endproc 220562306a36Sopenharmony_ci___ 220662306a36Sopenharmony_ciif($avx > 2 && $avx512) { 
220762306a36Sopenharmony_cimy ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 220862306a36Sopenharmony_cimy ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 220962306a36Sopenharmony_cimy $PADBIT="%zmm30"; 221062306a36Sopenharmony_ci 221162306a36Sopenharmony_cimap(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 221262306a36Sopenharmony_cimap(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 221362306a36Sopenharmony_cimap(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 221462306a36Sopenharmony_cimap(s/%y/%z/,($MASK)); 221562306a36Sopenharmony_ci 221662306a36Sopenharmony_ci$code.=<<___; 221762306a36Sopenharmony_ci.cfi_startproc 221862306a36Sopenharmony_ci.Lblocks_avx512: 221962306a36Sopenharmony_ci mov \$15,%eax 222062306a36Sopenharmony_ci kmovw %eax,%k2 222162306a36Sopenharmony_ci___ 222262306a36Sopenharmony_ci$code.=<<___ if (!$win64); 222362306a36Sopenharmony_ci lea 8(%rsp),%r10 222462306a36Sopenharmony_ci.cfi_def_cfa_register %r10 222562306a36Sopenharmony_ci sub \$0x128,%rsp 222662306a36Sopenharmony_ci___ 222762306a36Sopenharmony_ci$code.=<<___ if ($win64); 222862306a36Sopenharmony_ci lea 8(%rsp),%r10 222962306a36Sopenharmony_ci sub \$0x1c8,%rsp 223062306a36Sopenharmony_ci vmovdqa %xmm6,-0xb0(%r10) 223162306a36Sopenharmony_ci vmovdqa %xmm7,-0xa0(%r10) 223262306a36Sopenharmony_ci vmovdqa %xmm8,-0x90(%r10) 223362306a36Sopenharmony_ci vmovdqa %xmm9,-0x80(%r10) 223462306a36Sopenharmony_ci vmovdqa %xmm10,-0x70(%r10) 223562306a36Sopenharmony_ci vmovdqa %xmm11,-0x60(%r10) 223662306a36Sopenharmony_ci vmovdqa %xmm12,-0x50(%r10) 223762306a36Sopenharmony_ci vmovdqa %xmm13,-0x40(%r10) 223862306a36Sopenharmony_ci vmovdqa %xmm14,-0x30(%r10) 223962306a36Sopenharmony_ci vmovdqa %xmm15,-0x20(%r10) 224062306a36Sopenharmony_ci.Ldo_avx512_body: 224162306a36Sopenharmony_ci___ 224262306a36Sopenharmony_ci$code.=<<___; 224362306a36Sopenharmony_ci lea .Lconst(%rip),%rcx 224462306a36Sopenharmony_ci lea 48+64($ctx),$ctx # size optimization 224562306a36Sopenharmony_ci vmovdqa 96(%rcx),%y#$T2 # 
.Lpermd_avx2 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_ci # expand pre-calculated table 224862306a36Sopenharmony_ci vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 224962306a36Sopenharmony_ci and \$-512,%rsp 225062306a36Sopenharmony_ci vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 225162306a36Sopenharmony_ci mov \$0x20,%rax 225262306a36Sopenharmony_ci vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 225362306a36Sopenharmony_ci vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 225462306a36Sopenharmony_ci vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 225562306a36Sopenharmony_ci vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 225662306a36Sopenharmony_ci vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 225762306a36Sopenharmony_ci vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 225862306a36Sopenharmony_ci vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} 225962306a36Sopenharmony_ci vpermd $D0,$T2,$R0 # 00003412 -> 14243444 226062306a36Sopenharmony_ci vpbroadcastq 64(%rcx),$MASK # .Lmask26 226162306a36Sopenharmony_ci vpermd $D1,$T2,$R1 226262306a36Sopenharmony_ci vpermd $T0,$T2,$S1 226362306a36Sopenharmony_ci vpermd $D2,$T2,$R2 226462306a36Sopenharmony_ci vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 226562306a36Sopenharmony_ci vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 226662306a36Sopenharmony_ci vpermd $T1,$T2,$S2 226762306a36Sopenharmony_ci vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 226862306a36Sopenharmony_ci vpsrlq \$32,$R1,$T1 226962306a36Sopenharmony_ci vpermd $D3,$T2,$R3 227062306a36Sopenharmony_ci vmovdqa64 $S1,0x40(%rsp){%k2} 227162306a36Sopenharmony_ci vpermd $T3,$T2,$S3 227262306a36Sopenharmony_ci vpermd $D4,$T2,$R4 227362306a36Sopenharmony_ci vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 227462306a36Sopenharmony_ci vpermd $T4,$T2,$S4 227562306a36Sopenharmony_ci vmovdqa64 $S2,0x80(%rsp){%k2} 227662306a36Sopenharmony_ci vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 227762306a36Sopenharmony_ci vmovdqa64 $S3,0xc0(%rsp){%k2} 227862306a36Sopenharmony_ci vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 
227962306a36Sopenharmony_ci vmovdqa64 $S4,0x100(%rsp){%k2} 228062306a36Sopenharmony_ci 228162306a36Sopenharmony_ci ################################################################ 228262306a36Sopenharmony_ci # calculate 5th through 8th powers of the key 228362306a36Sopenharmony_ci # 228462306a36Sopenharmony_ci # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 228562306a36Sopenharmony_ci # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 228662306a36Sopenharmony_ci # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 228762306a36Sopenharmony_ci # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 228862306a36Sopenharmony_ci # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 228962306a36Sopenharmony_ci 229062306a36Sopenharmony_ci vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 229162306a36Sopenharmony_ci vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 229262306a36Sopenharmony_ci vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 229362306a36Sopenharmony_ci vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 229462306a36Sopenharmony_ci vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 229562306a36Sopenharmony_ci vpsrlq \$32,$R2,$T2 229662306a36Sopenharmony_ci 229762306a36Sopenharmony_ci vpmuludq $T1,$S4,$M0 229862306a36Sopenharmony_ci vpmuludq $T1,$R0,$M1 229962306a36Sopenharmony_ci vpmuludq $T1,$R1,$M2 230062306a36Sopenharmony_ci vpmuludq $T1,$R2,$M3 230162306a36Sopenharmony_ci vpmuludq $T1,$R3,$M4 230262306a36Sopenharmony_ci vpsrlq \$32,$R3,$T3 230362306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 230462306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r1'*r0 230562306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r1'*r1 230662306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r1'*r2 230762306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r1'*r3 230862306a36Sopenharmony_ci 230962306a36Sopenharmony_ci vpmuludq $T2,$S3,$M0 231062306a36Sopenharmony_ci vpmuludq $T2,$S4,$M1 231162306a36Sopenharmony_ci vpmuludq $T2,$R1,$M3 231262306a36Sopenharmony_ci vpmuludq $T2,$R2,$M4 231362306a36Sopenharmony_ci vpmuludq $T2,$R0,$M2 
231462306a36Sopenharmony_ci vpsrlq \$32,$R4,$T4 231562306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 231662306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 231762306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r2'*r1 231862306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r2'*r2 231962306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r2'*r0 232062306a36Sopenharmony_ci 232162306a36Sopenharmony_ci vpmuludq $T3,$S2,$M0 232262306a36Sopenharmony_ci vpmuludq $T3,$R0,$M3 232362306a36Sopenharmony_ci vpmuludq $T3,$R1,$M4 232462306a36Sopenharmony_ci vpmuludq $T3,$S3,$M1 232562306a36Sopenharmony_ci vpmuludq $T3,$S4,$M2 232662306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 232762306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r3'*r0 232862306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r3'*r1 232962306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 233062306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 233162306a36Sopenharmony_ci 233262306a36Sopenharmony_ci vpmuludq $T4,$S4,$M3 233362306a36Sopenharmony_ci vpmuludq $T4,$R0,$M4 233462306a36Sopenharmony_ci vpmuludq $T4,$S1,$M0 233562306a36Sopenharmony_ci vpmuludq $T4,$S2,$M1 233662306a36Sopenharmony_ci vpmuludq $T4,$S3,$M2 233762306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4 233862306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r4'*r0 233962306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1 234062306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2 234162306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3 234262306a36Sopenharmony_ci 234362306a36Sopenharmony_ci ################################################################ 234462306a36Sopenharmony_ci # load input 234562306a36Sopenharmony_ci vmovdqu64 16*0($inp),%z#$T3 234662306a36Sopenharmony_ci vmovdqu64 16*4($inp),%z#$T4 234762306a36Sopenharmony_ci lea 16*8($inp),$inp 234862306a36Sopenharmony_ci 234962306a36Sopenharmony_ci ################################################################
235062306a36Sopenharmony_ci # lazy reduction 235162306a36Sopenharmony_ci 235262306a36Sopenharmony_ci vpsrlq \$26,$D3,$M3 235362306a36Sopenharmony_ci vpandq $MASK,$D3,$D3 235462306a36Sopenharmony_ci vpaddq $M3,$D4,$D4 # d3 -> d4 235562306a36Sopenharmony_ci 235662306a36Sopenharmony_ci vpsrlq \$26,$D0,$M0 235762306a36Sopenharmony_ci vpandq $MASK,$D0,$D0 235862306a36Sopenharmony_ci vpaddq $M0,$D1,$D1 # d0 -> d1 235962306a36Sopenharmony_ci 236062306a36Sopenharmony_ci vpsrlq \$26,$D4,$M4 236162306a36Sopenharmony_ci vpandq $MASK,$D4,$D4 236262306a36Sopenharmony_ci 236362306a36Sopenharmony_ci vpsrlq \$26,$D1,$M1 236462306a36Sopenharmony_ci vpandq $MASK,$D1,$D1 236562306a36Sopenharmony_ci vpaddq $M1,$D2,$D2 # d1 -> d2 236662306a36Sopenharmony_ci 236762306a36Sopenharmony_ci vpaddq $M4,$D0,$D0 236862306a36Sopenharmony_ci vpsllq \$2,$M4,$M4 236962306a36Sopenharmony_ci vpaddq $M4,$D0,$D0 # d4 -> d0 237062306a36Sopenharmony_ci 237162306a36Sopenharmony_ci vpsrlq \$26,$D2,$M2 237262306a36Sopenharmony_ci vpandq $MASK,$D2,$D2 237362306a36Sopenharmony_ci vpaddq $M2,$D3,$D3 # d2 -> d3 237462306a36Sopenharmony_ci 237562306a36Sopenharmony_ci vpsrlq \$26,$D0,$M0 237662306a36Sopenharmony_ci vpandq $MASK,$D0,$D0 237762306a36Sopenharmony_ci vpaddq $M0,$D1,$D1 # d0 -> d1 237862306a36Sopenharmony_ci 237962306a36Sopenharmony_ci vpsrlq \$26,$D3,$M3 238062306a36Sopenharmony_ci vpandq $MASK,$D3,$D3 238162306a36Sopenharmony_ci vpaddq $M3,$D4,$D4 # d3 -> d4 238262306a36Sopenharmony_ci 238362306a36Sopenharmony_ci ################################################################ 238462306a36Sopenharmony_ci # at this point we have 14243444 in $R0-$S4 and 05060708 in 238562306a36Sopenharmony_ci # $D0-$D4, ... 238662306a36Sopenharmony_ci 238762306a36Sopenharmony_ci vpunpcklqdq $T4,$T3,$T0 # transpose input 238862306a36Sopenharmony_ci vpunpckhqdq $T4,$T3,$T4 238962306a36Sopenharmony_ci 239062306a36Sopenharmony_ci # ... 
since input 64-bit lanes are ordered as 73625140, we could 239162306a36Sopenharmony_ci # "vperm" it to 76543210 (here and in each loop iteration), *or* 239262306a36Sopenharmony_ci # we could just flow along, hence the goal for $R0-$S4 is 239362306a36Sopenharmony_ci # 1858286838784888 ... 239462306a36Sopenharmony_ci 239562306a36Sopenharmony_ci vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 239662306a36Sopenharmony_ci mov \$0x7777,%eax 239762306a36Sopenharmony_ci kmovw %eax,%k1 239862306a36Sopenharmony_ci 239962306a36Sopenharmony_ci vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 240062306a36Sopenharmony_ci vpermd $R1,$M0,$R1 240162306a36Sopenharmony_ci vpermd $R2,$M0,$R2 240262306a36Sopenharmony_ci vpermd $R3,$M0,$R3 240362306a36Sopenharmony_ci vpermd $R4,$M0,$R4 240462306a36Sopenharmony_ci 240562306a36Sopenharmony_ci vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 240662306a36Sopenharmony_ci vpermd $D1,$M0,${R1}{%k1} 240762306a36Sopenharmony_ci vpermd $D2,$M0,${R2}{%k1} 240862306a36Sopenharmony_ci vpermd $D3,$M0,${R3}{%k1} 240962306a36Sopenharmony_ci vpermd $D4,$M0,${R4}{%k1} 241062306a36Sopenharmony_ci 241162306a36Sopenharmony_ci vpslld \$2,$R1,$S1 # *5 241262306a36Sopenharmony_ci vpslld \$2,$R2,$S2 241362306a36Sopenharmony_ci vpslld \$2,$R3,$S3 241462306a36Sopenharmony_ci vpslld \$2,$R4,$S4 241562306a36Sopenharmony_ci vpaddd $R1,$S1,$S1 241662306a36Sopenharmony_ci vpaddd $R2,$S2,$S2 241762306a36Sopenharmony_ci vpaddd $R3,$S3,$S3 241862306a36Sopenharmony_ci vpaddd $R4,$S4,$S4 241962306a36Sopenharmony_ci 242062306a36Sopenharmony_ci vpbroadcastq 32(%rcx),$PADBIT # .L129 242162306a36Sopenharmony_ci 242262306a36Sopenharmony_ci vpsrlq \$52,$T0,$T2 # splat input 242362306a36Sopenharmony_ci vpsllq \$12,$T4,$T3 242462306a36Sopenharmony_ci vporq $T3,$T2,$T2 242562306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 242662306a36Sopenharmony_ci vpsrlq \$14,$T4,$T3 242762306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 242862306a36Sopenharmony_ci vpandq $MASK,$T2,$T2 # 2 
242962306a36Sopenharmony_ci vpandq $MASK,$T0,$T0 # 0 243062306a36Sopenharmony_ci #vpandq $MASK,$T1,$T1 # 1 243162306a36Sopenharmony_ci #vpandq $MASK,$T3,$T3 # 3 243262306a36Sopenharmony_ci #vporq $PADBIT,$T4,$T4 # padbit, yes, always 243362306a36Sopenharmony_ci 243462306a36Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input 243562306a36Sopenharmony_ci sub \$192,$len 243662306a36Sopenharmony_ci jbe .Ltail_avx512 243762306a36Sopenharmony_ci jmp .Loop_avx512 243862306a36Sopenharmony_ci 243962306a36Sopenharmony_ci.align 32 244062306a36Sopenharmony_ci.Loop_avx512: 244162306a36Sopenharmony_ci ################################################################ 244262306a36Sopenharmony_ci # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 244362306a36Sopenharmony_ci # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 244462306a36Sopenharmony_ci # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 244562306a36Sopenharmony_ci # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 244662306a36Sopenharmony_ci # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 244762306a36Sopenharmony_ci # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 244862306a36Sopenharmony_ci # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 244962306a36Sopenharmony_ci # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 245062306a36Sopenharmony_ci # \________/\___________/ 245162306a36Sopenharmony_ci ################################################################ 245262306a36Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 245362306a36Sopenharmony_ci 245462306a36Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 245562306a36Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 245662306a36Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 245762306a36Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 245862306a36Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 245962306a36Sopenharmony_ci # 246062306a36Sopenharmony_ci # however, as h2 is "chronologically" first one available pull 246162306a36Sopenharmony_ci # 
corresponding operations up, so it's 246262306a36Sopenharmony_ci # 246362306a36Sopenharmony_ci # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 246462306a36Sopenharmony_ci # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 246562306a36Sopenharmony_ci # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 246662306a36Sopenharmony_ci # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 246762306a36Sopenharmony_ci # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 246862306a36Sopenharmony_ci 246962306a36Sopenharmony_ci vpmuludq $H2,$R1,$D3 # d3 = h2*r1 247062306a36Sopenharmony_ci vpaddq $H0,$T0,$H0 247162306a36Sopenharmony_ci vpmuludq $H2,$R2,$D4 # d4 = h2*r2 247262306a36Sopenharmony_ci vpandq $MASK,$T1,$T1 # 1 247362306a36Sopenharmony_ci vpmuludq $H2,$S3,$D0 # d0 = h2*s3 247462306a36Sopenharmony_ci vpandq $MASK,$T3,$T3 # 3 247562306a36Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 247662306a36Sopenharmony_ci vporq $PADBIT,$T4,$T4 # padbit, yes, always 247762306a36Sopenharmony_ci vpmuludq $H2,$R0,$D2 # d2 = h2*r0 247862306a36Sopenharmony_ci vpaddq $H1,$T1,$H1 # accumulate input 247962306a36Sopenharmony_ci vpaddq $H3,$T3,$H3 248062306a36Sopenharmony_ci vpaddq $H4,$T4,$H4 248162306a36Sopenharmony_ci 248262306a36Sopenharmony_ci vmovdqu64 16*0($inp),$T3 # load input 248362306a36Sopenharmony_ci vmovdqu64 16*4($inp),$T4 248462306a36Sopenharmony_ci lea 16*8($inp),$inp 248562306a36Sopenharmony_ci vpmuludq $H0,$R3,$M3 248662306a36Sopenharmony_ci vpmuludq $H0,$R4,$M4 248762306a36Sopenharmony_ci vpmuludq $H0,$R0,$M0 248862306a36Sopenharmony_ci vpmuludq $H0,$R1,$M1 248962306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h0*r3 249062306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h0*r4 249162306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h0*r0 249262306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h0*r1 249362306a36Sopenharmony_ci 249462306a36Sopenharmony_ci vpmuludq $H1,$R2,$M3 249562306a36Sopenharmony_ci vpmuludq $H1,$R3,$M4 249662306a36Sopenharmony_ci vpmuludq $H1,$S4,$M0 
249762306a36Sopenharmony_ci vpmuludq $H0,$R2,$M2 249862306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h1*r2 249962306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h1*r3 250062306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h1*s4 250162306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h0*r2 250262306a36Sopenharmony_ci 250362306a36Sopenharmony_ci vpunpcklqdq $T4,$T3,$T0 # transpose input 250462306a36Sopenharmony_ci vpunpckhqdq $T4,$T3,$T4 250562306a36Sopenharmony_ci 250662306a36Sopenharmony_ci vpmuludq $H3,$R0,$M3 250762306a36Sopenharmony_ci vpmuludq $H3,$R1,$M4 250862306a36Sopenharmony_ci vpmuludq $H1,$R0,$M1 250962306a36Sopenharmony_ci vpmuludq $H1,$R1,$M2 251062306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h3*r0 251162306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h3*r1 251262306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h1*r0 251362306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h1*r1 251462306a36Sopenharmony_ci 251562306a36Sopenharmony_ci vpmuludq $H4,$S4,$M3 251662306a36Sopenharmony_ci vpmuludq $H4,$R0,$M4 251762306a36Sopenharmony_ci vpmuludq $H3,$S2,$M0 251862306a36Sopenharmony_ci vpmuludq $H3,$S3,$M1 251962306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h4*s4 252062306a36Sopenharmony_ci vpmuludq $H3,$S4,$M2 252162306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h4*r0 252262306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h3*s2 252362306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h3*s3 252462306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h3*s4 252562306a36Sopenharmony_ci 252662306a36Sopenharmony_ci vpmuludq $H4,$S1,$M0 252762306a36Sopenharmony_ci vpmuludq $H4,$S2,$M1 252862306a36Sopenharmony_ci vpmuludq $H4,$S3,$M2 252962306a36Sopenharmony_ci vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 253062306a36Sopenharmony_ci vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 253162306a36Sopenharmony_ci vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 253262306a36Sopenharmony_ci 253362306a36Sopenharmony_ci ################################################################
253462306a36Sopenharmony_ci # lazy reduction (interleaved with input splat) 253562306a36Sopenharmony_ci 253662306a36Sopenharmony_ci vpsrlq \$52,$T0,$T2 # splat input 253762306a36Sopenharmony_ci vpsllq \$12,$T4,$T3 253862306a36Sopenharmony_ci 253962306a36Sopenharmony_ci vpsrlq \$26,$D3,$H3 254062306a36Sopenharmony_ci vpandq $MASK,$D3,$D3 254162306a36Sopenharmony_ci vpaddq $H3,$D4,$H4 # h3 -> h4 254262306a36Sopenharmony_ci 254362306a36Sopenharmony_ci vporq $T3,$T2,$T2 254462306a36Sopenharmony_ci 254562306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 254662306a36Sopenharmony_ci vpandq $MASK,$H0,$H0 254762306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 254862306a36Sopenharmony_ci 254962306a36Sopenharmony_ci vpandq $MASK,$T2,$T2 # 2 255062306a36Sopenharmony_ci 255162306a36Sopenharmony_ci vpsrlq \$26,$H4,$D4 255262306a36Sopenharmony_ci vpandq $MASK,$H4,$H4 255362306a36Sopenharmony_ci 255462306a36Sopenharmony_ci vpsrlq \$26,$H1,$D1 255562306a36Sopenharmony_ci vpandq $MASK,$H1,$H1 255662306a36Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 255762306a36Sopenharmony_ci 255862306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 255962306a36Sopenharmony_ci vpsllq \$2,$D4,$D4 256062306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 256162306a36Sopenharmony_ci 256262306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 # modulo-scheduled 256362306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 256462306a36Sopenharmony_ci 256562306a36Sopenharmony_ci vpsrlq \$26,$H2,$D2 256662306a36Sopenharmony_ci vpandq $MASK,$H2,$H2 256762306a36Sopenharmony_ci vpaddq $D2,$D3,$H3 # h2 -> h3 256862306a36Sopenharmony_ci 256962306a36Sopenharmony_ci vpsrlq \$14,$T4,$T3 257062306a36Sopenharmony_ci 257162306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 257262306a36Sopenharmony_ci vpandq $MASK,$H0,$H0 257362306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 257462306a36Sopenharmony_ci 257562306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 257662306a36Sopenharmony_ci 257762306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 257862306a36Sopenharmony_ci vpandq 
$MASK,$H3,$H3 257962306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 258062306a36Sopenharmony_ci 258162306a36Sopenharmony_ci vpandq $MASK,$T0,$T0 # 0 258262306a36Sopenharmony_ci #vpandq $MASK,$T1,$T1 # 1 258362306a36Sopenharmony_ci #vpandq $MASK,$T3,$T3 # 3 258462306a36Sopenharmony_ci #vporq $PADBIT,$T4,$T4 # padbit, yes, always 258562306a36Sopenharmony_ci 258662306a36Sopenharmony_ci sub \$128,$len 258762306a36Sopenharmony_ci ja .Loop_avx512 258862306a36Sopenharmony_ci 258962306a36Sopenharmony_ci.Ltail_avx512: 259062306a36Sopenharmony_ci ################################################################ 259162306a36Sopenharmony_ci # while above multiplications were by r^8 in all lanes, in last 259262306a36Sopenharmony_ci # iteration we multiply least significant lane by r^8 and most 259362306a36Sopenharmony_ci # significant one by r, that's why table gets shifted... 259462306a36Sopenharmony_ci 259562306a36Sopenharmony_ci vpsrlq \$32,$R0,$R0 # 0105020603070408 259662306a36Sopenharmony_ci vpsrlq \$32,$R1,$R1 259762306a36Sopenharmony_ci vpsrlq \$32,$R2,$R2 259862306a36Sopenharmony_ci vpsrlq \$32,$S3,$S3 259962306a36Sopenharmony_ci vpsrlq \$32,$S4,$S4 260062306a36Sopenharmony_ci vpsrlq \$32,$R3,$R3 260162306a36Sopenharmony_ci vpsrlq \$32,$R4,$R4 260262306a36Sopenharmony_ci vpsrlq \$32,$S1,$S1 260362306a36Sopenharmony_ci vpsrlq \$32,$S2,$S2 260462306a36Sopenharmony_ci 260562306a36Sopenharmony_ci ################################################################ 260662306a36Sopenharmony_ci # load either next or last 64 bytes of input 260762306a36Sopenharmony_ci lea ($inp,$len),$inp 260862306a36Sopenharmony_ci 260962306a36Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 261062306a36Sopenharmony_ci vpaddq $H0,$T0,$H0 261162306a36Sopenharmony_ci 261262306a36Sopenharmony_ci vpmuludq $H2,$R1,$D3 # d3 = h2*r1 261362306a36Sopenharmony_ci vpmuludq $H2,$R2,$D4 # d4 = h2*r2 261462306a36Sopenharmony_ci vpmuludq $H2,$S3,$D0 # d0 = h2*s3 261562306a36Sopenharmony_ci vpandq
$MASK,$T1,$T1 # 1 261662306a36Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 261762306a36Sopenharmony_ci vpandq $MASK,$T3,$T3 # 3 261862306a36Sopenharmony_ci vpmuludq $H2,$R0,$D2 # d2 = h2*r0 261962306a36Sopenharmony_ci vporq $PADBIT,$T4,$T4 # padbit, yes, always 262062306a36Sopenharmony_ci vpaddq $H1,$T1,$H1 # accumulate input 262162306a36Sopenharmony_ci vpaddq $H3,$T3,$H3 262262306a36Sopenharmony_ci vpaddq $H4,$T4,$H4 262362306a36Sopenharmony_ci 262462306a36Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 262562306a36Sopenharmony_ci vpmuludq $H0,$R3,$M3 262662306a36Sopenharmony_ci vpmuludq $H0,$R4,$M4 262762306a36Sopenharmony_ci vpmuludq $H0,$R0,$M0 262862306a36Sopenharmony_ci vpmuludq $H0,$R1,$M1 262962306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h0*r3 263062306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h0*r4 263162306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h0*r0 263262306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h0*r1 263362306a36Sopenharmony_ci 263462306a36Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 263562306a36Sopenharmony_ci vpmuludq $H1,$R2,$M3 263662306a36Sopenharmony_ci vpmuludq $H1,$R3,$M4 263762306a36Sopenharmony_ci vpmuludq $H1,$S4,$M0 263862306a36Sopenharmony_ci vpmuludq $H0,$R2,$M2 263962306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h1*r2 264062306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h1*r3 264162306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h1*s4 264262306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h0*r2 264362306a36Sopenharmony_ci 264462306a36Sopenharmony_ci vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 264562306a36Sopenharmony_ci vpmuludq $H3,$R0,$M3 264662306a36Sopenharmony_ci vpmuludq $H3,$R1,$M4 264762306a36Sopenharmony_ci vpmuludq $H1,$R0,$M1 264862306a36Sopenharmony_ci vpmuludq $H1,$R1,$M2 264962306a36Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h3*r0 265062306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h3*r1 265162306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h1*r0 265262306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 
+= h1*r1 265362306a36Sopenharmony_ci 265462306a36Sopenharmony_ci vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 265562306a36Sopenharmony_ci vpmuludq $H4,$S4,$M3 265662306a36Sopenharmony_ci vpmuludq $H4,$R0,$M4 265762306a36Sopenharmony_ci vpmuludq $H3,$S2,$M0 265862306a36Sopenharmony_ci vpmuludq $H3,$S3,$M1 265962306a36Sopenharmony_ci vpmuludq $H3,$S4,$M2 266062306a36Sopenharmony_ci vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 266162306a36Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h4*r0 266262306a36Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h3*s2 266362306a36Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h3*s3 266462306a36Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h3*s4 266562306a36Sopenharmony_ci 266662306a36Sopenharmony_ci vpmuludq $H4,$S1,$M0 266762306a36Sopenharmony_ci vpmuludq $H4,$S2,$M1 266862306a36Sopenharmony_ci vpmuludq $H4,$S3,$M2 266962306a36Sopenharmony_ci vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 267062306a36Sopenharmony_ci vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 267162306a36Sopenharmony_ci vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 267262306a36Sopenharmony_ci 267362306a36Sopenharmony_ci ################################################################ 267462306a36Sopenharmony_ci # horizontal addition 267562306a36Sopenharmony_ci 267662306a36Sopenharmony_ci mov \$1,%eax 267762306a36Sopenharmony_ci vpermq \$0xb1,$H3,$D3 267862306a36Sopenharmony_ci vpermq \$0xb1,$D4,$H4 267962306a36Sopenharmony_ci vpermq \$0xb1,$H0,$D0 268062306a36Sopenharmony_ci vpermq \$0xb1,$H1,$D1 268162306a36Sopenharmony_ci vpermq \$0xb1,$H2,$D2 268262306a36Sopenharmony_ci vpaddq $D3,$H3,$H3 268362306a36Sopenharmony_ci vpaddq $D4,$H4,$H4 268462306a36Sopenharmony_ci vpaddq $D0,$H0,$H0 268562306a36Sopenharmony_ci vpaddq $D1,$H1,$H1 268662306a36Sopenharmony_ci vpaddq $D2,$H2,$H2 268762306a36Sopenharmony_ci 268862306a36Sopenharmony_ci kmovw %eax,%k3 268962306a36Sopenharmony_ci vpermq \$0x2,$H3,$D3 269062306a36Sopenharmony_ci vpermq \$0x2,$H4,$D4 269162306a36Sopenharmony_ci vpermq \$0x2,$H0,$D0 
269262306a36Sopenharmony_ci vpermq \$0x2,$H1,$D1 269362306a36Sopenharmony_ci vpermq \$0x2,$H2,$D2 269462306a36Sopenharmony_ci vpaddq $D3,$H3,$H3 269562306a36Sopenharmony_ci vpaddq $D4,$H4,$H4 269662306a36Sopenharmony_ci vpaddq $D0,$H0,$H0 269762306a36Sopenharmony_ci vpaddq $D1,$H1,$H1 269862306a36Sopenharmony_ci vpaddq $D2,$H2,$H2 269962306a36Sopenharmony_ci 270062306a36Sopenharmony_ci vextracti64x4 \$0x1,$H3,%y#$D3 270162306a36Sopenharmony_ci vextracti64x4 \$0x1,$H4,%y#$D4 270262306a36Sopenharmony_ci vextracti64x4 \$0x1,$H0,%y#$D0 270362306a36Sopenharmony_ci vextracti64x4 \$0x1,$H1,%y#$D1 270462306a36Sopenharmony_ci vextracti64x4 \$0x1,$H2,%y#$D2 270562306a36Sopenharmony_ci vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 270662306a36Sopenharmony_ci vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 270762306a36Sopenharmony_ci vpaddq $D0,$H0,${H0}{%k3}{z} 270862306a36Sopenharmony_ci vpaddq $D1,$H1,${H1}{%k3}{z} 270962306a36Sopenharmony_ci vpaddq $D2,$H2,${H2}{%k3}{z} 271062306a36Sopenharmony_ci___ 271162306a36Sopenharmony_cimap(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 271262306a36Sopenharmony_cimap(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 271362306a36Sopenharmony_ci$code.=<<___; 271462306a36Sopenharmony_ci ################################################################ 271562306a36Sopenharmony_ci # lazy reduction (interleaved with input splat) 271662306a36Sopenharmony_ci 271762306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 271862306a36Sopenharmony_ci vpand $MASK,$H3,$H3 271962306a36Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 272062306a36Sopenharmony_ci vpsrldq \$6,$T1,$T3 272162306a36Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 272262306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 272362306a36Sopenharmony_ci 272462306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 272562306a36Sopenharmony_ci vpand $MASK,$H0,$H0 272662306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T2 # 2:3 272762306a36Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 
272862306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 272962306a36Sopenharmony_ci 273062306a36Sopenharmony_ci vpsrlq \$26,$H4,$D4 273162306a36Sopenharmony_ci vpand $MASK,$H4,$H4 273262306a36Sopenharmony_ci 273362306a36Sopenharmony_ci vpsrlq \$26,$H1,$D1 273462306a36Sopenharmony_ci vpand $MASK,$H1,$H1 273562306a36Sopenharmony_ci vpsrlq \$30,$T2,$T3 273662306a36Sopenharmony_ci vpsrlq \$4,$T2,$T2 273762306a36Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 273862306a36Sopenharmony_ci 273962306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 274062306a36Sopenharmony_ci vpsllq \$2,$D4,$D4 274162306a36Sopenharmony_ci vpsrlq \$26,$T0,$T1 274262306a36Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 274362306a36Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 274462306a36Sopenharmony_ci 274562306a36Sopenharmony_ci vpsrlq \$26,$H2,$D2 274662306a36Sopenharmony_ci vpand $MASK,$H2,$H2 274762306a36Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 274862306a36Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 274962306a36Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 275062306a36Sopenharmony_ci 275162306a36Sopenharmony_ci vpsrlq \$26,$H0,$D0 275262306a36Sopenharmony_ci vpand $MASK,$H0,$H0 275362306a36Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 275462306a36Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 275562306a36Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 275662306a36Sopenharmony_ci 275762306a36Sopenharmony_ci vpsrlq \$26,$H3,$D3 275862306a36Sopenharmony_ci vpand $MASK,$H3,$H3 275962306a36Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 276062306a36Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 276162306a36Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 276262306a36Sopenharmony_ci 276362306a36Sopenharmony_ci lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 276462306a36Sopenharmony_ci add \$64,$len 276562306a36Sopenharmony_ci jnz .Ltail_avx2$suffix 276662306a36Sopenharmony_ci 276762306a36Sopenharmony_ci vpsubq $T2,$H2,$H2 # undo input accumulation 276862306a36Sopenharmony_ci vmovd 
%x#$H0,`4*0-48-64`($ctx)# save partially reduced 276962306a36Sopenharmony_ci vmovd %x#$H1,`4*1-48-64`($ctx) 277062306a36Sopenharmony_ci vmovd %x#$H2,`4*2-48-64`($ctx) 277162306a36Sopenharmony_ci vmovd %x#$H3,`4*3-48-64`($ctx) 277262306a36Sopenharmony_ci vmovd %x#$H4,`4*4-48-64`($ctx) 277362306a36Sopenharmony_ci vzeroall 277462306a36Sopenharmony_ci___ 277562306a36Sopenharmony_ci$code.=<<___ if ($win64); 277662306a36Sopenharmony_ci movdqa -0xb0(%r10),%xmm6 277762306a36Sopenharmony_ci movdqa -0xa0(%r10),%xmm7 277862306a36Sopenharmony_ci movdqa -0x90(%r10),%xmm8 277962306a36Sopenharmony_ci movdqa -0x80(%r10),%xmm9 278062306a36Sopenharmony_ci movdqa -0x70(%r10),%xmm10 278162306a36Sopenharmony_ci movdqa -0x60(%r10),%xmm11 278262306a36Sopenharmony_ci movdqa -0x50(%r10),%xmm12 278362306a36Sopenharmony_ci movdqa -0x40(%r10),%xmm13 278462306a36Sopenharmony_ci movdqa -0x30(%r10),%xmm14 278562306a36Sopenharmony_ci movdqa -0x20(%r10),%xmm15 278662306a36Sopenharmony_ci lea -8(%r10),%rsp 278762306a36Sopenharmony_ci.Ldo_avx512_epilogue: 278862306a36Sopenharmony_ci___ 278962306a36Sopenharmony_ci$code.=<<___ if (!$win64); 279062306a36Sopenharmony_ci lea -8(%r10),%rsp 279162306a36Sopenharmony_ci.cfi_def_cfa_register %rsp 279262306a36Sopenharmony_ci___ 279362306a36Sopenharmony_ci$code.=<<___; 279462306a36Sopenharmony_ci RET 279562306a36Sopenharmony_ci.cfi_endproc 279662306a36Sopenharmony_ci___ 279762306a36Sopenharmony_ci 279862306a36Sopenharmony_ci} 279962306a36Sopenharmony_ci 280062306a36Sopenharmony_ci} 280162306a36Sopenharmony_ci 280262306a36Sopenharmony_ci&declare_function("poly1305_blocks_avx2", 32, 4); 280362306a36Sopenharmony_cipoly1305_blocks_avxN(0); 280462306a36Sopenharmony_ci&end_function("poly1305_blocks_avx2"); 280562306a36Sopenharmony_ci 280662306a36Sopenharmony_ci####################################################################### 280762306a36Sopenharmony_ciif ($avx>2) { 280862306a36Sopenharmony_ci# On entry we have input length divisible by 64. 
# But since inner loop
# processes 128 bytes per iteration, cases when length is not divisible
# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
# reason stack layout is kept identical to poly1305_blocks_avx2. If not
# for this tail, we wouldn't have to even allocate stack frame...

# Emit poly1305_blocks_avx512. For kernel builds ($kernel set) the whole
# function is wrapped in "#ifdef CONFIG_AS_AVX512" / "#endif" in the
# generated output.
if($kernel) {
	$code .= "#ifdef CONFIG_AS_AVX512\n";
}

# The body comes from the shared poly1305_blocks_avxN generator, called
# here with argument 1 (the poly1305_blocks_avx2 emission passes 0).
# NOTE(review): the argument presumably selects the AVX512 code path --
# confirm against the definition of poly1305_blocks_avxN.
&declare_function("poly1305_blocks_avx512", 32, 4);
poly1305_blocks_avxN(1);
&end_function("poly1305_blocks_avx512");

if ($kernel) {
	$code .= "#endif\n";
}

# Everything from here on is generated only for non-kernel builds with
# $avx greater than 3.
if (!$kernel && $avx>3) {
########################################################################
# VPMADD52 version using 2^44 radix.
#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize couple of
# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
# at amount of multiply-n-accumulate operations. Secondly, it makes it
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
# reference implementations], which means that more such operations
# would have to be performed in inner loop, which in turn makes critical
# path longer.
# In other words, even though base 2^44 reduction might
# look less elegant, overall critical path is actually shorter...

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^44
#	unsigned __int64 s[2];		# key value*20 base 2^44
#	unsigned __int64 r[3];		# key value base 2^44
#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
#					# r^n positions reflect
#					# placement in register, not
#					# memory, R[3] is R[1]*20

# poly1305_init_base2_44: emitted in up to four heredoc pieces.
#
#   * zeroes h[0..2] at 0/8/16($ctx);
#   * at .Linit_base2_44 takes the addresses of poly1305_blocks_vpmadd52
#     and poly1305_emit_base2_44 into %r10/%r11;
#   * masks the two 64-bit key words read from $inp with
#     0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and splits them into
#     r0 (low 44 bits), r1 (next 44 bits, via shrd) and r2 (high word
#     shifted right by 24), stored at 40/48/56($ctx);
#   * stores s1 = r1*20 and s2 = r2*20 (the "*5" lea followed by "<<2")
#     at 24/32($ctx);
#   * writes -1 to 64($ctx) as the "impossible value" that
#     poly1305_blocks_vpmadd52 later tests by sign;
#   * returns 1 in %eax.
$code.=<<___;
.type	poly1305_init_base2_44,\@function,3
.align	32
poly1305_init_base2_44:
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

.Linit_base2_44:
	lea	poly1305_blocks_vpmadd52(%rip),%r10
	lea	poly1305_emit_base2_44(%rip),%r11

	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	mov	\$0x00000fffffffffff,%r8
	and	8($inp),%rcx
	mov	\$0x00000fffffffffff,%r9
	and	%rax,%r8
	shrd	\$44,%rcx,%rax
	mov	%r8,40($ctx)		# r0
	and	%r9,%rax
	shr	\$24,%rcx
	mov	%rax,48($ctx)		# r1
	lea	(%rax,%rax,4),%rax	# *5
	mov	%rcx,56($ctx)		# r2
	shl	\$2,%rax		# magic <<2
	lea	(%rcx,%rcx,4),%rcx	# *5
	shl	\$2,%rcx		# magic <<2
	mov	%rax,24($ctx)		# s1
	mov	%rcx,32($ctx)		# s2
	movq	\$-1,64($ctx)		# write impossible value
___
# Store the two routine addresses through %rdx: as 64-bit values at
# offsets 0 and 8, or as 32-bit values at offsets 0 and 4 when $flavour
# matches /elf32/.
# NOTE(review): %rdx is presumably the caller-supplied function table
# (third argument) -- confirm against the callers.
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
	RET
.size	poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
# Register names for poly1305_blocks_vpmadd52, scoped to this bare block.
# The third list used to be built from (22..25), which yields four names
# for three variables; the surplus %ymm25 was silently discarded, so the
# range is narrowed to exactly what is assigned.
my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..24));

# poly1305_blocks_vpmadd52: single-block routine.
#
# $len is converted to a block count (shr 4); zero blocks returns at
# once. %rax is set to a mask of 3 only when fewer than 4 blocks are
# pending AND the value at 64($ctx) is still negative (key powers not
# computed); otherwise the mask is 1. "%rax & $len" is the number of
# blocks handled by .Loop_vpmadd52; if that is zero, or if blocks remain
# after the loop, control goes to .Lblocks_vpmadd52_4x, a label emitted
# by the 4x block that follows this one.
#
# Opmask %k7 (value 7) restricts loads, permutes and the final store to
# the three low qwords; %k1 (value 1) selects the lowest qword only.
$code.=<<___;
.type	poly1305_blocks_vpmadd52,\@function,4
.align	32
poly1305_blocks_vpmadd52:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52		# too short

	shl	\$40,$padbit
	mov	64($ctx),%r8			# peek on power of the key

	# if powers of the key are not calculated yet, process up to 3
	# blocks with this single-block subroutine, otherwise ensure that
	# length is divisible by 2 blocks and pass the rest down to next
	# subroutine...

	mov	\$3,%rax
	mov	\$1,%r10
	cmp	\$4,$len			# is input long
	cmovae	%r10,%rax
	test	%r8,%r8				# is power value impossible?
	cmovns	%r10,%rax

	and	$len,%rax			# is input of favourable length?
	jz	.Lblocks_vpmadd52_4x

	sub	%rax,$len
	mov	\$7,%r10d
	mov	\$1,%r11d
	kmovw	%r10d,%k7
	lea	.L2_44_inp_permd(%rip),%r10
	kmovw	%r11d,%k1

	vmovq	$padbit,%x#$PAD
	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
	vpermq	\$0xcf,$PAD,$PAD
	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask

	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}

	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft

	jmp	.Loop_vpmadd52

.align	32
.Loop_vpmadd52:
	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
	lea	16($inp),$inp

	vpermd	$T0,$inp_permd,$T0	# ----3210 -> --322110
	vpsrlvq	$inp_shift,$T0,$T0
	vpandq	$reduc_mask,$T0,$T0
	vporq	$PAD,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo		# accumulate input

	vpermq	\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
	vpermq	\$0b01010101,$Dlo,${H1}{%k7}{z}
	vpermq	\$0b10101010,$Dlo,${H2}{%k7}{z}

	vpxord	$Dlo,$Dlo,$Dlo
	vpxord	$Dhi,$Dhi,$Dhi

	vpmadd52luq	$r2r1r0,$H0,$Dlo
	vpmadd52huq	$r2r1r0,$H0,$Dhi

	vpmadd52luq	$r1r0s2,$H1,$Dlo
	vpmadd52huq	$r1r0s2,$H1,$Dhi

	vpmadd52luq	$r0s2s1,$H2,$Dlo
	vpmadd52huq	$r0s2s1,$H2,$Dhi

	vpsrlvq	$reduc_rght,$Dlo,$T0	# 0 in topmost qword
	vpsllvq	$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
	vpandq	$reduc_mask,$Dlo,$Dlo

	vpaddq	$T0,$Dhi,$Dhi

	vpermq	\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword

	vpaddq	$Dhi,$Dlo,$Dlo		# note topmost qword :-)

	vpsrlvq	$reduc_rght,$Dlo,$T0	# 0 in topmost word
	vpandq	$reduc_mask,$Dlo,$Dlo

	vpermq	\$0b10010011,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo

	vpermq	\$0b10010011,$Dlo,${T0}{%k1}{z}

	vpaddq	$T0,$Dlo,$Dlo
	vpsllq	\$2,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo

	dec	%rax			# len-=16
	jnz	.Loop_vpmadd52

	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value

	test	$len,$len
	jnz	.Lblocks_vpmadd52_4x

.Lno_data_vpmadd52:
	RET
.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
{
########################################################################
# As implied by its name 4x subroutine processes 4 blocks in parallel
# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
# and is handled in 256-bit %ymm registers.
302162306a36Sopenharmony_ci 302262306a36Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 302362306a36Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 302462306a36Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 302562306a36Sopenharmony_ci 302662306a36Sopenharmony_ci$code.=<<___; 302762306a36Sopenharmony_ci.type poly1305_blocks_vpmadd52_4x,\@function,4 302862306a36Sopenharmony_ci.align 32 302962306a36Sopenharmony_cipoly1305_blocks_vpmadd52_4x: 303062306a36Sopenharmony_ci shr \$4,$len 303162306a36Sopenharmony_ci jz .Lno_data_vpmadd52_4x # too short 303262306a36Sopenharmony_ci 303362306a36Sopenharmony_ci shl \$40,$padbit 303462306a36Sopenharmony_ci mov 64($ctx),%r8 # peek on power of the key 303562306a36Sopenharmony_ci 303662306a36Sopenharmony_ci.Lblocks_vpmadd52_4x: 303762306a36Sopenharmony_ci vpbroadcastq $padbit,$PAD 303862306a36Sopenharmony_ci 303962306a36Sopenharmony_ci vmovdqa64 .Lx_mask44(%rip),$mask44 304062306a36Sopenharmony_ci mov \$5,%eax 304162306a36Sopenharmony_ci vmovdqa64 .Lx_mask42(%rip),$mask42 304262306a36Sopenharmony_ci kmovw %eax,%k1 # used in 2x path 304362306a36Sopenharmony_ci 304462306a36Sopenharmony_ci test %r8,%r8 # is power value impossible? 304562306a36Sopenharmony_ci js .Linit_vpmadd52 # if it is, then init R[4] 304662306a36Sopenharmony_ci 304762306a36Sopenharmony_ci vmovq 0($ctx),%x#$H0 # load current hash value 304862306a36Sopenharmony_ci vmovq 8($ctx),%x#$H1 304962306a36Sopenharmony_ci vmovq 16($ctx),%x#$H2 305062306a36Sopenharmony_ci 305162306a36Sopenharmony_ci test \$3,$len # is length 4*n+2? 
305262306a36Sopenharmony_ci jnz .Lblocks_vpmadd52_2x_do 305362306a36Sopenharmony_ci 305462306a36Sopenharmony_ci.Lblocks_vpmadd52_4x_do: 305562306a36Sopenharmony_ci vpbroadcastq 64($ctx),$R0 # load 4th power of the key 305662306a36Sopenharmony_ci vpbroadcastq 96($ctx),$R1 305762306a36Sopenharmony_ci vpbroadcastq 128($ctx),$R2 305862306a36Sopenharmony_ci vpbroadcastq 160($ctx),$S1 305962306a36Sopenharmony_ci 306062306a36Sopenharmony_ci.Lblocks_vpmadd52_4x_key_loaded: 306162306a36Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 = R2*5*4 306262306a36Sopenharmony_ci vpaddq $R2,$S2,$S2 306362306a36Sopenharmony_ci vpsllq \$2,$S2,$S2 306462306a36Sopenharmony_ci 306562306a36Sopenharmony_ci test \$7,$len # is len 8*n? 306662306a36Sopenharmony_ci jz .Lblocks_vpmadd52_8x 306762306a36Sopenharmony_ci 306862306a36Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 306962306a36Sopenharmony_ci vmovdqu64 16*2($inp),$T3 307062306a36Sopenharmony_ci lea 16*4($inp),$inp 307162306a36Sopenharmony_ci 307262306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 307362306a36Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 307462306a36Sopenharmony_ci 307562306a36Sopenharmony_ci # at this point 64-bit lanes are ordered as 3-1-2-0 307662306a36Sopenharmony_ci 307762306a36Sopenharmony_ci vpsrlq \$24,$T3,$T2 # splat the data 307862306a36Sopenharmony_ci vporq $PAD,$T2,$T2 307962306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 308062306a36Sopenharmony_ci vpandq $mask44,$T1,$T0 308162306a36Sopenharmony_ci vpsrlq \$44,$T1,$T1 308262306a36Sopenharmony_ci vpsllq \$20,$T3,$T3 308362306a36Sopenharmony_ci vporq $T3,$T1,$T1 308462306a36Sopenharmony_ci vpandq $mask44,$T1,$T1 308562306a36Sopenharmony_ci 308662306a36Sopenharmony_ci sub \$4,$len 308762306a36Sopenharmony_ci jz .Ltail_vpmadd52_4x 308862306a36Sopenharmony_ci jmp .Loop_vpmadd52_4x 308962306a36Sopenharmony_ci ud2 309062306a36Sopenharmony_ci 309162306a36Sopenharmony_ci.align 32 309262306a36Sopenharmony_ci.Linit_vpmadd52: 309362306a36Sopenharmony_ci 
vmovq 24($ctx),%x#$S1 # load key 309462306a36Sopenharmony_ci vmovq 56($ctx),%x#$H2 309562306a36Sopenharmony_ci vmovq 32($ctx),%x#$S2 309662306a36Sopenharmony_ci vmovq 40($ctx),%x#$R0 309762306a36Sopenharmony_ci vmovq 48($ctx),%x#$R1 309862306a36Sopenharmony_ci 309962306a36Sopenharmony_ci vmovdqa $R0,$H0 310062306a36Sopenharmony_ci vmovdqa $R1,$H1 310162306a36Sopenharmony_ci vmovdqa $H2,$R2 310262306a36Sopenharmony_ci 310362306a36Sopenharmony_ci mov \$2,%eax 310462306a36Sopenharmony_ci 310562306a36Sopenharmony_ci.Lmul_init_vpmadd52: 310662306a36Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 310762306a36Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 310862306a36Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 310962306a36Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 311062306a36Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 311162306a36Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 311262306a36Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 311362306a36Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 311462306a36Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 311562306a36Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 311662306a36Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 311762306a36Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 311862306a36Sopenharmony_ci 311962306a36Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 312062306a36Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 312162306a36Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 312262306a36Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 312362306a36Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 312462306a36Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 312562306a36Sopenharmony_ci 312662306a36Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 312762306a36Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 312862306a36Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 312962306a36Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 313062306a36Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 313162306a36Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 313262306a36Sopenharmony_ci 313362306a36Sopenharmony_ci 
################################################################ 313462306a36Sopenharmony_ci # partial reduction 313562306a36Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 313662306a36Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 313762306a36Sopenharmony_ci vpandq $mask44,$D0lo,$H0 313862306a36Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 313962306a36Sopenharmony_ci 314062306a36Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 314162306a36Sopenharmony_ci 314262306a36Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 314362306a36Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 314462306a36Sopenharmony_ci vpandq $mask44,$D1lo,$H1 314562306a36Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 314662306a36Sopenharmony_ci 314762306a36Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 314862306a36Sopenharmony_ci 314962306a36Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 315062306a36Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 315162306a36Sopenharmony_ci vpandq $mask42,$D2lo,$H2 315262306a36Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 315362306a36Sopenharmony_ci 315462306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 315562306a36Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 315662306a36Sopenharmony_ci 315762306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 315862306a36Sopenharmony_ci 315962306a36Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 316062306a36Sopenharmony_ci vpandq $mask44,$H0,$H0 316162306a36Sopenharmony_ci 316262306a36Sopenharmony_ci vpaddq $tmp,$H1,$H1 316362306a36Sopenharmony_ci 316462306a36Sopenharmony_ci dec %eax 316562306a36Sopenharmony_ci jz .Ldone_init_vpmadd52 316662306a36Sopenharmony_ci 316762306a36Sopenharmony_ci vpunpcklqdq $R1,$H1,$R1 # 1,2 316862306a36Sopenharmony_ci vpbroadcastq %x#$H1,%x#$H1 # 2,2 316962306a36Sopenharmony_ci vpunpcklqdq $R2,$H2,$R2 317062306a36Sopenharmony_ci vpbroadcastq %x#$H2,%x#$H2 317162306a36Sopenharmony_ci vpunpcklqdq $R0,$H0,$R0 317262306a36Sopenharmony_ci vpbroadcastq %x#$H0,%x#$H0 317362306a36Sopenharmony_ci 317462306a36Sopenharmony_ci vpsllq \$2,$R1,$S1 # S1 = R1*5*4 317562306a36Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 
= R2*5*4 317662306a36Sopenharmony_ci vpaddq $R1,$S1,$S1 317762306a36Sopenharmony_ci vpaddq $R2,$S2,$S2 317862306a36Sopenharmony_ci vpsllq \$2,$S1,$S1 317962306a36Sopenharmony_ci vpsllq \$2,$S2,$S2 318062306a36Sopenharmony_ci 318162306a36Sopenharmony_ci jmp .Lmul_init_vpmadd52 318262306a36Sopenharmony_ci ud2 318362306a36Sopenharmony_ci 318462306a36Sopenharmony_ci.align 32 318562306a36Sopenharmony_ci.Ldone_init_vpmadd52: 318662306a36Sopenharmony_ci vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 318762306a36Sopenharmony_ci vinserti128 \$1,%x#$R2,$H2,$R2 318862306a36Sopenharmony_ci vinserti128 \$1,%x#$R0,$H0,$R0 318962306a36Sopenharmony_ci 319062306a36Sopenharmony_ci vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 319162306a36Sopenharmony_ci vpermq \$0b11011000,$R2,$R2 319262306a36Sopenharmony_ci vpermq \$0b11011000,$R0,$R0 319362306a36Sopenharmony_ci 319462306a36Sopenharmony_ci vpsllq \$2,$R1,$S1 # S1 = R1*5*4 319562306a36Sopenharmony_ci vpaddq $R1,$S1,$S1 319662306a36Sopenharmony_ci vpsllq \$2,$S1,$S1 319762306a36Sopenharmony_ci 319862306a36Sopenharmony_ci vmovq 0($ctx),%x#$H0 # load current hash value 319962306a36Sopenharmony_ci vmovq 8($ctx),%x#$H1 320062306a36Sopenharmony_ci vmovq 16($ctx),%x#$H2 320162306a36Sopenharmony_ci 320262306a36Sopenharmony_ci test \$3,$len # is length 4*n+2? 
320362306a36Sopenharmony_ci jnz .Ldone_init_vpmadd52_2x 320462306a36Sopenharmony_ci 320562306a36Sopenharmony_ci vmovdqu64 $R0,64($ctx) # save key powers 320662306a36Sopenharmony_ci vpbroadcastq %x#$R0,$R0 # broadcast 4th power 320762306a36Sopenharmony_ci vmovdqu64 $R1,96($ctx) 320862306a36Sopenharmony_ci vpbroadcastq %x#$R1,$R1 320962306a36Sopenharmony_ci vmovdqu64 $R2,128($ctx) 321062306a36Sopenharmony_ci vpbroadcastq %x#$R2,$R2 321162306a36Sopenharmony_ci vmovdqu64 $S1,160($ctx) 321262306a36Sopenharmony_ci vpbroadcastq %x#$S1,$S1 321362306a36Sopenharmony_ci 321462306a36Sopenharmony_ci jmp .Lblocks_vpmadd52_4x_key_loaded 321562306a36Sopenharmony_ci ud2 321662306a36Sopenharmony_ci 321762306a36Sopenharmony_ci.align 32 321862306a36Sopenharmony_ci.Ldone_init_vpmadd52_2x: 321962306a36Sopenharmony_ci vmovdqu64 $R0,64($ctx) # save key powers 322062306a36Sopenharmony_ci vpsrldq \$8,$R0,$R0 # 0-1-0-2 322162306a36Sopenharmony_ci vmovdqu64 $R1,96($ctx) 322262306a36Sopenharmony_ci vpsrldq \$8,$R1,$R1 322362306a36Sopenharmony_ci vmovdqu64 $R2,128($ctx) 322462306a36Sopenharmony_ci vpsrldq \$8,$R2,$R2 322562306a36Sopenharmony_ci vmovdqu64 $S1,160($ctx) 322662306a36Sopenharmony_ci vpsrldq \$8,$S1,$S1 322762306a36Sopenharmony_ci jmp .Lblocks_vpmadd52_2x_key_loaded 322862306a36Sopenharmony_ci ud2 322962306a36Sopenharmony_ci 323062306a36Sopenharmony_ci.align 32 323162306a36Sopenharmony_ci.Lblocks_vpmadd52_2x_do: 323262306a36Sopenharmony_ci vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 323362306a36Sopenharmony_ci vmovdqu64 160+8($ctx),${S1}{%k1}{z} 323462306a36Sopenharmony_ci vmovdqu64 64+8($ctx),${R0}{%k1}{z} 323562306a36Sopenharmony_ci vmovdqu64 96+8($ctx),${R1}{%k1}{z} 323662306a36Sopenharmony_ci 323762306a36Sopenharmony_ci.Lblocks_vpmadd52_2x_key_loaded: 323862306a36Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 323962306a36Sopenharmony_ci vpxorq $T3,$T3,$T3 324062306a36Sopenharmony_ci lea 16*2($inp),$inp 324162306a36Sopenharmony_ci 
324262306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 324362306a36Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 324462306a36Sopenharmony_ci 324562306a36Sopenharmony_ci # at this point 64-bit lanes are ordered as x-1-x-0 324662306a36Sopenharmony_ci 324762306a36Sopenharmony_ci vpsrlq \$24,$T3,$T2 # splat the data 324862306a36Sopenharmony_ci vporq $PAD,$T2,$T2 324962306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 325062306a36Sopenharmony_ci vpandq $mask44,$T1,$T0 325162306a36Sopenharmony_ci vpsrlq \$44,$T1,$T1 325262306a36Sopenharmony_ci vpsllq \$20,$T3,$T3 325362306a36Sopenharmony_ci vporq $T3,$T1,$T1 325462306a36Sopenharmony_ci vpandq $mask44,$T1,$T1 325562306a36Sopenharmony_ci 325662306a36Sopenharmony_ci jmp .Ltail_vpmadd52_2x 325762306a36Sopenharmony_ci ud2 325862306a36Sopenharmony_ci 325962306a36Sopenharmony_ci.align 32 326062306a36Sopenharmony_ci.Loop_vpmadd52_4x: 326162306a36Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 326262306a36Sopenharmony_ci vpaddq $T0,$H0,$H0 326362306a36Sopenharmony_ci vpaddq $T1,$H1,$H1 326462306a36Sopenharmony_ci 326562306a36Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 326662306a36Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 326762306a36Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 326862306a36Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 326962306a36Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 327062306a36Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 327162306a36Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 327262306a36Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 327362306a36Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 327462306a36Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 327562306a36Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 327662306a36Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 327762306a36Sopenharmony_ci 327862306a36Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 327962306a36Sopenharmony_ci vmovdqu64 16*2($inp),$T3 328062306a36Sopenharmony_ci lea 16*4($inp),$inp 328162306a36Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 
328262306a36Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 328362306a36Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 328462306a36Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 328562306a36Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 328662306a36Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 328762306a36Sopenharmony_ci 328862306a36Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 328962306a36Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 329062306a36Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 329162306a36Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 329262306a36Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 329362306a36Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 329462306a36Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 329562306a36Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 329662306a36Sopenharmony_ci 329762306a36Sopenharmony_ci ################################################################ 329862306a36Sopenharmony_ci # partial reduction (interleaved with data splat) 329962306a36Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 330062306a36Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 330162306a36Sopenharmony_ci vpandq $mask44,$D0lo,$H0 330262306a36Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 330362306a36Sopenharmony_ci 330462306a36Sopenharmony_ci vpsrlq \$24,$T3,$T2 330562306a36Sopenharmony_ci vporq $PAD,$T2,$T2 330662306a36Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 330762306a36Sopenharmony_ci 330862306a36Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 330962306a36Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 331062306a36Sopenharmony_ci vpandq $mask44,$D1lo,$H1 331162306a36Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 331262306a36Sopenharmony_ci 331362306a36Sopenharmony_ci vpandq $mask44,$T1,$T0 331462306a36Sopenharmony_ci vpsrlq \$44,$T1,$T1 331562306a36Sopenharmony_ci vpsllq \$20,$T3,$T3 331662306a36Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 331762306a36Sopenharmony_ci 331862306a36Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 331962306a36Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 332062306a36Sopenharmony_ci vpandq $mask42,$D2lo,$H2 332162306a36Sopenharmony_ci 
vpaddq $tmp,$D2hi,$D2hi 332262306a36Sopenharmony_ci 332362306a36Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 332462306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 332562306a36Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 332662306a36Sopenharmony_ci 332762306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 332862306a36Sopenharmony_ci vporq $T3,$T1,$T1 332962306a36Sopenharmony_ci vpandq $mask44,$T1,$T1 333062306a36Sopenharmony_ci 333162306a36Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 333262306a36Sopenharmony_ci vpandq $mask44,$H0,$H0 333362306a36Sopenharmony_ci 333462306a36Sopenharmony_ci vpaddq $tmp,$H1,$H1 333562306a36Sopenharmony_ci 333662306a36Sopenharmony_ci sub \$4,$len # len-=64 333762306a36Sopenharmony_ci jnz .Loop_vpmadd52_4x 333862306a36Sopenharmony_ci 333962306a36Sopenharmony_ci.Ltail_vpmadd52_4x: 334062306a36Sopenharmony_ci vmovdqu64 128($ctx),$R2 # load all key powers 334162306a36Sopenharmony_ci vmovdqu64 160($ctx),$S1 334262306a36Sopenharmony_ci vmovdqu64 64($ctx),$R0 334362306a36Sopenharmony_ci vmovdqu64 96($ctx),$R1 334462306a36Sopenharmony_ci 334562306a36Sopenharmony_ci.Ltail_vpmadd52_2x: 334662306a36Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 = R2*5*4 334762306a36Sopenharmony_ci vpaddq $R2,$S2,$S2 334862306a36Sopenharmony_ci vpsllq \$2,$S2,$S2 334962306a36Sopenharmony_ci 335062306a36Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 335162306a36Sopenharmony_ci vpaddq $T0,$H0,$H0 335262306a36Sopenharmony_ci vpaddq $T1,$H1,$H1 335362306a36Sopenharmony_ci 335462306a36Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 335562306a36Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 335662306a36Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 335762306a36Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 335862306a36Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 335962306a36Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 336062306a36Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 336162306a36Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 336262306a36Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 
336362306a36Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 336462306a36Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 336562306a36Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 336662306a36Sopenharmony_ci 336762306a36Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 336862306a36Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 336962306a36Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 337062306a36Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 337162306a36Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 337262306a36Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 337362306a36Sopenharmony_ci 337462306a36Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 337562306a36Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 337662306a36Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 337762306a36Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 337862306a36Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 337962306a36Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 338062306a36Sopenharmony_ci 338162306a36Sopenharmony_ci ################################################################ 338262306a36Sopenharmony_ci # horizontal addition 338362306a36Sopenharmony_ci 338462306a36Sopenharmony_ci mov \$1,%eax 338562306a36Sopenharmony_ci kmovw %eax,%k1 338662306a36Sopenharmony_ci vpsrldq \$8,$D0lo,$T0 338762306a36Sopenharmony_ci vpsrldq \$8,$D0hi,$H0 338862306a36Sopenharmony_ci vpsrldq \$8,$D1lo,$T1 338962306a36Sopenharmony_ci vpsrldq \$8,$D1hi,$H1 339062306a36Sopenharmony_ci vpaddq $T0,$D0lo,$D0lo 339162306a36Sopenharmony_ci vpaddq $H0,$D0hi,$D0hi 339262306a36Sopenharmony_ci vpsrldq \$8,$D2lo,$T2 339362306a36Sopenharmony_ci vpsrldq \$8,$D2hi,$H2 339462306a36Sopenharmony_ci vpaddq $T1,$D1lo,$D1lo 339562306a36Sopenharmony_ci vpaddq $H1,$D1hi,$D1hi 339662306a36Sopenharmony_ci vpermq \$0x2,$D0lo,$T0 339762306a36Sopenharmony_ci vpermq \$0x2,$D0hi,$H0 339862306a36Sopenharmony_ci vpaddq $T2,$D2lo,$D2lo 339962306a36Sopenharmony_ci vpaddq $H2,$D2hi,$D2hi 340062306a36Sopenharmony_ci 340162306a36Sopenharmony_ci vpermq \$0x2,$D1lo,$T1 340262306a36Sopenharmony_ci vpermq \$0x2,$D1hi,$H1 
340362306a36Sopenharmony_ci vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 340462306a36Sopenharmony_ci vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 340562306a36Sopenharmony_ci vpermq \$0x2,$D2lo,$T2 340662306a36Sopenharmony_ci vpermq \$0x2,$D2hi,$H2 340762306a36Sopenharmony_ci vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 340862306a36Sopenharmony_ci vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 340962306a36Sopenharmony_ci vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 341062306a36Sopenharmony_ci vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 341162306a36Sopenharmony_ci 341262306a36Sopenharmony_ci ################################################################ 341362306a36Sopenharmony_ci # partial reduction 341462306a36Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 341562306a36Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 341662306a36Sopenharmony_ci vpandq $mask44,$D0lo,$H0 341762306a36Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 341862306a36Sopenharmony_ci 341962306a36Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 342062306a36Sopenharmony_ci 342162306a36Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 342262306a36Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 342362306a36Sopenharmony_ci vpandq $mask44,$D1lo,$H1 342462306a36Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 342562306a36Sopenharmony_ci 342662306a36Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 342762306a36Sopenharmony_ci 342862306a36Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 342962306a36Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 343062306a36Sopenharmony_ci vpandq $mask42,$D2lo,$H2 343162306a36Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 343262306a36Sopenharmony_ci 343362306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 343462306a36Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 343562306a36Sopenharmony_ci 343662306a36Sopenharmony_ci vpaddq $D2hi,$H0,$H0 343762306a36Sopenharmony_ci 343862306a36Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 343962306a36Sopenharmony_ci vpandq $mask44,$H0,$H0 344062306a36Sopenharmony_ci 344162306a36Sopenharmony_ci vpaddq $tmp,$H1,$H1 344262306a36Sopenharmony_ci # at this point $len is 344362306a36Sopenharmony_ci # either 4*n+2 
# or 0...
	sub	\$2,$len	# len-=32
	ja	.Lblocks_vpmadd52_4x_do

	vmovq	%x#$H0,0($ctx)
	vmovq	%x#$H1,8($ctx)
	vmovq	%x#$H2,16($ctx)
	vzeroall

.Lno_data_vpmadd52_4x:
	RET
.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
{
########################################################################
# As implied by its name 8x subroutine processes 8 blocks in parallel...
# This is intermediate version, as it's used only in cases when input
# length is either 8*n, 8*n+1 or 8*n+2...
#
# Outline, as read from the code below:
#  - $len is converted from bytes to 16-byte blocks (shr 4); a zero block
#    count returns immediately through .Lno_data_vpmadd52_8x.
#  - $padbit is shifted to bit 40 and later OR-ed into the top limb of
#    every input block.
#  - If the quadword at 64($ctx) has its sign bit set, control transfers
#    to .Linit_vpmadd52, which is defined outside this block.
#  - The key-power table at 64/96/128/160($ctx) is multiplied by the
#    broadcast low element of that table, giving the "5768" powers; both
#    sets are then interleaved into the 15263748 lane order.
#  - .Loop_vpmadd52_8x consumes 128 bytes per iteration and multiplies
#    the accumulator by the broadcast 8th power ($R0-$R2,$S1,$S2).
#  - .Ltail_vpmadd52_8x multiplies by the per-lane powers ($RR0-$RR2,
#    $SS1,$SS2), sums the lanes, reduces and stores the three limbs at
#    0/8/16($ctx).
#  - NOTE(review): the block counter is decremented by 8 and both exits
#    test for exactly zero; nothing here handles a remainder, so the
#    8*n+1 / 8*n+2 cases mentioned above are presumably dealt with by the
#    caller -- TODO confirm.
#
# Register map: $H* accumulator limbs, $R*/$S* multiplier in use, $RR*/$SS*
# per-lane powers, $D*lo/$D*hi low/high halves of the 52-bit products,
# $T* scratch and input data.

my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));

$code.=<<___;
.type	poly1305_blocks_vpmadd52_8x,\@function,4
.align	32
poly1305_blocks_vpmadd52_8x:
	shr	\$4,$len	# bytes -> 16-byte blocks
	jz	.Lno_data_vpmadd52_8x	# too short

	shl	\$40,$padbit	# pad bit lands at bit 40 of the top limb
	mov	64($ctx),%r8	# peek on power of the key

	vmovdqa64	.Lx_mask44(%rip),$mask44
	vmovdqa64	.Lx_mask42(%rip),$mask42

	test	%r8,%r8	# is power value impossible?
	js	.Linit_vpmadd52	# if it is, then init R[4]

	vmovq	0($ctx),%x#$H0	# load current hash value
	vmovq	8($ctx),%x#$H1
	vmovq	16($ctx),%x#$H2

.Lblocks_vpmadd52_8x:
	################################################################
	# first we calculate more key powers

	vmovdqu64	128($ctx),$R2	# load 1-3-2-4 powers
	vmovdqu64	160($ctx),$S1
	vmovdqu64	64($ctx),$R0
	vmovdqu64	96($ctx),$R1

	vpsllq	\$2,$R2,$S2	# S2 = R2*5*4
	vpaddq	$R2,$S2,$S2
	vpsllq	\$2,$S2,$S2

	vpbroadcastq	%x#$R2,$RR2	# broadcast 4th power
	vpbroadcastq	%x#$R0,$RR0
	vpbroadcastq	%x#$R1,$RR1

	vpxorq	$D0lo,$D0lo,$D0lo
	vpmadd52luq	$RR2,$S1,$D0lo
	vpxorq	$D0hi,$D0hi,$D0hi
	vpmadd52huq	$RR2,$S1,$D0hi
	vpxorq	$D1lo,$D1lo,$D1lo
	vpmadd52luq	$RR2,$S2,$D1lo
	vpxorq	$D1hi,$D1hi,$D1hi
	vpmadd52huq	$RR2,$S2,$D1hi
	vpxorq	$D2lo,$D2lo,$D2lo
	vpmadd52luq	$RR2,$R0,$D2lo
	vpxorq	$D2hi,$D2hi,$D2hi
	vpmadd52huq	$RR2,$R0,$D2hi

	vpmadd52luq	$RR0,$R0,$D0lo
	vpmadd52huq	$RR0,$R0,$D0hi
	vpmadd52luq	$RR0,$R1,$D1lo
	vpmadd52huq	$RR0,$R1,$D1hi
	vpmadd52luq	$RR0,$R2,$D2lo
	vpmadd52huq	$RR0,$R2,$D2hi

	vpmadd52luq	$RR1,$S2,$D0lo
	vpmadd52huq	$RR1,$S2,$D0hi
	vpmadd52luq	$RR1,$R0,$D1lo
	vpmadd52huq	$RR1,$R0,$D1hi
	vpmadd52luq	$RR1,$R1,$D2lo
	vpmadd52huq	$RR1,$R1,$D2hi

	################################################################
	# partial reduction
	vpsrlq	\$44,$D0lo,$tmp
	vpsllq	\$8,$D0hi,$D0hi
	vpandq	$mask44,$D0lo,$RR0
	vpaddq	$tmp,$D0hi,$D0hi

	vpaddq	$D0hi,$D1lo,$D1lo

	vpsrlq	\$44,$D1lo,$tmp
	vpsllq	\$8,$D1hi,$D1hi
	vpandq	$mask44,$D1lo,$RR1
	vpaddq	$tmp,$D1hi,$D1hi

	vpaddq	$D1hi,$D2lo,$D2lo

	vpsrlq	\$42,$D2lo,$tmp
	vpsllq	\$10,$D2hi,$D2hi
	vpandq	$mask42,$D2lo,$RR2
	vpaddq	$tmp,$D2hi,$D2hi

	vpaddq	$D2hi,$RR0,$RR0	# carry out of the top limb is folded
	vpsllq	\$2,$D2hi,$D2hi	# back into limb 0 as carry + 4*carry

	vpaddq	$D2hi,$RR0,$RR0

	vpsrlq	\$44,$RR0,$tmp	# additional step
	vpandq	$mask44,$RR0,$RR0

	vpaddq	$tmp,$RR1,$RR1

	################################################################
	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
	# is 15263748, which reflects how data is loaded...

	vpunpcklqdq	$R2,$RR2,$T2	# 3748
	vpunpckhqdq	$R2,$RR2,$R2	# 1526
	vpunpcklqdq	$R0,$RR0,$T0
	vpunpckhqdq	$R0,$RR0,$R0
	vpunpcklqdq	$R1,$RR1,$T1
	vpunpckhqdq	$R1,$RR1,$R1
___
######## switch to %zmm
# The register-name strings are rewritten in place, so every later use of
# these variables in the heredocs below emits the 512-bit register.
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);

$code.=<<___;
	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
	vshufi64x2	\$0x44,$R0,$T0,$RR0
	vshufi64x2	\$0x44,$R1,$T1,$RR1

	vmovdqu64	16*0($inp),$T2	# load data
	vmovdqu64	16*4($inp),$T3
	lea	16*8($inp),$inp

	vpsllq	\$2,$RR2,$SS2	# S2 = R2*5*4
	vpsllq	\$2,$RR1,$SS1	# S1 = R1*5*4
	vpaddq	$RR2,$SS2,$SS2
	vpaddq	$RR1,$SS1,$SS1
	vpsllq	\$2,$SS2,$SS2
	vpsllq	\$2,$SS1,$SS1

	vpbroadcastq	$padbit,$PAD
	vpbroadcastq	%x#$mask44,$mask44
	vpbroadcastq	%x#$mask42,$mask42

	vpbroadcastq	%x#$SS1,$S1	# broadcast 8th power
	vpbroadcastq	%x#$SS2,$S2
	vpbroadcastq	%x#$RR0,$R0
	vpbroadcastq	%x#$RR1,$R1
	vpbroadcastq	%x#$RR2,$R2

	vpunpcklqdq	$T3,$T2,$T1	# transpose data
	vpunpckhqdq	$T3,$T2,$T3

	# at this point 64-bit lanes are ordered as 73625140

	vpsrlq	\$24,$T3,$T2	# splat the data
	vporq	$PAD,$T2,$T2
	vpaddq	$T2,$H2,$H2	# accumulate input
	vpandq	$mask44,$T1,$T0
	vpsrlq	\$44,$T1,$T1
	vpsllq	\$20,$T3,$T3
	vporq	$T3,$T1,$T1
	vpandq	$mask44,$T1,$T1

	sub	\$8,$len
	jz	.Ltail_vpmadd52_8x
	jmp	.Loop_vpmadd52_8x

.align	32
.Loop_vpmadd52_8x:
	#vpaddq	$T2,$H2,$H2	# accumulate input
	vpaddq	$T0,$H0,$H0
	vpaddq	$T1,$H1,$H1

	vpxorq	$D0lo,$D0lo,$D0lo
	vpmadd52luq	$H2,$S1,$D0lo
	vpxorq	$D0hi,$D0hi,$D0hi
	vpmadd52huq	$H2,$S1,$D0hi
	vpxorq	$D1lo,$D1lo,$D1lo
	vpmadd52luq	$H2,$S2,$D1lo
	vpxorq	$D1hi,$D1hi,$D1hi
	vpmadd52huq	$H2,$S2,$D1hi
	vpxorq	$D2lo,$D2lo,$D2lo
	vpmadd52luq	$H2,$R0,$D2lo
	vpxorq	$D2hi,$D2hi,$D2hi
	vpmadd52huq	$H2,$R0,$D2hi

	vmovdqu64	16*0($inp),$T2	# load data
	vmovdqu64	16*4($inp),$T3
	lea	16*8($inp),$inp
	vpmadd52luq	$H0,$R0,$D0lo
	vpmadd52huq	$H0,$R0,$D0hi
	vpmadd52luq	$H0,$R1,$D1lo
	vpmadd52huq	$H0,$R1,$D1hi
	vpmadd52luq	$H0,$R2,$D2lo
	vpmadd52huq	$H0,$R2,$D2hi

	vpunpcklqdq	$T3,$T2,$T1	# transpose data
	vpunpckhqdq	$T3,$T2,$T3
	vpmadd52luq	$H1,$S2,$D0lo
	vpmadd52huq	$H1,$S2,$D0hi
	vpmadd52luq	$H1,$R0,$D1lo
	vpmadd52huq	$H1,$R0,$D1hi
	vpmadd52luq	$H1,$R1,$D2lo
	vpmadd52huq	$H1,$R1,$D2hi

	################################################################
	# partial reduction (interleaved with data splat)
	vpsrlq	\$44,$D0lo,$tmp
	vpsllq	\$8,$D0hi,$D0hi
	vpandq	$mask44,$D0lo,$H0
	vpaddq	$tmp,$D0hi,$D0hi

	vpsrlq	\$24,$T3,$T2
	vporq	$PAD,$T2,$T2
	vpaddq	$D0hi,$D1lo,$D1lo

	vpsrlq	\$44,$D1lo,$tmp
	vpsllq	\$8,$D1hi,$D1hi
	vpandq	$mask44,$D1lo,$H1
	vpaddq	$tmp,$D1hi,$D1hi

	vpandq	$mask44,$T1,$T0
	vpsrlq	\$44,$T1,$T1
	vpsllq	\$20,$T3,$T3
	vpaddq	$D1hi,$D2lo,$D2lo

	vpsrlq	\$42,$D2lo,$tmp
	vpsllq	\$10,$D2hi,$D2hi
	vpandq	$mask42,$D2lo,$H2
	vpaddq	$tmp,$D2hi,$D2hi

	vpaddq	$T2,$H2,$H2	# accumulate input
	vpaddq	$D2hi,$H0,$H0
	vpsllq	\$2,$D2hi,$D2hi

	vpaddq	$D2hi,$H0,$H0
	vporq	$T3,$T1,$T1
	vpandq	$mask44,$T1,$T1

	vpsrlq	\$44,$H0,$tmp	# additional step
	vpandq	$mask44,$H0,$H0

	vpaddq	$tmp,$H1,$H1

	sub	\$8,$len	# len-=128
	jnz	.Loop_vpmadd52_8x

.Ltail_vpmadd52_8x:
	#vpaddq	$T2,$H2,$H2	# accumulate input
	vpaddq	$T0,$H0,$H0
	vpaddq	$T1,$H1,$H1

	vpxorq	$D0lo,$D0lo,$D0lo
	vpmadd52luq	$H2,$SS1,$D0lo
	vpxorq	$D0hi,$D0hi,$D0hi
	vpmadd52huq	$H2,$SS1,$D0hi
	vpxorq	$D1lo,$D1lo,$D1lo
	vpmadd52luq	$H2,$SS2,$D1lo
	vpxorq	$D1hi,$D1hi,$D1hi
	vpmadd52huq	$H2,$SS2,$D1hi
	vpxorq	$D2lo,$D2lo,$D2lo
	vpmadd52luq	$H2,$RR0,$D2lo
	vpxorq	$D2hi,$D2hi,$D2hi
	vpmadd52huq	$H2,$RR0,$D2hi

	vpmadd52luq	$H0,$RR0,$D0lo
	vpmadd52huq	$H0,$RR0,$D0hi
	vpmadd52luq	$H0,$RR1,$D1lo
	vpmadd52huq	$H0,$RR1,$D1hi
	vpmadd52luq	$H0,$RR2,$D2lo
	vpmadd52huq	$H0,$RR2,$D2hi

	vpmadd52luq	$H1,$SS2,$D0lo
	vpmadd52huq	$H1,$SS2,$D0hi
	vpmadd52luq	$H1,$RR0,$D1lo
	vpmadd52huq	$H1,$RR0,$D1hi
	vpmadd52luq	$H1,$RR1,$D2lo
	vpmadd52huq	$H1,$RR1,$D2hi

	################################################################
	# horizontal addition

	mov	\$1,%eax
	kmovw	%eax,%k1	# k1 = 1, used below to keep only lane 0
	vpsrldq	\$8,$D0lo,$T0
	vpsrldq	\$8,$D0hi,$H0
	vpsrldq	\$8,$D1lo,$T1
	vpsrldq	\$8,$D1hi,$H1
	vpaddq	$T0,$D0lo,$D0lo
	vpaddq	$H0,$D0hi,$D0hi
	vpsrldq	\$8,$D2lo,$T2
	vpsrldq	\$8,$D2hi,$H2
	vpaddq	$T1,$D1lo,$D1lo
	vpaddq	$H1,$D1hi,$D1hi
	vpermq	\$0x2,$D0lo,$T0
	vpermq	\$0x2,$D0hi,$H0
	vpaddq	$T2,$D2lo,$D2lo
	vpaddq	$H2,$D2hi,$D2hi

	vpermq	\$0x2,$D1lo,$T1
	vpermq	\$0x2,$D1hi,$H1
	vpaddq	$T0,$D0lo,$D0lo
	vpaddq	$H0,$D0hi,$D0hi
	vpermq	\$0x2,$D2lo,$T2
	vpermq	\$0x2,$D2hi,$H2
	vpaddq	$T1,$D1lo,$D1lo
	vpaddq	$H1,$D1hi,$D1hi
	vextracti64x4	\$1,$D0lo,%y#$T0
	vextracti64x4	\$1,$D0hi,%y#$H0
	vpaddq	$T2,$D2lo,$D2lo
	vpaddq	$H2,$D2hi,$D2hi

	vextracti64x4	\$1,$D1lo,%y#$T1
	vextracti64x4	\$1,$D1hi,%y#$H1
	vextracti64x4	\$1,$D2lo,%y#$T2
	vextracti64x4	\$1,$D2hi,%y#$H2
___
######## switch back to %ymm
# $RR*/$SS* are left as %zmm names; they are not referenced again below.
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);

$code.=<<___;
	vpaddq	$T0,$D0lo,${D0lo}{%k1}{z}
	vpaddq	$H0,$D0hi,${D0hi}{%k1}{z}
	vpaddq	$T1,$D1lo,${D1lo}{%k1}{z}
	vpaddq	$H1,$D1hi,${D1hi}{%k1}{z}
	vpaddq	$T2,$D2lo,${D2lo}{%k1}{z}
	vpaddq	$H2,$D2hi,${D2hi}{%k1}{z}

	################################################################
	# partial reduction
	vpsrlq	\$44,$D0lo,$tmp
	vpsllq	\$8,$D0hi,$D0hi
	vpandq	$mask44,$D0lo,$H0
	vpaddq	$tmp,$D0hi,$D0hi

	vpaddq	$D0hi,$D1lo,$D1lo

	vpsrlq	\$44,$D1lo,$tmp
	vpsllq	\$8,$D1hi,$D1hi
	vpandq	$mask44,$D1lo,$H1
	vpaddq	$tmp,$D1hi,$D1hi

	vpaddq	$D1hi,$D2lo,$D2lo

	vpsrlq	\$42,$D2lo,$tmp
	vpsllq	\$10,$D2hi,$D2hi
	vpandq	$mask42,$D2lo,$H2
	vpaddq	$tmp,$D2hi,$D2hi

	vpaddq	$D2hi,$H0,$H0
	vpsllq	\$2,$D2hi,$D2hi

	vpaddq	$D2hi,$H0,$H0

	vpsrlq	\$44,$H0,$tmp	# additional step
	vpandq	$mask44,$H0,$H0

	vpaddq	$tmp,$H1,$H1

	################################################################

	vmovq	%x#$H0,0($ctx)	# store the three hash limbs
	vmovq	%x#$H1,8($ctx)
	vmovq	%x#$H2,16($ctx)
	vzeroall

.Lno_data_vpmadd52_8x:
	RET
.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
########################################################################
# poly1305_emit_base2_44 reads the hash stored at 0/8/16($ctx) as three
# limbs with weights 2^0, 2^44 and 2^88 (see the shift counts), rebuilds
# the plain multi-word value, replaces it with value+5 when that sum
# reaches bit 130, then adds the 128-bit nonce and writes 16 bytes to
# $mac.
$code.=<<___;
.type	poly1305_emit_base2_44,\@function,3
.align	32
poly1305_emit_base2_44:
	mov	0($ctx),%r8	# load hash value
# limbs 1 and 2 of the hash follow; limb 0 was loaded into r8 just above
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r9,%rax
	shr	\$20,%r9	# limb 1 contributes bits 44 and up: high part
	shl	\$44,%rax	# ... and low part
	mov	%r10,%rcx
	shr	\$40,%r10	# limb 2 contributes bits 88 and up: high part
	shl	\$24,%rcx	# ... and low part

	add	%rax,%r8	# r10:r9:r8 = plain little-endian hash value
	adc	%rcx,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8	# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax	# if so, keep the low 128 bits of value+5
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
}	}	}
}

if (!$kernel)
{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ?
		("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order
# xor128_encrypt_n_pad(out, inp, otp, len):
#  - XORs len bytes of inp with otp; the result is written both to out
#    and back into otp.
#  - Whole 16-byte chunks use SSE2; the otp buffer is accessed with pxor
#    from memory and movdqa, so it has to be 16-byte aligned, while inp
#    and out are accessed with movdqu.
#  - A trailing partial chunk is processed byte by byte, after which otp
#    is zero-filled up to the next 16-byte boundary.
#  - Returns (in %rax) the otp pointer advanced past everything written.
#  - inp and out are first turned into offsets relative to otp, so a
#    single advancing pointer indexes all three buffers.
#  - NOTE(review): len == 0 is not handled -- the byte loop would start
#    with a zero counter and wrap. Presumably callers never pass 0;
#    confirm before reusing this helper.
$code.=<<___;
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,\@abi-omnipotent
.align	16
xor128_encrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10	# put len aside
	shr	\$4,$len	# len / 16
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	($inp,$otp),%xmm0
	pxor	($otp),%xmm0
	movdqu	%xmm0,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_enc_xmm

	and	\$15,%r10	# len % 16
	jz	.Ldone_enc

.Ltail_enc:
	mov	\$16,$len
	sub	%r10,$len	# number of pad bytes = 16 - remainder
	xor	%eax,%eax
.Loop_enc_byte:
	mov	($inp,$otp),%al
	xor	($otp),%al
	mov	%al,($out,$otp)
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_enc_byte

	xor	%eax,%eax
.Loop_enc_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_enc_pad

.Ldone_enc:
	mov	$otp,%rax	# return the advanced otp pointer
	RET
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

# xor128_decrypt_n_pad takes the same arguments as xor128_encrypt_n_pad.
# Here out receives the XOR of inp and otp, while the otp buffer is
# overwritten with the unmodified input bytes. The otp buffer is accessed
# with movdqa, so it has to be 16-byte aligned.
# NOTE(review): as in the encrypt helper, a zero length is not handled.
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,\@abi-omnipotent
.align	16
xor128_decrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10	# put len aside
	shr	\$4,$len	# len / 16
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	($inp,$otp),%xmm0
	movdqa	($otp),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1	# clear the register that held the last output chunk
	and	\$15,%r10	# len % 16
	jz	.Ldone_dec

.Ltail_dec:
	mov	\$16,$len
	sub	%r10,$len	# number of pad bytes = 16 - remainder
	xor	%eax,%eax
	xor	%r11d,%r11d
.Loop_dec_byte:
	mov	($inp,$otp),%r11b
	mov	($otp),%al
	xor	%r11b,%al
	mov	%al,($out,$otp)
	mov	%r11b,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_dec_byte

	xor	%eax,%eax
396162306a36Sopenharmony_ci.Loop_dec_pad: 396262306a36Sopenharmony_ci mov %al,($otp) 396362306a36Sopenharmony_ci lea 1($otp),$otp 396462306a36Sopenharmony_ci dec $len 396562306a36Sopenharmony_ci jnz .Loop_dec_pad 396662306a36Sopenharmony_ci 396762306a36Sopenharmony_ci.Ldone_dec: 396862306a36Sopenharmony_ci mov $otp,%rax 396962306a36Sopenharmony_ci RET 397062306a36Sopenharmony_ci.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 397162306a36Sopenharmony_ci___ 397262306a36Sopenharmony_ci} 397362306a36Sopenharmony_ci 397462306a36Sopenharmony_ci# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 397562306a36Sopenharmony_ci# CONTEXT *context,DISPATCHER_CONTEXT *disp) 397662306a36Sopenharmony_ciif ($win64) { 397762306a36Sopenharmony_ci$rec="%rcx"; 397862306a36Sopenharmony_ci$frame="%rdx"; 397962306a36Sopenharmony_ci$context="%r8"; 398062306a36Sopenharmony_ci$disp="%r9"; 398162306a36Sopenharmony_ci 398262306a36Sopenharmony_ci$code.=<<___; 398362306a36Sopenharmony_ci.extern __imp_RtlVirtualUnwind 398462306a36Sopenharmony_ci.type se_handler,\@abi-omnipotent 398562306a36Sopenharmony_ci.align 16 398662306a36Sopenharmony_cise_handler: 398762306a36Sopenharmony_ci push %rsi 398862306a36Sopenharmony_ci push %rdi 398962306a36Sopenharmony_ci push %rbx 399062306a36Sopenharmony_ci push %rbp 399162306a36Sopenharmony_ci push %r12 399262306a36Sopenharmony_ci push %r13 399362306a36Sopenharmony_ci push %r14 399462306a36Sopenharmony_ci push %r15 399562306a36Sopenharmony_ci pushfq 399662306a36Sopenharmony_ci sub \$64,%rsp 399762306a36Sopenharmony_ci 399862306a36Sopenharmony_ci mov 120($context),%rax # pull context->Rax 399962306a36Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 400062306a36Sopenharmony_ci 400162306a36Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 400262306a36Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 400362306a36Sopenharmony_ci 400462306a36Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 400562306a36Sopenharmony_ci lea 
(%rsi,%r10),%r10 # prologue label 400662306a36Sopenharmony_ci cmp %r10,%rbx # context->Rip<.Lprologue 400762306a36Sopenharmony_ci jb .Lcommon_seh_tail 400862306a36Sopenharmony_ci 400962306a36Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 401062306a36Sopenharmony_ci 401162306a36Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 401262306a36Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 401362306a36Sopenharmony_ci cmp %r10,%rbx # context->Rip>=.Lepilogue 401462306a36Sopenharmony_ci jae .Lcommon_seh_tail 401562306a36Sopenharmony_ci 401662306a36Sopenharmony_ci lea 48(%rax),%rax 401762306a36Sopenharmony_ci 401862306a36Sopenharmony_ci mov -8(%rax),%rbx 401962306a36Sopenharmony_ci mov -16(%rax),%rbp 402062306a36Sopenharmony_ci mov -24(%rax),%r12 402162306a36Sopenharmony_ci mov -32(%rax),%r13 402262306a36Sopenharmony_ci mov -40(%rax),%r14 402362306a36Sopenharmony_ci mov -48(%rax),%r15 402462306a36Sopenharmony_ci mov %rbx,144($context) # restore context->Rbx 402562306a36Sopenharmony_ci mov %rbp,160($context) # restore context->Rbp 402662306a36Sopenharmony_ci mov %r12,216($context) # restore context->R12 402762306a36Sopenharmony_ci mov %r13,224($context) # restore context->R13 402862306a36Sopenharmony_ci mov %r14,232($context) # restore context->R14 402962306a36Sopenharmony_ci mov %r15,240($context) # restore context->R14 403062306a36Sopenharmony_ci 403162306a36Sopenharmony_ci jmp .Lcommon_seh_tail 403262306a36Sopenharmony_ci.size se_handler,.-se_handler 403362306a36Sopenharmony_ci 403462306a36Sopenharmony_ci.type avx_handler,\@abi-omnipotent 403562306a36Sopenharmony_ci.align 16 403662306a36Sopenharmony_ciavx_handler: 403762306a36Sopenharmony_ci push %rsi 403862306a36Sopenharmony_ci push %rdi 403962306a36Sopenharmony_ci push %rbx 404062306a36Sopenharmony_ci push %rbp 404162306a36Sopenharmony_ci push %r12 404262306a36Sopenharmony_ci push %r13 404362306a36Sopenharmony_ci push %r14 404462306a36Sopenharmony_ci push %r15 404562306a36Sopenharmony_ci pushfq 
404662306a36Sopenharmony_ci sub \$64,%rsp 404762306a36Sopenharmony_ci 404862306a36Sopenharmony_ci mov 120($context),%rax # pull context->Rax 404962306a36Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 405062306a36Sopenharmony_ci 405162306a36Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 405262306a36Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 405362306a36Sopenharmony_ci 405462306a36Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 405562306a36Sopenharmony_ci lea (%rsi,%r10),%r10 # prologue label 405662306a36Sopenharmony_ci cmp %r10,%rbx # context->Rip<prologue label 405762306a36Sopenharmony_ci jb .Lcommon_seh_tail 405862306a36Sopenharmony_ci 405962306a36Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 406062306a36Sopenharmony_ci 406162306a36Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 406262306a36Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 406362306a36Sopenharmony_ci cmp %r10,%rbx # context->Rip>=epilogue label 406462306a36Sopenharmony_ci jae .Lcommon_seh_tail 406562306a36Sopenharmony_ci 406662306a36Sopenharmony_ci mov 208($context),%rax # pull context->R11 406762306a36Sopenharmony_ci 406862306a36Sopenharmony_ci lea 0x50(%rax),%rsi 406962306a36Sopenharmony_ci lea 0xf8(%rax),%rax 407062306a36Sopenharmony_ci lea 512($context),%rdi # &context.Xmm6 407162306a36Sopenharmony_ci mov \$20,%ecx 407262306a36Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 407362306a36Sopenharmony_ci 407462306a36Sopenharmony_ci.Lcommon_seh_tail: 407562306a36Sopenharmony_ci mov 8(%rax),%rdi 407662306a36Sopenharmony_ci mov 16(%rax),%rsi 407762306a36Sopenharmony_ci mov %rax,152($context) # restore context->Rsp 407862306a36Sopenharmony_ci mov %rsi,168($context) # restore context->Rsi 407962306a36Sopenharmony_ci mov %rdi,176($context) # restore context->Rdi 408062306a36Sopenharmony_ci 408162306a36Sopenharmony_ci mov 40($disp),%rdi # disp->ContextRecord 408262306a36Sopenharmony_ci mov $context,%rsi # context 408362306a36Sopenharmony_ci mov 
\$154,%ecx # sizeof(CONTEXT) 408462306a36Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 408562306a36Sopenharmony_ci 408662306a36Sopenharmony_ci mov $disp,%rsi 408762306a36Sopenharmony_ci xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER 408862306a36Sopenharmony_ci mov 8(%rsi),%rdx # arg2, disp->ImageBase 408962306a36Sopenharmony_ci mov 0(%rsi),%r8 # arg3, disp->ControlPc 409062306a36Sopenharmony_ci mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 409162306a36Sopenharmony_ci mov 40(%rsi),%r10 # disp->ContextRecord 409262306a36Sopenharmony_ci lea 56(%rsi),%r11 # &disp->HandlerData 409362306a36Sopenharmony_ci lea 24(%rsi),%r12 # &disp->EstablisherFrame 409462306a36Sopenharmony_ci mov %r10,32(%rsp) # arg5 409562306a36Sopenharmony_ci mov %r11,40(%rsp) # arg6 409662306a36Sopenharmony_ci mov %r12,48(%rsp) # arg7 409762306a36Sopenharmony_ci mov %rcx,56(%rsp) # arg8, (NULL) 409862306a36Sopenharmony_ci call *__imp_RtlVirtualUnwind(%rip) 409962306a36Sopenharmony_ci 410062306a36Sopenharmony_ci mov \$1,%eax # ExceptionContinueSearch 410162306a36Sopenharmony_ci add \$64,%rsp 410262306a36Sopenharmony_ci popfq 410362306a36Sopenharmony_ci pop %r15 410462306a36Sopenharmony_ci pop %r14 410562306a36Sopenharmony_ci pop %r13 410662306a36Sopenharmony_ci pop %r12 410762306a36Sopenharmony_ci pop %rbp 410862306a36Sopenharmony_ci pop %rbx 410962306a36Sopenharmony_ci pop %rdi 411062306a36Sopenharmony_ci pop %rsi 411162306a36Sopenharmony_ci RET 411262306a36Sopenharmony_ci.size avx_handler,.-avx_handler 411362306a36Sopenharmony_ci 411462306a36Sopenharmony_ci.section .pdata 411562306a36Sopenharmony_ci.align 4 411662306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_init_x86_64 411762306a36Sopenharmony_ci .rva .LSEH_end_poly1305_init_x86_64 411862306a36Sopenharmony_ci .rva .LSEH_info_poly1305_init_x86_64 411962306a36Sopenharmony_ci 412062306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_blocks_x86_64 412162306a36Sopenharmony_ci .rva .LSEH_end_poly1305_blocks_x86_64 412262306a36Sopenharmony_ci .rva 
.LSEH_info_poly1305_blocks_x86_64 412362306a36Sopenharmony_ci 412462306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_emit_x86_64 412562306a36Sopenharmony_ci .rva .LSEH_end_poly1305_emit_x86_64 412662306a36Sopenharmony_ci .rva .LSEH_info_poly1305_emit_x86_64 412762306a36Sopenharmony_ci___ 412862306a36Sopenharmony_ci$code.=<<___ if ($avx); 412962306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_blocks_avx 413062306a36Sopenharmony_ci .rva .Lbase2_64_avx 413162306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx_1 413262306a36Sopenharmony_ci 413362306a36Sopenharmony_ci .rva .Lbase2_64_avx 413462306a36Sopenharmony_ci .rva .Leven_avx 413562306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx_2 413662306a36Sopenharmony_ci 413762306a36Sopenharmony_ci .rva .Leven_avx 413862306a36Sopenharmony_ci .rva .LSEH_end_poly1305_blocks_avx 413962306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx_3 414062306a36Sopenharmony_ci 414162306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_emit_avx 414262306a36Sopenharmony_ci .rva .LSEH_end_poly1305_emit_avx 414362306a36Sopenharmony_ci .rva .LSEH_info_poly1305_emit_avx 414462306a36Sopenharmony_ci___ 414562306a36Sopenharmony_ci$code.=<<___ if ($avx>1); 414662306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_blocks_avx2 414762306a36Sopenharmony_ci .rva .Lbase2_64_avx2 414862306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx2_1 414962306a36Sopenharmony_ci 415062306a36Sopenharmony_ci .rva .Lbase2_64_avx2 415162306a36Sopenharmony_ci .rva .Leven_avx2 415262306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx2_2 415362306a36Sopenharmony_ci 415462306a36Sopenharmony_ci .rva .Leven_avx2 415562306a36Sopenharmony_ci .rva .LSEH_end_poly1305_blocks_avx2 415662306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx2_3 415762306a36Sopenharmony_ci___ 415862306a36Sopenharmony_ci$code.=<<___ if ($avx>2); 415962306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_blocks_avx512 416062306a36Sopenharmony_ci .rva .LSEH_end_poly1305_blocks_avx512 
416162306a36Sopenharmony_ci .rva .LSEH_info_poly1305_blocks_avx512 416262306a36Sopenharmony_ci___ 416362306a36Sopenharmony_ci$code.=<<___; 416462306a36Sopenharmony_ci.section .xdata 416562306a36Sopenharmony_ci.align 8 416662306a36Sopenharmony_ci.LSEH_info_poly1305_init_x86_64: 416762306a36Sopenharmony_ci .byte 9,0,0,0 416862306a36Sopenharmony_ci .rva se_handler 416962306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 417062306a36Sopenharmony_ci 417162306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_x86_64: 417262306a36Sopenharmony_ci .byte 9,0,0,0 417362306a36Sopenharmony_ci .rva se_handler 417462306a36Sopenharmony_ci .rva .Lblocks_body,.Lblocks_epilogue 417562306a36Sopenharmony_ci 417662306a36Sopenharmony_ci.LSEH_info_poly1305_emit_x86_64: 417762306a36Sopenharmony_ci .byte 9,0,0,0 417862306a36Sopenharmony_ci .rva se_handler 417962306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 418062306a36Sopenharmony_ci___ 418162306a36Sopenharmony_ci$code.=<<___ if ($avx); 418262306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_1: 418362306a36Sopenharmony_ci .byte 9,0,0,0 418462306a36Sopenharmony_ci .rva se_handler 418562306a36Sopenharmony_ci .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 418662306a36Sopenharmony_ci 418762306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_2: 418862306a36Sopenharmony_ci .byte 9,0,0,0 418962306a36Sopenharmony_ci .rva se_handler 419062306a36Sopenharmony_ci .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 419162306a36Sopenharmony_ci 419262306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_3: 419362306a36Sopenharmony_ci .byte 9,0,0,0 419462306a36Sopenharmony_ci .rva avx_handler 419562306a36Sopenharmony_ci .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 419662306a36Sopenharmony_ci 419762306a36Sopenharmony_ci.LSEH_info_poly1305_emit_avx: 419862306a36Sopenharmony_ci .byte 9,0,0,0 419962306a36Sopenharmony_ci .rva se_handler 
420062306a36Sopenharmony_ci .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 420162306a36Sopenharmony_ci___ 420262306a36Sopenharmony_ci$code.=<<___ if ($avx>1); 420362306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_1: 420462306a36Sopenharmony_ci .byte 9,0,0,0 420562306a36Sopenharmony_ci .rva se_handler 420662306a36Sopenharmony_ci .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 420762306a36Sopenharmony_ci 420862306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_2: 420962306a36Sopenharmony_ci .byte 9,0,0,0 421062306a36Sopenharmony_ci .rva se_handler 421162306a36Sopenharmony_ci .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 421262306a36Sopenharmony_ci 421362306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_3: 421462306a36Sopenharmony_ci .byte 9,0,0,0 421562306a36Sopenharmony_ci .rva avx_handler 421662306a36Sopenharmony_ci .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 421762306a36Sopenharmony_ci___ 421862306a36Sopenharmony_ci$code.=<<___ if ($avx>2); 421962306a36Sopenharmony_ci.LSEH_info_poly1305_blocks_avx512: 422062306a36Sopenharmony_ci .byte 9,0,0,0 422162306a36Sopenharmony_ci .rva avx_handler 422262306a36Sopenharmony_ci .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 422362306a36Sopenharmony_ci___ 422462306a36Sopenharmony_ci} 422562306a36Sopenharmony_ci 422662306a36Sopenharmony_ciopen SELF,$0; 422762306a36Sopenharmony_ciwhile(<SELF>) { 422862306a36Sopenharmony_ci next if (/^#!/); 422962306a36Sopenharmony_ci last if (!s/^#/\/\// and !/^$/); 423062306a36Sopenharmony_ci print; 423162306a36Sopenharmony_ci} 423262306a36Sopenharmony_ciclose SELF; 423362306a36Sopenharmony_ci 423462306a36Sopenharmony_ciforeach (split('\n',$code)) { 423562306a36Sopenharmony_ci s/\`([^\`]*)\`/eval($1)/ge; 423662306a36Sopenharmony_ci s/%r([a-z]+)#d/%e$1/g; 423762306a36Sopenharmony_ci s/%r([0-9]+)#d/%r$1d/g; 423862306a36Sopenharmony_ci s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 423962306a36Sopenharmony_ci 
424062306a36Sopenharmony_ci if ($kernel) { 424162306a36Sopenharmony_ci s/(^\.type.*),[0-9]+$/\1/; 424262306a36Sopenharmony_ci s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; 424362306a36Sopenharmony_ci next if /^\.cfi.*/; 424462306a36Sopenharmony_ci } 424562306a36Sopenharmony_ci 424662306a36Sopenharmony_ci print $_,"\n"; 424762306a36Sopenharmony_ci} 424862306a36Sopenharmony_ciclose STDOUT; 4249