#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
# P4            4.46/+120%      -
# Core 2        2.41/+90%       -
# Westmere      1.88/+120%      -
# Sandy Bridge  1.39/+140%      1.10
# Haswell       1.14/+175%      1.11            0.65
# Skylake[-X]   1.13/+120%      0.96            0.51    [0.35]
# Silvermont    2.83/+95%       -
# Knights L     3.60/?          1.65            1.10    0.41(***)
# Goldmont      1.70/+180%      -
# VIA Nano      1.82/+150%      -
# Sledgehammer  1.38/+160%      -
# Bulldozer     2.30/+130%      0.97
# Ryzen         1.15/+200%      1.08            1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

# Command line: [flavour] [output].  A first argument containing a dot is
# taken to be the output file name, in which case no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# With neither flavour nor output the script runs in "kernel" mode: nothing
# is piped through x86_64-xlate.pl and $avx is forced below.
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Look for the translator next to this script, then in ../../perlasm.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything printed to STDOUT from here on is fed to the translator.
	# A failed pipe open must be fatal, otherwise the script would run to
	# completion and silently produce no output.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# $avx is a capability level derived from the version the assembler
	# reports; each satisfied threshold adds one.  The probes are tried in
	# order and the first one that yields a non-zero level wins.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

# declare_function($name, $align, $nargs)
#
# Appends the opening of a global function to $code.  In kernel mode that is
# an .align directive, SYM_FUNC_START($name) and a local ".L$name" label;
# otherwise a .globl/.type/.align triple followed by the "$name:" label, with
# $nargs carried as the third field of the .type directive.
#
# No prototype is declared: the sub takes three arguments, so an empty "()"
# prototype would reject any call made without the & sigil.
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}

# end_function($name)
#
# Appends the closing counterpart of declare_function to $code:
# SYM_FUNC_END($name) in kernel mode, a .size directive otherwise.
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size	$name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

# Constant tables, emitted only when some vector code path is enabled.
if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Register map shared by the rest of the file.  Note that $d3 is the same
# register as $ctx (%rdi), so poly1305_iteration overwrites the context
# pointer; code using it has to preserve $ctx itself.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# Appends one multiply-and-reduce step to $code.  Takes no arguments and
# works entirely on the registers named in the map above.
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	 mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___

# poly1305_init_x86_64
#
# Zeroes the three 64-bit hash words at 0/8/16($ctx).  If the key pointer
# ($inp) is zero it returns straight away with %eax still 0.  Otherwise the
# two key words are masked and stored at 24/32($ctx) and 1 is returned.
# Outside the kernel build it additionally selects blocks/emit entry points
# (upgrading them according to OPENSSL_ia32cap_P when $avx allows) and writes
# the two pointers through %rdx, as 32-bit values for elf32 flavours.
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_init_x86_64");

# poly1305_blocks_x86_64
#
# Processes $len/16 blocks (a length below 16 returns immediately).  For
# every 16-byte block the input is added to the hash, $padbit is added into
# the top word, and the sum is multiplied by the key via poly1305_iteration.
# $ctx is pushed last and reloaded from the stack before the hash is stored
# back, because the iteration overwrites that register.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

# poly1305_emit_x86_64
#
# Loads the hash, adds 5 and keeps the incremented low 128 bits only if the
# addition carried beyond bit 130, then adds the 16-byte nonce and writes the
# 16-byte result to $mac.
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");

# Everything from here on is generated only when a vector code path is
# enabled; the closing brace of this conditional lies beyond this section.
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of powers of the multiplier key. There are
# 5 digits, but the last four are interleaved with their multiples of 5,
# for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

# Vector register names used by the AVX code: hash limbs, temporaries,
# accumulators and the mask register map onto %xmm0..%xmm15 in this order.
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

# __poly1305_block
#
# Local helper wrapping one poly1305_iteration step as a callable routine.
# $ctx is saved and restored around it because the iteration uses %rdi, the
# register holding $ctx, as scratch ($d3 in the register map earlier in the
# file).
$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push $ctx
___
	&poly1305_iteration();

# __poly1305_init_avx
#
# Local helper that fills the key-power table.  It starts from h = r and
# calls __poly1305_block three times to obtain r^2, r^3 and r^4.  After each
# call the value is split into five base 2^26 digits and stored, together
# with 5x multiples of the upper four digits, into one 32-bit lane of each of
# the nine 16-byte table rows.  Lane offsets within a row are 0 for r^2,
# 4 for r, 12 for r^3 and 8 for r^4.  $ctx is biased by 48+64 on entry (so
# the rows are addressed with a -64 displacement) and restored on exit.
$code.=<<___;
	pop $ctx
	RET
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
	push %rbp
	mov %rsp,%rbp
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+12-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+12-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^4

	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+8-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+8-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)

	lea	-48-64($ctx),$ctx	# size [de-]optimization
	pop %rbp
	RET
.size	__poly1305_init_avx,.-__poly1305_init_avx
___

# poly1305_blocks_avx: only the opening of this function lies in this
# section; the heredoc below is intentionally left open because its text
# continues on the lines that follow.
&declare_function("poly1305_blocks_avx", 32, 4);
$code.=<<___;
.cfi_startproc
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	and	\$-16,$len
	jz	.Lno_data_avx

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx

	test	\$31,$len
	jz	.Leven_avx

	push	%rbp
.cfi_push	%rbp
	mov 	%rsp,%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_avx_body:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
6448c2ecf20Sopenharmony_ci mov 32($ctx),$s1 6458c2ecf20Sopenharmony_ci 6468c2ecf20Sopenharmony_ci ################################# base 2^26 -> base 2^64 6478c2ecf20Sopenharmony_ci mov $d1#d,$h0#d 6488c2ecf20Sopenharmony_ci and \$`-1*(1<<31)`,$d1 6498c2ecf20Sopenharmony_ci mov $d2,$r1 # borrow $r1 6508c2ecf20Sopenharmony_ci mov $d2#d,$h1#d 6518c2ecf20Sopenharmony_ci and \$`-1*(1<<31)`,$d2 6528c2ecf20Sopenharmony_ci 6538c2ecf20Sopenharmony_ci shr \$6,$d1 6548c2ecf20Sopenharmony_ci shl \$52,$r1 6558c2ecf20Sopenharmony_ci add $d1,$h0 6568c2ecf20Sopenharmony_ci shr \$12,$h1 6578c2ecf20Sopenharmony_ci shr \$18,$d2 6588c2ecf20Sopenharmony_ci add $r1,$h0 6598c2ecf20Sopenharmony_ci adc $d2,$h1 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci mov $h2,$d1 6628c2ecf20Sopenharmony_ci shl \$40,$d1 6638c2ecf20Sopenharmony_ci shr \$24,$h2 6648c2ecf20Sopenharmony_ci add $d1,$h1 6658c2ecf20Sopenharmony_ci adc \$0,$h2 # can be partially reduced... 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_ci mov \$-4,$d2 # ... 
so reduce 6688c2ecf20Sopenharmony_ci mov $h2,$d1 6698c2ecf20Sopenharmony_ci and $h2,$d2 6708c2ecf20Sopenharmony_ci shr \$2,$d1 6718c2ecf20Sopenharmony_ci and \$3,$h2 6728c2ecf20Sopenharmony_ci add $d2,$d1 # =*5 6738c2ecf20Sopenharmony_ci add $d1,$h0 6748c2ecf20Sopenharmony_ci adc \$0,$h1 6758c2ecf20Sopenharmony_ci adc \$0,$h2 6768c2ecf20Sopenharmony_ci 6778c2ecf20Sopenharmony_ci mov $s1,$r1 6788c2ecf20Sopenharmony_ci mov $s1,%rax 6798c2ecf20Sopenharmony_ci shr \$2,$s1 6808c2ecf20Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 6818c2ecf20Sopenharmony_ci 6828c2ecf20Sopenharmony_ci add 0($inp),$h0 # accumulate input 6838c2ecf20Sopenharmony_ci adc 8($inp),$h1 6848c2ecf20Sopenharmony_ci lea 16($inp),$inp 6858c2ecf20Sopenharmony_ci adc $padbit,$h2 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci call __poly1305_block 6888c2ecf20Sopenharmony_ci 6898c2ecf20Sopenharmony_ci test $padbit,$padbit # if $padbit is zero, 6908c2ecf20Sopenharmony_ci jz .Lstore_base2_64_avx # store hash in base 2^64 format 6918c2ecf20Sopenharmony_ci 6928c2ecf20Sopenharmony_ci ################################# base 2^64 -> base 2^26 6938c2ecf20Sopenharmony_ci mov $h0,%rax 6948c2ecf20Sopenharmony_ci mov $h0,%rdx 6958c2ecf20Sopenharmony_ci shr \$52,$h0 6968c2ecf20Sopenharmony_ci mov $h1,$r0 6978c2ecf20Sopenharmony_ci mov $h1,$r1 6988c2ecf20Sopenharmony_ci shr \$26,%rdx 6998c2ecf20Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 7008c2ecf20Sopenharmony_ci shl \$12,$r0 7018c2ecf20Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 7028c2ecf20Sopenharmony_ci shr \$14,$h1 7038c2ecf20Sopenharmony_ci or $r0,$h0 7048c2ecf20Sopenharmony_ci shl \$24,$h2 7058c2ecf20Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 7068c2ecf20Sopenharmony_ci shr \$40,$r1 7078c2ecf20Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 7088c2ecf20Sopenharmony_ci or $r1,$h2 # h[4] 7098c2ecf20Sopenharmony_ci 7108c2ecf20Sopenharmony_ci sub \$16,%r15 7118c2ecf20Sopenharmony_ci jz .Lstore_base2_26_avx 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci 
vmovd %rax#d,$H0 7148c2ecf20Sopenharmony_ci vmovd %rdx#d,$H1 7158c2ecf20Sopenharmony_ci vmovd $h0#d,$H2 7168c2ecf20Sopenharmony_ci vmovd $h1#d,$H3 7178c2ecf20Sopenharmony_ci vmovd $h2#d,$H4 7188c2ecf20Sopenharmony_ci jmp .Lproceed_avx 7198c2ecf20Sopenharmony_ci 7208c2ecf20Sopenharmony_ci.align 32 7218c2ecf20Sopenharmony_ci.Lstore_base2_64_avx: 7228c2ecf20Sopenharmony_ci mov $h0,0($ctx) 7238c2ecf20Sopenharmony_ci mov $h1,8($ctx) 7248c2ecf20Sopenharmony_ci mov $h2,16($ctx) # note that is_base2_26 is zeroed 7258c2ecf20Sopenharmony_ci jmp .Ldone_avx 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_ci.align 16 7288c2ecf20Sopenharmony_ci.Lstore_base2_26_avx: 7298c2ecf20Sopenharmony_ci mov %rax#d,0($ctx) # store hash value base 2^26 7308c2ecf20Sopenharmony_ci mov %rdx#d,4($ctx) 7318c2ecf20Sopenharmony_ci mov $h0#d,8($ctx) 7328c2ecf20Sopenharmony_ci mov $h1#d,12($ctx) 7338c2ecf20Sopenharmony_ci mov $h2#d,16($ctx) 7348c2ecf20Sopenharmony_ci.align 16 7358c2ecf20Sopenharmony_ci.Ldone_avx: 7368c2ecf20Sopenharmony_ci pop %r15 7378c2ecf20Sopenharmony_ci.cfi_restore %r15 7388c2ecf20Sopenharmony_ci pop %r14 7398c2ecf20Sopenharmony_ci.cfi_restore %r14 7408c2ecf20Sopenharmony_ci pop %r13 7418c2ecf20Sopenharmony_ci.cfi_restore %r13 7428c2ecf20Sopenharmony_ci pop %r12 7438c2ecf20Sopenharmony_ci.cfi_restore %r12 7448c2ecf20Sopenharmony_ci pop %rbx 7458c2ecf20Sopenharmony_ci.cfi_restore %rbx 7468c2ecf20Sopenharmony_ci pop %rbp 7478c2ecf20Sopenharmony_ci.cfi_restore %rbp 7488c2ecf20Sopenharmony_ci.Lno_data_avx: 7498c2ecf20Sopenharmony_ci.Lblocks_avx_epilogue: 7508c2ecf20Sopenharmony_ci RET 7518c2ecf20Sopenharmony_ci.cfi_endproc 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_ci.align 32 7548c2ecf20Sopenharmony_ci.Lbase2_64_avx: 7558c2ecf20Sopenharmony_ci.cfi_startproc 7568c2ecf20Sopenharmony_ci push %rbp 7578c2ecf20Sopenharmony_ci.cfi_push %rbp 7588c2ecf20Sopenharmony_ci mov %rsp,%rbp 7598c2ecf20Sopenharmony_ci push %rbx 7608c2ecf20Sopenharmony_ci.cfi_push %rbx 
7618c2ecf20Sopenharmony_ci push %r12 7628c2ecf20Sopenharmony_ci.cfi_push %r12 7638c2ecf20Sopenharmony_ci push %r13 7648c2ecf20Sopenharmony_ci.cfi_push %r13 7658c2ecf20Sopenharmony_ci push %r14 7668c2ecf20Sopenharmony_ci.cfi_push %r14 7678c2ecf20Sopenharmony_ci push %r15 7688c2ecf20Sopenharmony_ci.cfi_push %r15 7698c2ecf20Sopenharmony_ci.Lbase2_64_avx_body: 7708c2ecf20Sopenharmony_ci 7718c2ecf20Sopenharmony_ci mov $len,%r15 # reassign $len 7728c2ecf20Sopenharmony_ci 7738c2ecf20Sopenharmony_ci mov 24($ctx),$r0 # load r 7748c2ecf20Sopenharmony_ci mov 32($ctx),$s1 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ci mov 0($ctx),$h0 # load hash value 7778c2ecf20Sopenharmony_ci mov 8($ctx),$h1 7788c2ecf20Sopenharmony_ci mov 16($ctx),$h2#d 7798c2ecf20Sopenharmony_ci 7808c2ecf20Sopenharmony_ci mov $s1,$r1 7818c2ecf20Sopenharmony_ci mov $s1,%rax 7828c2ecf20Sopenharmony_ci shr \$2,$s1 7838c2ecf20Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 7848c2ecf20Sopenharmony_ci 7858c2ecf20Sopenharmony_ci test \$31,$len 7868c2ecf20Sopenharmony_ci jz .Linit_avx 7878c2ecf20Sopenharmony_ci 7888c2ecf20Sopenharmony_ci add 0($inp),$h0 # accumulate input 7898c2ecf20Sopenharmony_ci adc 8($inp),$h1 7908c2ecf20Sopenharmony_ci lea 16($inp),$inp 7918c2ecf20Sopenharmony_ci adc $padbit,$h2 7928c2ecf20Sopenharmony_ci sub \$16,%r15 7938c2ecf20Sopenharmony_ci 7948c2ecf20Sopenharmony_ci call __poly1305_block 7958c2ecf20Sopenharmony_ci 7968c2ecf20Sopenharmony_ci.Linit_avx: 7978c2ecf20Sopenharmony_ci ################################# base 2^64 -> base 2^26 7988c2ecf20Sopenharmony_ci mov $h0,%rax 7998c2ecf20Sopenharmony_ci mov $h0,%rdx 8008c2ecf20Sopenharmony_ci shr \$52,$h0 8018c2ecf20Sopenharmony_ci mov $h1,$d1 8028c2ecf20Sopenharmony_ci mov $h1,$d2 8038c2ecf20Sopenharmony_ci shr \$26,%rdx 8048c2ecf20Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 8058c2ecf20Sopenharmony_ci shl \$12,$d1 8068c2ecf20Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 8078c2ecf20Sopenharmony_ci shr \$14,$h1 
8088c2ecf20Sopenharmony_ci or $d1,$h0 8098c2ecf20Sopenharmony_ci shl \$24,$h2 8108c2ecf20Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 8118c2ecf20Sopenharmony_ci shr \$40,$d2 8128c2ecf20Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 8138c2ecf20Sopenharmony_ci or $d2,$h2 # h[4] 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci vmovd %rax#d,$H0 8168c2ecf20Sopenharmony_ci vmovd %rdx#d,$H1 8178c2ecf20Sopenharmony_ci vmovd $h0#d,$H2 8188c2ecf20Sopenharmony_ci vmovd $h1#d,$H3 8198c2ecf20Sopenharmony_ci vmovd $h2#d,$H4 8208c2ecf20Sopenharmony_ci movl \$1,20($ctx) # set is_base2_26 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci call __poly1305_init_avx 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci.Lproceed_avx: 8258c2ecf20Sopenharmony_ci mov %r15,$len 8268c2ecf20Sopenharmony_ci pop %r15 8278c2ecf20Sopenharmony_ci.cfi_restore %r15 8288c2ecf20Sopenharmony_ci pop %r14 8298c2ecf20Sopenharmony_ci.cfi_restore %r14 8308c2ecf20Sopenharmony_ci pop %r13 8318c2ecf20Sopenharmony_ci.cfi_restore %r13 8328c2ecf20Sopenharmony_ci pop %r12 8338c2ecf20Sopenharmony_ci.cfi_restore %r12 8348c2ecf20Sopenharmony_ci pop %rbx 8358c2ecf20Sopenharmony_ci.cfi_restore %rbx 8368c2ecf20Sopenharmony_ci pop %rbp 8378c2ecf20Sopenharmony_ci.cfi_restore %rbp 8388c2ecf20Sopenharmony_ci.Lbase2_64_avx_epilogue: 8398c2ecf20Sopenharmony_ci jmp .Ldo_avx 8408c2ecf20Sopenharmony_ci.cfi_endproc 8418c2ecf20Sopenharmony_ci 8428c2ecf20Sopenharmony_ci.align 32 8438c2ecf20Sopenharmony_ci.Leven_avx: 8448c2ecf20Sopenharmony_ci.cfi_startproc 8458c2ecf20Sopenharmony_ci vmovd 4*0($ctx),$H0 # load hash value 8468c2ecf20Sopenharmony_ci vmovd 4*1($ctx),$H1 8478c2ecf20Sopenharmony_ci vmovd 4*2($ctx),$H2 8488c2ecf20Sopenharmony_ci vmovd 4*3($ctx),$H3 8498c2ecf20Sopenharmony_ci vmovd 4*4($ctx),$H4 8508c2ecf20Sopenharmony_ci 8518c2ecf20Sopenharmony_ci.Ldo_avx: 8528c2ecf20Sopenharmony_ci___ 8538c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 8548c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 
8558c2ecf20Sopenharmony_ci.cfi_def_cfa_register %r10 8568c2ecf20Sopenharmony_ci and \$-32,%rsp 8578c2ecf20Sopenharmony_ci sub \$-8,%rsp 8588c2ecf20Sopenharmony_ci lea -0x58(%rsp),%r11 8598c2ecf20Sopenharmony_ci sub \$0x178,%rsp 8608c2ecf20Sopenharmony_ci___ 8618c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 8628c2ecf20Sopenharmony_ci lea -0xf8(%rsp),%r11 8638c2ecf20Sopenharmony_ci sub \$0x218,%rsp 8648c2ecf20Sopenharmony_ci vmovdqa %xmm6,0x50(%r11) 8658c2ecf20Sopenharmony_ci vmovdqa %xmm7,0x60(%r11) 8668c2ecf20Sopenharmony_ci vmovdqa %xmm8,0x70(%r11) 8678c2ecf20Sopenharmony_ci vmovdqa %xmm9,0x80(%r11) 8688c2ecf20Sopenharmony_ci vmovdqa %xmm10,0x90(%r11) 8698c2ecf20Sopenharmony_ci vmovdqa %xmm11,0xa0(%r11) 8708c2ecf20Sopenharmony_ci vmovdqa %xmm12,0xb0(%r11) 8718c2ecf20Sopenharmony_ci vmovdqa %xmm13,0xc0(%r11) 8728c2ecf20Sopenharmony_ci vmovdqa %xmm14,0xd0(%r11) 8738c2ecf20Sopenharmony_ci vmovdqa %xmm15,0xe0(%r11) 8748c2ecf20Sopenharmony_ci.Ldo_avx_body: 8758c2ecf20Sopenharmony_ci___ 8768c2ecf20Sopenharmony_ci$code.=<<___; 8778c2ecf20Sopenharmony_ci sub \$64,$len 8788c2ecf20Sopenharmony_ci lea -32($inp),%rax 8798c2ecf20Sopenharmony_ci cmovc %rax,$inp 8808c2ecf20Sopenharmony_ci 8818c2ecf20Sopenharmony_ci vmovdqu `16*3`($ctx),$D4 # preload r0^2 8828c2ecf20Sopenharmony_ci lea `16*3+64`($ctx),$ctx # size optimization 8838c2ecf20Sopenharmony_ci lea .Lconst(%rip),%rcx 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_ci ################################################################ 8868c2ecf20Sopenharmony_ci # load input 8878c2ecf20Sopenharmony_ci vmovdqu 16*2($inp),$T0 8888c2ecf20Sopenharmony_ci vmovdqu 16*3($inp),$T1 8898c2ecf20Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 8908c2ecf20Sopenharmony_ci 8918c2ecf20Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 8928c2ecf20Sopenharmony_ci vpsrldq \$6,$T1,$T3 8938c2ecf20Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 8948c2ecf20Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 8958c2ecf20Sopenharmony_ci vpunpcklqdq 
$T3,$T2,$T3 # 2:3 8968c2ecf20Sopenharmony_ci 8978c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 8988c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 8998c2ecf20Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 9008c2ecf20Sopenharmony_ci vpsrlq \$4,$T3,$T2 9018c2ecf20Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 9028c2ecf20Sopenharmony_ci vpsrlq \$30,$T3,$T3 9038c2ecf20Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 9048c2ecf20Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 9058c2ecf20Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 9068c2ecf20Sopenharmony_ci 9078c2ecf20Sopenharmony_ci jbe .Lskip_loop_avx # flags still from the sub of 64 above (lea/cmovc/vector ops leave them alone): at most 64 bytes, so skip the loop 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci # expand and copy pre-calculated table to stack: each row is split in two, the 0x44 shuffle (xx12 -> 1212) is stored from 0x00(%rsp) upward, the 0xEE shuffle (34xx -> 3434) from -0x90(%r11) upward 9108c2ecf20Sopenharmony_ci vmovdqu `16*1-64`($ctx),$D1 9118c2ecf20Sopenharmony_ci vmovdqu `16*2-64`($ctx),$D2 9128c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 9138c2ecf20Sopenharmony_ci vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 9148c2ecf20Sopenharmony_ci vmovdqa $D3,-0x90(%r11) 9158c2ecf20Sopenharmony_ci vmovdqa $D0,0x00(%rsp) 9168c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D1,$D4 9178c2ecf20Sopenharmony_ci vmovdqu `16*3-64`($ctx),$D0 9188c2ecf20Sopenharmony_ci vpshufd \$0x44,$D1,$D1 9198c2ecf20Sopenharmony_ci vmovdqa $D4,-0x80(%r11) 9208c2ecf20Sopenharmony_ci vmovdqa $D1,0x10(%rsp) 9218c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D2,$D3 9228c2ecf20Sopenharmony_ci vmovdqu `16*4-64`($ctx),$D1 9238c2ecf20Sopenharmony_ci vpshufd \$0x44,$D2,$D2 9248c2ecf20Sopenharmony_ci vmovdqa $D3,-0x70(%r11) 9258c2ecf20Sopenharmony_ci vmovdqa $D2,0x20(%rsp) 9268c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D0,$D4 9278c2ecf20Sopenharmony_ci vmovdqu `16*5-64`($ctx),$D2 9288c2ecf20Sopenharmony_ci vpshufd \$0x44,$D0,$D0 9298c2ecf20Sopenharmony_ci vmovdqa $D4,-0x60(%r11) 9308c2ecf20Sopenharmony_ci vmovdqa $D0,0x30(%rsp) 9318c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D1,$D3 9328c2ecf20Sopenharmony_ci vmovdqu `16*6-64`($ctx),$D0 9338c2ecf20Sopenharmony_ci vpshufd \$0x44,$D1,$D1 9348c2ecf20Sopenharmony_ci vmovdqa
$D3,-0x50(%r11) 9358c2ecf20Sopenharmony_ci vmovdqa $D1,0x40(%rsp) 9368c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D2,$D4 9378c2ecf20Sopenharmony_ci vmovdqu `16*7-64`($ctx),$D1 9388c2ecf20Sopenharmony_ci vpshufd \$0x44,$D2,$D2 9398c2ecf20Sopenharmony_ci vmovdqa $D4,-0x40(%r11) 9408c2ecf20Sopenharmony_ci vmovdqa $D2,0x50(%rsp) 9418c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D0,$D3 9428c2ecf20Sopenharmony_ci vmovdqu `16*8-64`($ctx),$D2 9438c2ecf20Sopenharmony_ci vpshufd \$0x44,$D0,$D0 9448c2ecf20Sopenharmony_ci vmovdqa $D3,-0x30(%r11) 9458c2ecf20Sopenharmony_ci vmovdqa $D0,0x60(%rsp) 9468c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D1,$D4 9478c2ecf20Sopenharmony_ci vpshufd \$0x44,$D1,$D1 9488c2ecf20Sopenharmony_ci vmovdqa $D4,-0x20(%r11) 9498c2ecf20Sopenharmony_ci vmovdqa $D1,0x70(%rsp) 9508c2ecf20Sopenharmony_ci vpshufd \$0xEE,$D2,$D3 9518c2ecf20Sopenharmony_ci vmovdqa 0x00(%rsp),$D4 # preload r0^2 9528c2ecf20Sopenharmony_ci vpshufd \$0x44,$D2,$D2 9538c2ecf20Sopenharmony_ci vmovdqa $D3,-0x10(%r11) 9548c2ecf20Sopenharmony_ci vmovdqa $D2,0x80(%rsp) 9558c2ecf20Sopenharmony_ci 9568c2ecf20Sopenharmony_ci jmp .Loop_avx 9578c2ecf20Sopenharmony_ci 9588c2ecf20Sopenharmony_ci.align 32 9598c2ecf20Sopenharmony_ci.Loop_avx: 9608c2ecf20Sopenharmony_ci ################################################################ 9618c2ecf20Sopenharmony_ci # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 9628c2ecf20Sopenharmony_ci # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 9638c2ecf20Sopenharmony_ci # \___________________/ 9648c2ecf20Sopenharmony_ci # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 9658c2ecf20Sopenharmony_ci # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 9668c2ecf20Sopenharmony_ci # \___________________/ \____________________/ 9678c2ecf20Sopenharmony_ci # 9688c2ecf20Sopenharmony_ci # Note that we start with inp[2:3]*r^2. This is because it 9698c2ecf20Sopenharmony_ci # doesn't depend on reduction in previous iteration. 
9708c2ecf20Sopenharmony_ci ################################################################ 9718c2ecf20Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 9728c2ecf20Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 9738c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 9748c2ecf20Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 9758c2ecf20Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 9768c2ecf20Sopenharmony_ci # 9778c2ecf20Sopenharmony_ci # though note that $Tx and $Hx are "reversed" in this section, 9788c2ecf20Sopenharmony_ci # and $D4 is preloaded with r0^2... 9798c2ecf20Sopenharmony_ci 9808c2ecf20Sopenharmony_ci vpmuludq $T0,$D4,$D0 # d0 = h0*r0 9818c2ecf20Sopenharmony_ci vpmuludq $T1,$D4,$D1 # d1 = h1*r0 9828c2ecf20Sopenharmony_ci vmovdqa $H2,0x20(%r11) # offload hash 9838c2ecf20Sopenharmony_ci vpmuludq $T2,$D4,$D2 # d2 = h2*r0 9848c2ecf20Sopenharmony_ci vmovdqa 0x10(%rsp),$H2 # r1^2 9858c2ecf20Sopenharmony_ci vpmuludq $T3,$D4,$D3 # d3 = h3*r0 9868c2ecf20Sopenharmony_ci vpmuludq $T4,$D4,$D4 # d4 = h4*r0 9878c2ecf20Sopenharmony_ci 9888c2ecf20Sopenharmony_ci vmovdqa $H0,0x00(%r11) # 9898c2ecf20Sopenharmony_ci vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 9908c2ecf20Sopenharmony_ci vmovdqa $H1,0x10(%r11) # 9918c2ecf20Sopenharmony_ci vpmuludq $T3,$H2,$H1 # h3*r1 9928c2ecf20Sopenharmony_ci vpaddq $H0,$D0,$D0 # d0 += h4*s1 9938c2ecf20Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h3*r1 9948c2ecf20Sopenharmony_ci vmovdqa $H3,0x30(%r11) # 9958c2ecf20Sopenharmony_ci vpmuludq $T2,$H2,$H0 # h2*r1 9968c2ecf20Sopenharmony_ci vpmuludq $T1,$H2,$H1 # h1*r1 9978c2ecf20Sopenharmony_ci vpaddq $H0,$D3,$D3 # d3 += h2*r1 9988c2ecf20Sopenharmony_ci vmovdqa 0x30(%rsp),$H3 # r2^2 9998c2ecf20Sopenharmony_ci vpaddq $H1,$D2,$D2 # d2 += h1*r1 10008c2ecf20Sopenharmony_ci vmovdqa $H4,0x40(%r11) # 10018c2ecf20Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r1 10028c2ecf20Sopenharmony_ci vpmuludq $T2,$H3,$H0 # h2*r2
10038c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h0*r1 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci vmovdqa 0x40(%rsp),$H4 # s2^2 10068c2ecf20Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h2*r2 10078c2ecf20Sopenharmony_ci vpmuludq $T1,$H3,$H1 # h1*r2 10088c2ecf20Sopenharmony_ci vpmuludq $T0,$H3,$H3 # h0*r2 10098c2ecf20Sopenharmony_ci vpaddq $H1,$D3,$D3 # d3 += h1*r2 10108c2ecf20Sopenharmony_ci vmovdqa 0x50(%rsp),$H2 # r3^2 10118c2ecf20Sopenharmony_ci vpaddq $H3,$D2,$D2 # d2 += h0*r2 10128c2ecf20Sopenharmony_ci vpmuludq $T4,$H4,$H0 # h4*s2 10138c2ecf20Sopenharmony_ci vpmuludq $T3,$H4,$H4 # h3*s2 10148c2ecf20Sopenharmony_ci vpaddq $H0,$D1,$D1 # d1 += h4*s2 10158c2ecf20Sopenharmony_ci vmovdqa 0x60(%rsp),$H3 # s3^2 10168c2ecf20Sopenharmony_ci vpaddq $H4,$D0,$D0 # d0 += h3*s2 10178c2ecf20Sopenharmony_ci 10188c2ecf20Sopenharmony_ci vmovdqa 0x80(%rsp),$H4 # s4^2 10198c2ecf20Sopenharmony_ci vpmuludq $T1,$H2,$H1 # h1*r3 10208c2ecf20Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r3 10218c2ecf20Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h1*r3 10228c2ecf20Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 10238c2ecf20Sopenharmony_ci vpmuludq $T4,$H3,$H0 # h4*s3 10248c2ecf20Sopenharmony_ci vpmuludq $T3,$H3,$H1 # h3*s3 10258c2ecf20Sopenharmony_ci vpaddq $H0,$D2,$D2 # d2 += h4*s3 10268c2ecf20Sopenharmony_ci vmovdqu 16*0($inp),$H0 # load input 10278c2ecf20Sopenharmony_ci vpaddq $H1,$D1,$D1 # d1 += h3*s3 10288c2ecf20Sopenharmony_ci vpmuludq $T2,$H3,$H3 # h2*s3 10298c2ecf20Sopenharmony_ci vpmuludq $T2,$H4,$T2 # h2*s4 10308c2ecf20Sopenharmony_ci vpaddq $H3,$D0,$D0 # d0 += h2*s3 10318c2ecf20Sopenharmony_ci 10328c2ecf20Sopenharmony_ci vmovdqu 16*1($inp),$H1 # 10338c2ecf20Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h2*s4 10348c2ecf20Sopenharmony_ci vpmuludq $T3,$H4,$T3 # h3*s4 10358c2ecf20Sopenharmony_ci vpmuludq $T4,$H4,$T4 # h4*s4 10368c2ecf20Sopenharmony_ci vpsrldq \$6,$H0,$H2 # splat input 10378c2ecf20Sopenharmony_ci vpaddq $T3,$D2,$D2 # d2 += h3*s4 10388c2ecf20Sopenharmony_ci 
vpaddq $T4,$D3,$D3 # d3 += h4*s4 10398c2ecf20Sopenharmony_ci vpsrldq \$6,$H1,$H3 # 10408c2ecf20Sopenharmony_ci vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 10418c2ecf20Sopenharmony_ci vpmuludq $T1,$H4,$T0 # h1*s4 10428c2ecf20Sopenharmony_ci vpunpckhqdq $H1,$H0,$H4 # 4 10438c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h0*r4 10448c2ecf20Sopenharmony_ci vmovdqa -0x90(%r11),$T4 # r0^4 10458c2ecf20Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h1*s4 10468c2ecf20Sopenharmony_ci 10478c2ecf20Sopenharmony_ci vpunpcklqdq $H1,$H0,$H0 # 0:1 10488c2ecf20Sopenharmony_ci vpunpcklqdq $H3,$H2,$H3 # 2:3 10498c2ecf20Sopenharmony_ci 10508c2ecf20Sopenharmony_ci #vpsrlq \$40,$H4,$H4 # 4 10518c2ecf20Sopenharmony_ci vpsrldq \$`40/8`,$H4,$H4 # 4 10528c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$H1 10538c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 # 0 10548c2ecf20Sopenharmony_ci vpsrlq \$4,$H3,$H2 10558c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 # 1 10568c2ecf20Sopenharmony_ci vpand 0(%rcx),$H4,$H4 # .Lmask24 10578c2ecf20Sopenharmony_ci vpsrlq \$30,$H3,$H3 10588c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 # 2 10598c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 # 3 10608c2ecf20Sopenharmony_ci vpor 32(%rcx),$H4,$H4 # padbit, yes, always 10618c2ecf20Sopenharmony_ci 10628c2ecf20Sopenharmony_ci vpaddq 0x00(%r11),$H0,$H0 # add hash value 10638c2ecf20Sopenharmony_ci vpaddq 0x10(%r11),$H1,$H1 10648c2ecf20Sopenharmony_ci vpaddq 0x20(%r11),$H2,$H2 10658c2ecf20Sopenharmony_ci vpaddq 0x30(%r11),$H3,$H3 10668c2ecf20Sopenharmony_ci vpaddq 0x40(%r11),$H4,$H4 10678c2ecf20Sopenharmony_ci 10688c2ecf20Sopenharmony_ci lea 16*2($inp),%rax 10698c2ecf20Sopenharmony_ci lea 16*4($inp),$inp 10708c2ecf20Sopenharmony_ci sub \$64,$len 10718c2ecf20Sopenharmony_ci cmovc %rax,$inp 10728c2ecf20Sopenharmony_ci 10738c2ecf20Sopenharmony_ci ################################################################ 10748c2ecf20Sopenharmony_ci # Now we accumulate (inp[0:1]+hash)*r^4 10758c2ecf20Sopenharmony_ci 
################################################################ 10768c2ecf20Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 10778c2ecf20Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 10788c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 10798c2ecf20Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 10808c2ecf20Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 10818c2ecf20Sopenharmony_ci 10828c2ecf20Sopenharmony_ci vpmuludq $H0,$T4,$T0 # h0*r0 10838c2ecf20Sopenharmony_ci vpmuludq $H1,$T4,$T1 # h1*r0 10848c2ecf20Sopenharmony_ci vpaddq $T0,$D0,$D0 10858c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 10868c2ecf20Sopenharmony_ci vmovdqa -0x80(%r11),$T2 # r1^4 10878c2ecf20Sopenharmony_ci vpmuludq $H2,$T4,$T0 # h2*r0 10888c2ecf20Sopenharmony_ci vpmuludq $H3,$T4,$T1 # h3*r0 10898c2ecf20Sopenharmony_ci vpaddq $T0,$D2,$D2 10908c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 10918c2ecf20Sopenharmony_ci vpmuludq $H4,$T4,$T4 # h4*r0 10928c2ecf20Sopenharmony_ci vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 10938c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 10948c2ecf20Sopenharmony_ci 10958c2ecf20Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h4*s1 10968c2ecf20Sopenharmony_ci vpmuludq $H2,$T2,$T1 # h2*r1 10978c2ecf20Sopenharmony_ci vpmuludq $H3,$T2,$T0 # h3*r1 10988c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h2*r1 10998c2ecf20Sopenharmony_ci vmovdqa -0x60(%r11),$T3 # r2^4 11008c2ecf20Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h3*r1 11018c2ecf20Sopenharmony_ci vpmuludq $H1,$T2,$T1 # h1*r1 11028c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r1 11038c2ecf20Sopenharmony_ci vpaddq $T1,$D2,$D2 # d2 += h1*r1 11048c2ecf20Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h0*r1 11058c2ecf20Sopenharmony_ci 11068c2ecf20Sopenharmony_ci vmovdqa -0x50(%r11),$T4 # s2^4 11078c2ecf20Sopenharmony_ci vpmuludq $H2,$T3,$T0 # h2*r2 11088c2ecf20Sopenharmony_ci vpmuludq $H1,$T3,$T1 # h1*r2 11098c2ecf20Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += 
h2*r2 11108c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h1*r2 11118c2ecf20Sopenharmony_ci vmovdqa -0x40(%r11),$T2 # r3^4 11128c2ecf20Sopenharmony_ci vpmuludq $H0,$T3,$T3 # h0*r2 11138c2ecf20Sopenharmony_ci vpmuludq $H4,$T4,$T0 # h4*s2 11148c2ecf20Sopenharmony_ci vpaddq $T3,$D2,$D2 # d2 += h0*r2 11158c2ecf20Sopenharmony_ci vpaddq $T0,$D1,$D1 # d1 += h4*s2 11168c2ecf20Sopenharmony_ci vmovdqa -0x30(%r11),$T3 # s3^4 11178c2ecf20Sopenharmony_ci vpmuludq $H3,$T4,$T4 # h3*s2 11188c2ecf20Sopenharmony_ci vpmuludq $H1,$T2,$T1 # h1*r3 11198c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 11208c2ecf20Sopenharmony_ci 11218c2ecf20Sopenharmony_ci vmovdqa -0x10(%r11),$T4 # s4^4 11228c2ecf20Sopenharmony_ci vpaddq $T1,$D4,$D4 # d4 += h1*r3 11238c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r3 11248c2ecf20Sopenharmony_ci vpmuludq $H4,$T3,$T0 # h4*s3 11258c2ecf20Sopenharmony_ci vpaddq $T2,$D3,$D3 # d3 += h0*r3 11268c2ecf20Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h4*s3 11278c2ecf20Sopenharmony_ci vmovdqu 16*2($inp),$T0 # load input 11288c2ecf20Sopenharmony_ci vpmuludq $H3,$T3,$T2 # h3*s3 11298c2ecf20Sopenharmony_ci vpmuludq $H2,$T3,$T3 # h2*s3 11308c2ecf20Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h3*s3 11318c2ecf20Sopenharmony_ci vmovdqu 16*3($inp),$T1 # 11328c2ecf20Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h2*s3 11338c2ecf20Sopenharmony_ci 11348c2ecf20Sopenharmony_ci vpmuludq $H2,$T4,$H2 # h2*s4 11358c2ecf20Sopenharmony_ci vpmuludq $H3,$T4,$H3 # h3*s4 11368c2ecf20Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 11378c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h2*s4 11388c2ecf20Sopenharmony_ci vpmuludq $H4,$T4,$H4 # h4*s4 11398c2ecf20Sopenharmony_ci vpsrldq \$6,$T1,$T3 # 11408c2ecf20Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 11418c2ecf20Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 11428c2ecf20Sopenharmony_ci vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 11438c2ecf20Sopenharmony_ci vpmuludq $H1,$T4,$H0 # h1*s4 11448c2ecf20Sopenharmony_ci vpunpckhqdq
$T1,$T0,$T4 # 4 11458c2ecf20Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 11468c2ecf20Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 11478c2ecf20Sopenharmony_ci 11488c2ecf20Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 11498c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T3 # 2:3 11508c2ecf20Sopenharmony_ci 11518c2ecf20Sopenharmony_ci #vpsrlq \$40,$T4,$T4 # 4 11528c2ecf20Sopenharmony_ci vpsrldq \$`40/8`,$T4,$T4 # 4 11538c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 11548c2ecf20Sopenharmony_ci vmovdqa 0x00(%rsp),$D4 # preload r0^2 11558c2ecf20Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 11568c2ecf20Sopenharmony_ci vpsrlq \$4,$T3,$T2 11578c2ecf20Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 11588c2ecf20Sopenharmony_ci vpand 0(%rcx),$T4,$T4 # .Lmask24 11598c2ecf20Sopenharmony_ci vpsrlq \$30,$T3,$T3 11608c2ecf20Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 11618c2ecf20Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 11628c2ecf20Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 11638c2ecf20Sopenharmony_ci 11648c2ecf20Sopenharmony_ci ################################################################ 11658c2ecf20Sopenharmony_ci # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 11668c2ecf20Sopenharmony_ci # and P. 
Schwabe 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 11698c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 11708c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 11718c2ecf20Sopenharmony_ci 11728c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 11738c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 11748c2ecf20Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci vpsrlq \$26,$H4,$D0 # carry out of h4, folded into h0 below 11778c2ecf20Sopenharmony_ci vpand $MASK,$H4,$H4 11788c2ecf20Sopenharmony_ci 11798c2ecf20Sopenharmony_ci vpsrlq \$26,$H1,$D1 11808c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 11818c2ecf20Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 11828c2ecf20Sopenharmony_ci 11838c2ecf20Sopenharmony_ci vpaddq $D0,$H0,$H0 # h0 += carry 11848c2ecf20Sopenharmony_ci vpsllq \$2,$D0,$D0 # carry *= 4 11858c2ecf20Sopenharmony_ci vpaddq $D0,$H0,$H0 # h4 -> h0, i.e. h0 += 5*carry in total 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_ci vpsrlq \$26,$H2,$D2 11888c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 11898c2ecf20Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 11908c2ecf20Sopenharmony_ci 11918c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 11928c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 11938c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 11968c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 11978c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 11988c2ecf20Sopenharmony_ci 11998c2ecf20Sopenharmony_ci ja .Loop_avx # flags still from the sub of 64 earlier in the loop body; the vector ops in between leave them alone 12008c2ecf20Sopenharmony_ci 12018c2ecf20Sopenharmony_ci.Lskip_loop_avx: 12028c2ecf20Sopenharmony_ci ################################################################ 12038c2ecf20Sopenharmony_ci # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 12048c2ecf20Sopenharmony_ci 12058c2ecf20Sopenharmony_ci vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 12068c2ecf20Sopenharmony_ci add \$32,$len 12078c2ecf20Sopenharmony_ci jnz .Long_tail_avx 12088c2ecf20Sopenharmony_ci 12098c2ecf20Sopenharmony_ci vpaddq $H2,$T2,$T2
12108c2ecf20Sopenharmony_ci vpaddq $H0,$T0,$T0 12118c2ecf20Sopenharmony_ci vpaddq $H1,$T1,$T1 12128c2ecf20Sopenharmony_ci vpaddq $H3,$T3,$T3 12138c2ecf20Sopenharmony_ci vpaddq $H4,$T4,$T4 12148c2ecf20Sopenharmony_ci 12158c2ecf20Sopenharmony_ci.Long_tail_avx: 12168c2ecf20Sopenharmony_ci vmovdqa $H2,0x20(%r11) 12178c2ecf20Sopenharmony_ci vmovdqa $H0,0x00(%r11) 12188c2ecf20Sopenharmony_ci vmovdqa $H1,0x10(%r11) 12198c2ecf20Sopenharmony_ci vmovdqa $H3,0x30(%r11) 12208c2ecf20Sopenharmony_ci vmovdqa $H4,0x40(%r11) 12218c2ecf20Sopenharmony_ci 12228c2ecf20Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 12238c2ecf20Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 12248c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 12258c2ecf20Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 12268c2ecf20Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 12278c2ecf20Sopenharmony_ci 12288c2ecf20Sopenharmony_ci vpmuludq $T2,$D4,$D2 # d2 = h2*r0 12298c2ecf20Sopenharmony_ci vpmuludq $T0,$D4,$D0 # d0 = h0*r0 12308c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 12318c2ecf20Sopenharmony_ci vpmuludq $T1,$D4,$D1 # d1 = h1*r0 12328c2ecf20Sopenharmony_ci vpmuludq $T3,$D4,$D3 # d3 = h3*r0 12338c2ecf20Sopenharmony_ci vpmuludq $T4,$D4,$D4 # d4 = h4*r0 12348c2ecf20Sopenharmony_ci 12358c2ecf20Sopenharmony_ci vpmuludq $T3,$H2,$H0 # h3*r1 12368c2ecf20Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h3*r1 12378c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 12388c2ecf20Sopenharmony_ci vpmuludq $T2,$H2,$H1 # h2*r1 12398c2ecf20Sopenharmony_ci vpaddq $H1,$D3,$D3 # d3 += h2*r1 12408c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 12418c2ecf20Sopenharmony_ci vpmuludq $T1,$H2,$H0 # h1*r1 12428c2ecf20Sopenharmony_ci vpaddq $H0,$D2,$D2 # d2 += h1*r1 12438c2ecf20Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r1 12448c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h0*r1 
12458c2ecf20Sopenharmony_ci vpmuludq $T4,$H3,$H3 # h4*s1 12468c2ecf20Sopenharmony_ci vpaddq $H3,$D0,$D0 # d0 += h4*s1 12478c2ecf20Sopenharmony_ci 12488c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 12498c2ecf20Sopenharmony_ci vpmuludq $T2,$H4,$H1 # h2*r2 12508c2ecf20Sopenharmony_ci vpaddq $H1,$D4,$D4 # d4 += h2*r2 12518c2ecf20Sopenharmony_ci vpmuludq $T1,$H4,$H0 # h1*r2 12528c2ecf20Sopenharmony_ci vpaddq $H0,$D3,$D3 # d3 += h1*r2 12538c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 12548c2ecf20Sopenharmony_ci vpmuludq $T0,$H4,$H4 # h0*r2 12558c2ecf20Sopenharmony_ci vpaddq $H4,$D2,$D2 # d2 += h0*r2 12568c2ecf20Sopenharmony_ci vpmuludq $T4,$H2,$H1 # h4*s2 12578c2ecf20Sopenharmony_ci vpaddq $H1,$D1,$D1 # d1 += h4*s2 12588c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 12598c2ecf20Sopenharmony_ci vpmuludq $T3,$H2,$H2 # h3*s2 12608c2ecf20Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h3*s2 12618c2ecf20Sopenharmony_ci 12628c2ecf20Sopenharmony_ci vpmuludq $T1,$H3,$H0 # h1*r3 12638c2ecf20Sopenharmony_ci vpaddq $H0,$D4,$D4 # d4 += h1*r3 12648c2ecf20Sopenharmony_ci vpmuludq $T0,$H3,$H3 # h0*r3 12658c2ecf20Sopenharmony_ci vpaddq $H3,$D3,$D3 # d3 += h0*r3 12668c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 12678c2ecf20Sopenharmony_ci vpmuludq $T4,$H4,$H1 # h4*s3 12688c2ecf20Sopenharmony_ci vpaddq $H1,$D2,$D2 # d2 += h4*s3 12698c2ecf20Sopenharmony_ci vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 12708c2ecf20Sopenharmony_ci vpmuludq $T3,$H4,$H0 # h3*s3 12718c2ecf20Sopenharmony_ci vpaddq $H0,$D1,$D1 # d1 += h3*s3 12728c2ecf20Sopenharmony_ci vpmuludq $T2,$H4,$H4 # h2*s3 12738c2ecf20Sopenharmony_ci vpaddq $H4,$D0,$D0 # d0 += h2*s3 12748c2ecf20Sopenharmony_ci 12758c2ecf20Sopenharmony_ci vpmuludq $T0,$H2,$H2 # h0*r4 12768c2ecf20Sopenharmony_ci vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 12778c2ecf20Sopenharmony_ci vpmuludq $T4,$H3,$H1 # h4*s4 12788c2ecf20Sopenharmony_ci vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 
12798c2ecf20Sopenharmony_ci vpmuludq $T3,$H3,$H0 # h3*s4 12808c2ecf20Sopenharmony_ci vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 12818c2ecf20Sopenharmony_ci vpmuludq $T2,$H3,$H1 # h2*s4 12828c2ecf20Sopenharmony_ci vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 12838c2ecf20Sopenharmony_ci vpmuludq $T1,$H3,$H3 # h1*s4 12848c2ecf20Sopenharmony_ci vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 12858c2ecf20Sopenharmony_ci 12868c2ecf20Sopenharmony_ci jz .Lshort_tail_avx # flags still from the add of 32 before .Long_tail_avx; the vector ops in between leave them alone 12878c2ecf20Sopenharmony_ci 12888c2ecf20Sopenharmony_ci vmovdqu 16*0($inp),$H0 # load input 12898c2ecf20Sopenharmony_ci vmovdqu 16*1($inp),$H1 12908c2ecf20Sopenharmony_ci 12918c2ecf20Sopenharmony_ci vpsrldq \$6,$H0,$H2 # splat input 12928c2ecf20Sopenharmony_ci vpsrldq \$6,$H1,$H3 12938c2ecf20Sopenharmony_ci vpunpckhqdq $H1,$H0,$H4 # 4 12948c2ecf20Sopenharmony_ci vpunpcklqdq $H1,$H0,$H0 # 0:1 12958c2ecf20Sopenharmony_ci vpunpcklqdq $H3,$H2,$H3 # 2:3 12968c2ecf20Sopenharmony_ci 12978c2ecf20Sopenharmony_ci vpsrlq \$40,$H4,$H4 # 4 12988c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$H1 12998c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 # 0 13008c2ecf20Sopenharmony_ci vpsrlq \$4,$H3,$H2 13018c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 # 1 13028c2ecf20Sopenharmony_ci vpsrlq \$30,$H3,$H3 13038c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 # 2 13048c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 # 3 13058c2ecf20Sopenharmony_ci vpor 32(%rcx),$H4,$H4 # padbit, yes, always 13068c2ecf20Sopenharmony_ci 13078c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 13088c2ecf20Sopenharmony_ci vpaddq 0x00(%r11),$H0,$H0 # add hash value offloaded at .Long_tail_avx 13098c2ecf20Sopenharmony_ci vpaddq 0x10(%r11),$H1,$H1 13108c2ecf20Sopenharmony_ci vpaddq 0x20(%r11),$H2,$H2 13118c2ecf20Sopenharmony_ci vpaddq 0x30(%r11),$H3,$H3 13128c2ecf20Sopenharmony_ci vpaddq 0x40(%r11),$H4,$H4 13138c2ecf20Sopenharmony_ci 13148c2ecf20Sopenharmony_ci ################################################################ 13158c2ecf20Sopenharmony_ci # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
13168c2ecf20Sopenharmony_ci 13178c2ecf20Sopenharmony_ci vpmuludq $H0,$T4,$T0 # h0*r0 13188c2ecf20Sopenharmony_ci vpaddq $T0,$D0,$D0 # d0 += h0*r0 13198c2ecf20Sopenharmony_ci vpmuludq $H1,$T4,$T1 # h1*r0 13208c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h1*r0 13218c2ecf20Sopenharmony_ci vpmuludq $H2,$T4,$T0 # h2*r0 13228c2ecf20Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h2*r0 13238c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 13248c2ecf20Sopenharmony_ci vpmuludq $H3,$T4,$T1 # h3*r0 13258c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h3*r0 13268c2ecf20Sopenharmony_ci vpmuludq $H4,$T4,$T4 # h4*r0 13278c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h4*r0 13288c2ecf20Sopenharmony_ci 13298c2ecf20Sopenharmony_ci vpmuludq $H3,$T2,$T0 # h3*r1 13308c2ecf20Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h3*r1 13318c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 13328c2ecf20Sopenharmony_ci vpmuludq $H2,$T2,$T1 # h2*r1 13338c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h2*r1 13348c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 13358c2ecf20Sopenharmony_ci vpmuludq $H1,$T2,$T0 # h1*r1 13368c2ecf20Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h1*r1 13378c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r1 13388c2ecf20Sopenharmony_ci vpaddq $T2,$D1,$D1 # d1 += h0*r1 13398c2ecf20Sopenharmony_ci vpmuludq $H4,$T3,$T3 # h4*s1 13408c2ecf20Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h4*s1 13418c2ecf20Sopenharmony_ci 13428c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 13438c2ecf20Sopenharmony_ci vpmuludq $H2,$T4,$T1 # h2*r2 13448c2ecf20Sopenharmony_ci vpaddq $T1,$D4,$D4 # d4 += h2*r2 13458c2ecf20Sopenharmony_ci vpmuludq $H1,$T4,$T0 # h1*r2 13468c2ecf20Sopenharmony_ci vpaddq $T0,$D3,$D3 # d3 += h1*r2 13478c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 13488c2ecf20Sopenharmony_ci vpmuludq $H0,$T4,$T4 # h0*r2 13498c2ecf20Sopenharmony_ci vpaddq $T4,$D2,$D2 # d2 += h0*r2 13508c2ecf20Sopenharmony_ci vpmuludq 
$H4,$T2,$T1 # h4*s2 13518c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h4*s2 13528c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 13538c2ecf20Sopenharmony_ci vpmuludq $H3,$T2,$T2 # h3*s2 13548c2ecf20Sopenharmony_ci vpaddq $T2,$D0,$D0 # d0 += h3*s2 13558c2ecf20Sopenharmony_ci 13568c2ecf20Sopenharmony_ci vpmuludq $H1,$T3,$T0 # h1*r3 13578c2ecf20Sopenharmony_ci vpaddq $T0,$D4,$D4 # d4 += h1*r3 13588c2ecf20Sopenharmony_ci vpmuludq $H0,$T3,$T3 # h0*r3 13598c2ecf20Sopenharmony_ci vpaddq $T3,$D3,$D3 # d3 += h0*r3 13608c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 13618c2ecf20Sopenharmony_ci vpmuludq $H4,$T4,$T1 # h4*s3 13628c2ecf20Sopenharmony_ci vpaddq $T1,$D2,$D2 # d2 += h4*s3 13638c2ecf20Sopenharmony_ci vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 13648c2ecf20Sopenharmony_ci vpmuludq $H3,$T4,$T0 # h3*s3 13658c2ecf20Sopenharmony_ci vpaddq $T0,$D1,$D1 # d1 += h3*s3 13668c2ecf20Sopenharmony_ci vpmuludq $H2,$T4,$T4 # h2*s3 13678c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h2*s3 13688c2ecf20Sopenharmony_ci 13698c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r4 13708c2ecf20Sopenharmony_ci vpaddq $T2,$D4,$D4 # d4 += h0*r4 13718c2ecf20Sopenharmony_ci vpmuludq $H4,$T3,$T1 # h4*s4 13728c2ecf20Sopenharmony_ci vpaddq $T1,$D3,$D3 # d3 += h4*s4 13738c2ecf20Sopenharmony_ci vpmuludq $H3,$T3,$T0 # h3*s4 13748c2ecf20Sopenharmony_ci vpaddq $T0,$D2,$D2 # d2 += h3*s4 13758c2ecf20Sopenharmony_ci vpmuludq $H2,$T3,$T1 # h2*s4 13768c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 # d1 += h2*s4 13778c2ecf20Sopenharmony_ci vpmuludq $H1,$T3,$T3 # h1*s4 13788c2ecf20Sopenharmony_ci vpaddq $T3,$D0,$D0 # d0 += h1*s4 13798c2ecf20Sopenharmony_ci 13808c2ecf20Sopenharmony_ci.Lshort_tail_avx: 13818c2ecf20Sopenharmony_ci ################################################################ 13828c2ecf20Sopenharmony_ci # horizontal addition 13838c2ecf20Sopenharmony_ci 13848c2ecf20Sopenharmony_ci vpsrldq \$8,$D4,$T4 13858c2ecf20Sopenharmony_ci vpsrldq \$8,$D3,$T3 
13868c2ecf20Sopenharmony_ci vpsrldq \$8,$D1,$T1 13878c2ecf20Sopenharmony_ci vpsrldq \$8,$D0,$T0 13888c2ecf20Sopenharmony_ci vpsrldq \$8,$D2,$T2 13898c2ecf20Sopenharmony_ci vpaddq $T3,$D3,$D3 13908c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 13918c2ecf20Sopenharmony_ci vpaddq $T0,$D0,$D0 13928c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 13938c2ecf20Sopenharmony_ci vpaddq $T2,$D2,$D2 13948c2ecf20Sopenharmony_ci 13958c2ecf20Sopenharmony_ci ################################################################ 13968c2ecf20Sopenharmony_ci # lazy reduction 13978c2ecf20Sopenharmony_ci 13988c2ecf20Sopenharmony_ci vpsrlq \$26,$D3,$H3 13998c2ecf20Sopenharmony_ci vpand $MASK,$D3,$D3 14008c2ecf20Sopenharmony_ci vpaddq $H3,$D4,$D4 # h3 -> h4 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ci vpsrlq \$26,$D0,$H0 14038c2ecf20Sopenharmony_ci vpand $MASK,$D0,$D0 14048c2ecf20Sopenharmony_ci vpaddq $H0,$D1,$D1 # h0 -> h1 14058c2ecf20Sopenharmony_ci 14068c2ecf20Sopenharmony_ci vpsrlq \$26,$D4,$H4 14078c2ecf20Sopenharmony_ci vpand $MASK,$D4,$D4 14088c2ecf20Sopenharmony_ci 14098c2ecf20Sopenharmony_ci vpsrlq \$26,$D1,$H1 14108c2ecf20Sopenharmony_ci vpand $MASK,$D1,$D1 14118c2ecf20Sopenharmony_ci vpaddq $H1,$D2,$D2 # h1 -> h2 14128c2ecf20Sopenharmony_ci 14138c2ecf20Sopenharmony_ci vpaddq $H4,$D0,$D0 14148c2ecf20Sopenharmony_ci vpsllq \$2,$H4,$H4 14158c2ecf20Sopenharmony_ci vpaddq $H4,$D0,$D0 # h4 -> h0 14168c2ecf20Sopenharmony_ci 14178c2ecf20Sopenharmony_ci vpsrlq \$26,$D2,$H2 14188c2ecf20Sopenharmony_ci vpand $MASK,$D2,$D2 14198c2ecf20Sopenharmony_ci vpaddq $H2,$D3,$D3 # h2 -> h3 14208c2ecf20Sopenharmony_ci 14218c2ecf20Sopenharmony_ci vpsrlq \$26,$D0,$H0 14228c2ecf20Sopenharmony_ci vpand $MASK,$D0,$D0 14238c2ecf20Sopenharmony_ci vpaddq $H0,$D1,$D1 # h0 -> h1 14248c2ecf20Sopenharmony_ci 14258c2ecf20Sopenharmony_ci vpsrlq \$26,$D3,$H3 14268c2ecf20Sopenharmony_ci vpand $MASK,$D3,$D3 14278c2ecf20Sopenharmony_ci vpaddq $H3,$D4,$D4 # h3 -> h4 14288c2ecf20Sopenharmony_ci 14298c2ecf20Sopenharmony_ci 
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 14308c2ecf20Sopenharmony_ci vmovd $D1,`4*1-48-64`($ctx) 14318c2ecf20Sopenharmony_ci vmovd $D2,`4*2-48-64`($ctx) 14328c2ecf20Sopenharmony_ci vmovd $D3,`4*3-48-64`($ctx) 14338c2ecf20Sopenharmony_ci vmovd $D4,`4*4-48-64`($ctx) 14348c2ecf20Sopenharmony_ci___ 14358c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 14368c2ecf20Sopenharmony_ci vmovdqa 0x50(%r11),%xmm6 14378c2ecf20Sopenharmony_ci vmovdqa 0x60(%r11),%xmm7 14388c2ecf20Sopenharmony_ci vmovdqa 0x70(%r11),%xmm8 14398c2ecf20Sopenharmony_ci vmovdqa 0x80(%r11),%xmm9 14408c2ecf20Sopenharmony_ci vmovdqa 0x90(%r11),%xmm10 14418c2ecf20Sopenharmony_ci vmovdqa 0xa0(%r11),%xmm11 14428c2ecf20Sopenharmony_ci vmovdqa 0xb0(%r11),%xmm12 14438c2ecf20Sopenharmony_ci vmovdqa 0xc0(%r11),%xmm13 14448c2ecf20Sopenharmony_ci vmovdqa 0xd0(%r11),%xmm14 14458c2ecf20Sopenharmony_ci vmovdqa 0xe0(%r11),%xmm15 14468c2ecf20Sopenharmony_ci lea 0xf8(%r11),%rsp 14478c2ecf20Sopenharmony_ci.Ldo_avx_epilogue: 14488c2ecf20Sopenharmony_ci___ 14498c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 14508c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 14518c2ecf20Sopenharmony_ci.cfi_def_cfa_register %rsp 14528c2ecf20Sopenharmony_ci___ 14538c2ecf20Sopenharmony_ci$code.=<<___; 14548c2ecf20Sopenharmony_ci vzeroupper 14558c2ecf20Sopenharmony_ci RET 14568c2ecf20Sopenharmony_ci.cfi_endproc 14578c2ecf20Sopenharmony_ci___ 14588c2ecf20Sopenharmony_ci&end_function("poly1305_blocks_avx"); 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci&declare_function("poly1305_emit_avx", 32, 3); 14618c2ecf20Sopenharmony_ci$code.=<<___; 14628c2ecf20Sopenharmony_ci cmpl \$0,20($ctx) # is_base2_26? 
14638c2ecf20Sopenharmony_ci je .Lemit 14648c2ecf20Sopenharmony_ci 14658c2ecf20Sopenharmony_ci mov 0($ctx),%eax # load hash value base 2^26 14668c2ecf20Sopenharmony_ci mov 4($ctx),%ecx 14678c2ecf20Sopenharmony_ci mov 8($ctx),%r8d 14688c2ecf20Sopenharmony_ci mov 12($ctx),%r11d 14698c2ecf20Sopenharmony_ci mov 16($ctx),%r10d 14708c2ecf20Sopenharmony_ci 14718c2ecf20Sopenharmony_ci shl \$26,%rcx # base 2^26 -> base 2^64 14728c2ecf20Sopenharmony_ci mov %r8,%r9 14738c2ecf20Sopenharmony_ci shl \$52,%r8 14748c2ecf20Sopenharmony_ci add %rcx,%rax 14758c2ecf20Sopenharmony_ci shr \$12,%r9 14768c2ecf20Sopenharmony_ci add %rax,%r8 # h0 14778c2ecf20Sopenharmony_ci adc \$0,%r9 14788c2ecf20Sopenharmony_ci 14798c2ecf20Sopenharmony_ci shl \$14,%r11 14808c2ecf20Sopenharmony_ci mov %r10,%rax 14818c2ecf20Sopenharmony_ci shr \$24,%r10 14828c2ecf20Sopenharmony_ci add %r11,%r9 14838c2ecf20Sopenharmony_ci shl \$40,%rax 14848c2ecf20Sopenharmony_ci add %rax,%r9 # h1 14858c2ecf20Sopenharmony_ci adc \$0,%r10 # h2 14868c2ecf20Sopenharmony_ci 14878c2ecf20Sopenharmony_ci mov %r10,%rax # could be partially reduced, so reduce 14888c2ecf20Sopenharmony_ci mov %r10,%rcx 14898c2ecf20Sopenharmony_ci and \$3,%r10 14908c2ecf20Sopenharmony_ci shr \$2,%rax 14918c2ecf20Sopenharmony_ci and \$-4,%rcx 14928c2ecf20Sopenharmony_ci add %rcx,%rax 14938c2ecf20Sopenharmony_ci add %rax,%r8 14948c2ecf20Sopenharmony_ci adc \$0,%r9 14958c2ecf20Sopenharmony_ci adc \$0,%r10 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_ci mov %r8,%rax 14988c2ecf20Sopenharmony_ci add \$5,%r8 # compare to modulus 14998c2ecf20Sopenharmony_ci mov %r9,%rcx 15008c2ecf20Sopenharmony_ci adc \$0,%r9 15018c2ecf20Sopenharmony_ci adc \$0,%r10 15028c2ecf20Sopenharmony_ci shr \$2,%r10 # did 130-bit value overflow? 
15038c2ecf20Sopenharmony_ci cmovnz %r8,%rax 15048c2ecf20Sopenharmony_ci cmovnz %r9,%rcx 15058c2ecf20Sopenharmony_ci 15068c2ecf20Sopenharmony_ci add 0($nonce),%rax # accumulate nonce 15078c2ecf20Sopenharmony_ci adc 8($nonce),%rcx 15088c2ecf20Sopenharmony_ci mov %rax,0($mac) # write result 15098c2ecf20Sopenharmony_ci mov %rcx,8($mac) 15108c2ecf20Sopenharmony_ci 15118c2ecf20Sopenharmony_ci RET 15128c2ecf20Sopenharmony_ci___ 15138c2ecf20Sopenharmony_ci&end_function("poly1305_emit_avx"); 15148c2ecf20Sopenharmony_ci 15158c2ecf20Sopenharmony_ciif ($avx>1) { 15168c2ecf20Sopenharmony_ci 15178c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 15188c2ecf20Sopenharmony_ci map("%ymm$_",(0..15)); 15198c2ecf20Sopenharmony_cimy $S4=$MASK; 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_cisub poly1305_blocks_avxN { 15228c2ecf20Sopenharmony_ci my ($avx512) = @_; 15238c2ecf20Sopenharmony_ci my $suffix = $avx512 ? "_avx512" : ""; 15248c2ecf20Sopenharmony_ci$code.=<<___; 15258c2ecf20Sopenharmony_ci.cfi_startproc 15268c2ecf20Sopenharmony_ci mov 20($ctx),%r8d # is_base2_26 15278c2ecf20Sopenharmony_ci cmp \$128,$len 15288c2ecf20Sopenharmony_ci jae .Lblocks_avx2$suffix 15298c2ecf20Sopenharmony_ci test %r8d,%r8d 15308c2ecf20Sopenharmony_ci jz .Lblocks 15318c2ecf20Sopenharmony_ci 15328c2ecf20Sopenharmony_ci.Lblocks_avx2$suffix: 15338c2ecf20Sopenharmony_ci and \$-16,$len 15348c2ecf20Sopenharmony_ci jz .Lno_data_avx2$suffix 15358c2ecf20Sopenharmony_ci 15368c2ecf20Sopenharmony_ci vzeroupper 15378c2ecf20Sopenharmony_ci 15388c2ecf20Sopenharmony_ci test %r8d,%r8d 15398c2ecf20Sopenharmony_ci jz .Lbase2_64_avx2$suffix 15408c2ecf20Sopenharmony_ci 15418c2ecf20Sopenharmony_ci test \$63,$len 15428c2ecf20Sopenharmony_ci jz .Leven_avx2$suffix 15438c2ecf20Sopenharmony_ci 15448c2ecf20Sopenharmony_ci push %rbp 15458c2ecf20Sopenharmony_ci.cfi_push %rbp 15468c2ecf20Sopenharmony_ci mov %rsp,%rbp 15478c2ecf20Sopenharmony_ci push %rbx 
15488c2ecf20Sopenharmony_ci.cfi_push %rbx 15498c2ecf20Sopenharmony_ci push %r12 15508c2ecf20Sopenharmony_ci.cfi_push %r12 15518c2ecf20Sopenharmony_ci push %r13 15528c2ecf20Sopenharmony_ci.cfi_push %r13 15538c2ecf20Sopenharmony_ci push %r14 15548c2ecf20Sopenharmony_ci.cfi_push %r14 15558c2ecf20Sopenharmony_ci push %r15 15568c2ecf20Sopenharmony_ci.cfi_push %r15 15578c2ecf20Sopenharmony_ci.Lblocks_avx2_body$suffix: 15588c2ecf20Sopenharmony_ci 15598c2ecf20Sopenharmony_ci mov $len,%r15 # reassign $len 15608c2ecf20Sopenharmony_ci 15618c2ecf20Sopenharmony_ci mov 0($ctx),$d1 # load hash value 15628c2ecf20Sopenharmony_ci mov 8($ctx),$d2 15638c2ecf20Sopenharmony_ci mov 16($ctx),$h2#d 15648c2ecf20Sopenharmony_ci 15658c2ecf20Sopenharmony_ci mov 24($ctx),$r0 # load r 15668c2ecf20Sopenharmony_ci mov 32($ctx),$s1 15678c2ecf20Sopenharmony_ci 15688c2ecf20Sopenharmony_ci ################################# base 2^26 -> base 2^64 15698c2ecf20Sopenharmony_ci mov $d1#d,$h0#d 15708c2ecf20Sopenharmony_ci and \$`-1*(1<<31)`,$d1 15718c2ecf20Sopenharmony_ci mov $d2,$r1 # borrow $r1 15728c2ecf20Sopenharmony_ci mov $d2#d,$h1#d 15738c2ecf20Sopenharmony_ci and \$`-1*(1<<31)`,$d2 15748c2ecf20Sopenharmony_ci 15758c2ecf20Sopenharmony_ci shr \$6,$d1 15768c2ecf20Sopenharmony_ci shl \$52,$r1 15778c2ecf20Sopenharmony_ci add $d1,$h0 15788c2ecf20Sopenharmony_ci shr \$12,$h1 15798c2ecf20Sopenharmony_ci shr \$18,$d2 15808c2ecf20Sopenharmony_ci add $r1,$h0 15818c2ecf20Sopenharmony_ci adc $d2,$h1 15828c2ecf20Sopenharmony_ci 15838c2ecf20Sopenharmony_ci mov $h2,$d1 15848c2ecf20Sopenharmony_ci shl \$40,$d1 15858c2ecf20Sopenharmony_ci shr \$24,$h2 15868c2ecf20Sopenharmony_ci add $d1,$h1 15878c2ecf20Sopenharmony_ci adc \$0,$h2 # can be partially reduced... 15888c2ecf20Sopenharmony_ci 15898c2ecf20Sopenharmony_ci mov \$-4,$d2 # ... 
so reduce 15908c2ecf20Sopenharmony_ci mov $h2,$d1 15918c2ecf20Sopenharmony_ci and $h2,$d2 15928c2ecf20Sopenharmony_ci shr \$2,$d1 15938c2ecf20Sopenharmony_ci and \$3,$h2 15948c2ecf20Sopenharmony_ci add $d2,$d1 # =*5 15958c2ecf20Sopenharmony_ci add $d1,$h0 15968c2ecf20Sopenharmony_ci adc \$0,$h1 15978c2ecf20Sopenharmony_ci adc \$0,$h2 15988c2ecf20Sopenharmony_ci 15998c2ecf20Sopenharmony_ci mov $s1,$r1 16008c2ecf20Sopenharmony_ci mov $s1,%rax 16018c2ecf20Sopenharmony_ci shr \$2,$s1 16028c2ecf20Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 16038c2ecf20Sopenharmony_ci 16048c2ecf20Sopenharmony_ci.Lbase2_26_pre_avx2$suffix: 16058c2ecf20Sopenharmony_ci add 0($inp),$h0 # accumulate input 16068c2ecf20Sopenharmony_ci adc 8($inp),$h1 16078c2ecf20Sopenharmony_ci lea 16($inp),$inp 16088c2ecf20Sopenharmony_ci adc $padbit,$h2 16098c2ecf20Sopenharmony_ci sub \$16,%r15 16108c2ecf20Sopenharmony_ci 16118c2ecf20Sopenharmony_ci call __poly1305_block 16128c2ecf20Sopenharmony_ci mov $r1,%rax 16138c2ecf20Sopenharmony_ci 16148c2ecf20Sopenharmony_ci test \$63,%r15 16158c2ecf20Sopenharmony_ci jnz .Lbase2_26_pre_avx2$suffix 16168c2ecf20Sopenharmony_ci 16178c2ecf20Sopenharmony_ci test $padbit,$padbit # if $padbit is zero, 16188c2ecf20Sopenharmony_ci jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 16198c2ecf20Sopenharmony_ci 16208c2ecf20Sopenharmony_ci ################################# base 2^64 -> base 2^26 16218c2ecf20Sopenharmony_ci mov $h0,%rax 16228c2ecf20Sopenharmony_ci mov $h0,%rdx 16238c2ecf20Sopenharmony_ci shr \$52,$h0 16248c2ecf20Sopenharmony_ci mov $h1,$r0 16258c2ecf20Sopenharmony_ci mov $h1,$r1 16268c2ecf20Sopenharmony_ci shr \$26,%rdx 16278c2ecf20Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 16288c2ecf20Sopenharmony_ci shl \$12,$r0 16298c2ecf20Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 16308c2ecf20Sopenharmony_ci shr \$14,$h1 16318c2ecf20Sopenharmony_ci or $r0,$h0 16328c2ecf20Sopenharmony_ci shl \$24,$h2 16338c2ecf20Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 
16348c2ecf20Sopenharmony_ci shr \$40,$r1 16358c2ecf20Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 16368c2ecf20Sopenharmony_ci or $r1,$h2 # h[4] 16378c2ecf20Sopenharmony_ci 16388c2ecf20Sopenharmony_ci test %r15,%r15 16398c2ecf20Sopenharmony_ci jz .Lstore_base2_26_avx2$suffix 16408c2ecf20Sopenharmony_ci 16418c2ecf20Sopenharmony_ci vmovd %rax#d,%x#$H0 16428c2ecf20Sopenharmony_ci vmovd %rdx#d,%x#$H1 16438c2ecf20Sopenharmony_ci vmovd $h0#d,%x#$H2 16448c2ecf20Sopenharmony_ci vmovd $h1#d,%x#$H3 16458c2ecf20Sopenharmony_ci vmovd $h2#d,%x#$H4 16468c2ecf20Sopenharmony_ci jmp .Lproceed_avx2$suffix 16478c2ecf20Sopenharmony_ci 16488c2ecf20Sopenharmony_ci.align 32 16498c2ecf20Sopenharmony_ci.Lstore_base2_64_avx2$suffix: 16508c2ecf20Sopenharmony_ci mov $h0,0($ctx) 16518c2ecf20Sopenharmony_ci mov $h1,8($ctx) 16528c2ecf20Sopenharmony_ci mov $h2,16($ctx) # note that is_base2_26 is zeroed 16538c2ecf20Sopenharmony_ci jmp .Ldone_avx2$suffix 16548c2ecf20Sopenharmony_ci 16558c2ecf20Sopenharmony_ci.align 16 16568c2ecf20Sopenharmony_ci.Lstore_base2_26_avx2$suffix: 16578c2ecf20Sopenharmony_ci mov %rax#d,0($ctx) # store hash value base 2^26 16588c2ecf20Sopenharmony_ci mov %rdx#d,4($ctx) 16598c2ecf20Sopenharmony_ci mov $h0#d,8($ctx) 16608c2ecf20Sopenharmony_ci mov $h1#d,12($ctx) 16618c2ecf20Sopenharmony_ci mov $h2#d,16($ctx) 16628c2ecf20Sopenharmony_ci.align 16 16638c2ecf20Sopenharmony_ci.Ldone_avx2$suffix: 16648c2ecf20Sopenharmony_ci pop %r15 16658c2ecf20Sopenharmony_ci.cfi_restore %r15 16668c2ecf20Sopenharmony_ci pop %r14 16678c2ecf20Sopenharmony_ci.cfi_restore %r14 16688c2ecf20Sopenharmony_ci pop %r13 16698c2ecf20Sopenharmony_ci.cfi_restore %r13 16708c2ecf20Sopenharmony_ci pop %r12 16718c2ecf20Sopenharmony_ci.cfi_restore %r12 16728c2ecf20Sopenharmony_ci pop %rbx 16738c2ecf20Sopenharmony_ci.cfi_restore %rbx 16748c2ecf20Sopenharmony_ci pop %rbp 16758c2ecf20Sopenharmony_ci.cfi_restore %rbp 16768c2ecf20Sopenharmony_ci.Lno_data_avx2$suffix: 
16778c2ecf20Sopenharmony_ci.Lblocks_avx2_epilogue$suffix: 16788c2ecf20Sopenharmony_ci RET 16798c2ecf20Sopenharmony_ci.cfi_endproc 16808c2ecf20Sopenharmony_ci 16818c2ecf20Sopenharmony_ci.align 32 16828c2ecf20Sopenharmony_ci.Lbase2_64_avx2$suffix: 16838c2ecf20Sopenharmony_ci.cfi_startproc 16848c2ecf20Sopenharmony_ci push %rbp 16858c2ecf20Sopenharmony_ci.cfi_push %rbp 16868c2ecf20Sopenharmony_ci mov %rsp,%rbp 16878c2ecf20Sopenharmony_ci push %rbx 16888c2ecf20Sopenharmony_ci.cfi_push %rbx 16898c2ecf20Sopenharmony_ci push %r12 16908c2ecf20Sopenharmony_ci.cfi_push %r12 16918c2ecf20Sopenharmony_ci push %r13 16928c2ecf20Sopenharmony_ci.cfi_push %r13 16938c2ecf20Sopenharmony_ci push %r14 16948c2ecf20Sopenharmony_ci.cfi_push %r14 16958c2ecf20Sopenharmony_ci push %r15 16968c2ecf20Sopenharmony_ci.cfi_push %r15 16978c2ecf20Sopenharmony_ci.Lbase2_64_avx2_body$suffix: 16988c2ecf20Sopenharmony_ci 16998c2ecf20Sopenharmony_ci mov $len,%r15 # reassign $len 17008c2ecf20Sopenharmony_ci 17018c2ecf20Sopenharmony_ci mov 24($ctx),$r0 # load r 17028c2ecf20Sopenharmony_ci mov 32($ctx),$s1 17038c2ecf20Sopenharmony_ci 17048c2ecf20Sopenharmony_ci mov 0($ctx),$h0 # load hash value 17058c2ecf20Sopenharmony_ci mov 8($ctx),$h1 17068c2ecf20Sopenharmony_ci mov 16($ctx),$h2#d 17078c2ecf20Sopenharmony_ci 17088c2ecf20Sopenharmony_ci mov $s1,$r1 17098c2ecf20Sopenharmony_ci mov $s1,%rax 17108c2ecf20Sopenharmony_ci shr \$2,$s1 17118c2ecf20Sopenharmony_ci add $r1,$s1 # s1 = r1 + (r1 >> 2) 17128c2ecf20Sopenharmony_ci 17138c2ecf20Sopenharmony_ci test \$63,$len 17148c2ecf20Sopenharmony_ci jz .Linit_avx2$suffix 17158c2ecf20Sopenharmony_ci 17168c2ecf20Sopenharmony_ci.Lbase2_64_pre_avx2$suffix: 17178c2ecf20Sopenharmony_ci add 0($inp),$h0 # accumulate input 17188c2ecf20Sopenharmony_ci adc 8($inp),$h1 17198c2ecf20Sopenharmony_ci lea 16($inp),$inp 17208c2ecf20Sopenharmony_ci adc $padbit,$h2 17218c2ecf20Sopenharmony_ci sub \$16,%r15 17228c2ecf20Sopenharmony_ci 17238c2ecf20Sopenharmony_ci call __poly1305_block 
17248c2ecf20Sopenharmony_ci mov $r1,%rax 17258c2ecf20Sopenharmony_ci 17268c2ecf20Sopenharmony_ci test \$63,%r15 17278c2ecf20Sopenharmony_ci jnz .Lbase2_64_pre_avx2$suffix 17288c2ecf20Sopenharmony_ci 17298c2ecf20Sopenharmony_ci.Linit_avx2$suffix: 17308c2ecf20Sopenharmony_ci ################################# base 2^64 -> base 2^26 17318c2ecf20Sopenharmony_ci mov $h0,%rax 17328c2ecf20Sopenharmony_ci mov $h0,%rdx 17338c2ecf20Sopenharmony_ci shr \$52,$h0 17348c2ecf20Sopenharmony_ci mov $h1,$d1 17358c2ecf20Sopenharmony_ci mov $h1,$d2 17368c2ecf20Sopenharmony_ci shr \$26,%rdx 17378c2ecf20Sopenharmony_ci and \$0x3ffffff,%rax # h[0] 17388c2ecf20Sopenharmony_ci shl \$12,$d1 17398c2ecf20Sopenharmony_ci and \$0x3ffffff,%rdx # h[1] 17408c2ecf20Sopenharmony_ci shr \$14,$h1 17418c2ecf20Sopenharmony_ci or $d1,$h0 17428c2ecf20Sopenharmony_ci shl \$24,$h2 17438c2ecf20Sopenharmony_ci and \$0x3ffffff,$h0 # h[2] 17448c2ecf20Sopenharmony_ci shr \$40,$d2 17458c2ecf20Sopenharmony_ci and \$0x3ffffff,$h1 # h[3] 17468c2ecf20Sopenharmony_ci or $d2,$h2 # h[4] 17478c2ecf20Sopenharmony_ci 17488c2ecf20Sopenharmony_ci vmovd %rax#d,%x#$H0 17498c2ecf20Sopenharmony_ci vmovd %rdx#d,%x#$H1 17508c2ecf20Sopenharmony_ci vmovd $h0#d,%x#$H2 17518c2ecf20Sopenharmony_ci vmovd $h1#d,%x#$H3 17528c2ecf20Sopenharmony_ci vmovd $h2#d,%x#$H4 17538c2ecf20Sopenharmony_ci movl \$1,20($ctx) # set is_base2_26 17548c2ecf20Sopenharmony_ci 17558c2ecf20Sopenharmony_ci call __poly1305_init_avx 17568c2ecf20Sopenharmony_ci 17578c2ecf20Sopenharmony_ci.Lproceed_avx2$suffix: 17588c2ecf20Sopenharmony_ci mov %r15,$len # restore $len 17598c2ecf20Sopenharmony_ci___ 17608c2ecf20Sopenharmony_ci$code.=<<___ if (!$kernel); 17618c2ecf20Sopenharmony_ci mov OPENSSL_ia32cap_P+8(%rip),%r9d 17628c2ecf20Sopenharmony_ci mov \$`(1<<31|1<<30|1<<16)`,%r11d 17638c2ecf20Sopenharmony_ci___ 17648c2ecf20Sopenharmony_ci$code.=<<___; 17658c2ecf20Sopenharmony_ci pop %r15 17668c2ecf20Sopenharmony_ci.cfi_restore %r15 17678c2ecf20Sopenharmony_ci pop %r14 
17688c2ecf20Sopenharmony_ci.cfi_restore %r14 17698c2ecf20Sopenharmony_ci pop %r13 17708c2ecf20Sopenharmony_ci.cfi_restore %r13 17718c2ecf20Sopenharmony_ci pop %r12 17728c2ecf20Sopenharmony_ci.cfi_restore %r12 17738c2ecf20Sopenharmony_ci pop %rbx 17748c2ecf20Sopenharmony_ci.cfi_restore %rbx 17758c2ecf20Sopenharmony_ci pop %rbp 17768c2ecf20Sopenharmony_ci.cfi_restore %rbp 17778c2ecf20Sopenharmony_ci.Lbase2_64_avx2_epilogue$suffix: 17788c2ecf20Sopenharmony_ci jmp .Ldo_avx2$suffix 17798c2ecf20Sopenharmony_ci.cfi_endproc 17808c2ecf20Sopenharmony_ci 17818c2ecf20Sopenharmony_ci.align 32 17828c2ecf20Sopenharmony_ci.Leven_avx2$suffix: 17838c2ecf20Sopenharmony_ci.cfi_startproc 17848c2ecf20Sopenharmony_ci___ 17858c2ecf20Sopenharmony_ci$code.=<<___ if (!$kernel); 17868c2ecf20Sopenharmony_ci mov OPENSSL_ia32cap_P+8(%rip),%r9d 17878c2ecf20Sopenharmony_ci___ 17888c2ecf20Sopenharmony_ci$code.=<<___; 17898c2ecf20Sopenharmony_ci vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 17908c2ecf20Sopenharmony_ci vmovd 4*1($ctx),%x#$H1 17918c2ecf20Sopenharmony_ci vmovd 4*2($ctx),%x#$H2 17928c2ecf20Sopenharmony_ci vmovd 4*3($ctx),%x#$H3 17938c2ecf20Sopenharmony_ci vmovd 4*4($ctx),%x#$H4 17948c2ecf20Sopenharmony_ci 17958c2ecf20Sopenharmony_ci.Ldo_avx2$suffix: 17968c2ecf20Sopenharmony_ci___ 17978c2ecf20Sopenharmony_ci$code.=<<___ if (!$kernel && $avx>2); 17988c2ecf20Sopenharmony_ci cmp \$512,$len 17998c2ecf20Sopenharmony_ci jb .Lskip_avx512 18008c2ecf20Sopenharmony_ci and %r11d,%r9d 18018c2ecf20Sopenharmony_ci test \$`1<<16`,%r9d # check for AVX512F 18028c2ecf20Sopenharmony_ci jnz .Lblocks_avx512 18038c2ecf20Sopenharmony_ci.Lskip_avx512$suffix: 18048c2ecf20Sopenharmony_ci___ 18058c2ecf20Sopenharmony_ci$code.=<<___ if ($avx > 2 && $avx512 && $kernel); 18068c2ecf20Sopenharmony_ci cmp \$512,$len 18078c2ecf20Sopenharmony_ci jae .Lblocks_avx512 18088c2ecf20Sopenharmony_ci___ 18098c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 18108c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 
18118c2ecf20Sopenharmony_ci.cfi_def_cfa_register %r10 18128c2ecf20Sopenharmony_ci sub \$0x128,%rsp 18138c2ecf20Sopenharmony_ci___ 18148c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 18158c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 18168c2ecf20Sopenharmony_ci sub \$0x1c8,%rsp 18178c2ecf20Sopenharmony_ci vmovdqa %xmm6,-0xb0(%r10) 18188c2ecf20Sopenharmony_ci vmovdqa %xmm7,-0xa0(%r10) 18198c2ecf20Sopenharmony_ci vmovdqa %xmm8,-0x90(%r10) 18208c2ecf20Sopenharmony_ci vmovdqa %xmm9,-0x80(%r10) 18218c2ecf20Sopenharmony_ci vmovdqa %xmm10,-0x70(%r10) 18228c2ecf20Sopenharmony_ci vmovdqa %xmm11,-0x60(%r10) 18238c2ecf20Sopenharmony_ci vmovdqa %xmm12,-0x50(%r10) 18248c2ecf20Sopenharmony_ci vmovdqa %xmm13,-0x40(%r10) 18258c2ecf20Sopenharmony_ci vmovdqa %xmm14,-0x30(%r10) 18268c2ecf20Sopenharmony_ci vmovdqa %xmm15,-0x20(%r10) 18278c2ecf20Sopenharmony_ci.Ldo_avx2_body$suffix: 18288c2ecf20Sopenharmony_ci___ 18298c2ecf20Sopenharmony_ci$code.=<<___; 18308c2ecf20Sopenharmony_ci lea .Lconst(%rip),%rcx 18318c2ecf20Sopenharmony_ci lea 48+64($ctx),$ctx # size optimization 18328c2ecf20Sopenharmony_ci vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 18338c2ecf20Sopenharmony_ci 18348c2ecf20Sopenharmony_ci # expand and copy pre-calculated table to stack 18358c2ecf20Sopenharmony_ci vmovdqu `16*0-64`($ctx),%x#$T2 18368c2ecf20Sopenharmony_ci and \$-512,%rsp 18378c2ecf20Sopenharmony_ci vmovdqu `16*1-64`($ctx),%x#$T3 18388c2ecf20Sopenharmony_ci vmovdqu `16*2-64`($ctx),%x#$T4 18398c2ecf20Sopenharmony_ci vmovdqu `16*3-64`($ctx),%x#$D0 18408c2ecf20Sopenharmony_ci vmovdqu `16*4-64`($ctx),%x#$D1 18418c2ecf20Sopenharmony_ci vmovdqu `16*5-64`($ctx),%x#$D2 18428c2ecf20Sopenharmony_ci lea 0x90(%rsp),%rax # size optimization 18438c2ecf20Sopenharmony_ci vmovdqu `16*6-64`($ctx),%x#$D3 18448c2ecf20Sopenharmony_ci vpermd $T2,$T0,$T2 # 00003412 -> 14243444 18458c2ecf20Sopenharmony_ci vmovdqu `16*7-64`($ctx),%x#$D4 18468c2ecf20Sopenharmony_ci vpermd $T3,$T0,$T3 18478c2ecf20Sopenharmony_ci vmovdqu `16*8-64`($ctx),%x#$MASK 
18488c2ecf20Sopenharmony_ci vpermd $T4,$T0,$T4 18498c2ecf20Sopenharmony_ci vmovdqa $T2,0x00(%rsp) 18508c2ecf20Sopenharmony_ci vpermd $D0,$T0,$D0 18518c2ecf20Sopenharmony_ci vmovdqa $T3,0x20-0x90(%rax) 18528c2ecf20Sopenharmony_ci vpermd $D1,$T0,$D1 18538c2ecf20Sopenharmony_ci vmovdqa $T4,0x40-0x90(%rax) 18548c2ecf20Sopenharmony_ci vpermd $D2,$T0,$D2 18558c2ecf20Sopenharmony_ci vmovdqa $D0,0x60-0x90(%rax) 18568c2ecf20Sopenharmony_ci vpermd $D3,$T0,$D3 18578c2ecf20Sopenharmony_ci vmovdqa $D1,0x80-0x90(%rax) 18588c2ecf20Sopenharmony_ci vpermd $D4,$T0,$D4 18598c2ecf20Sopenharmony_ci vmovdqa $D2,0xa0-0x90(%rax) 18608c2ecf20Sopenharmony_ci vpermd $MASK,$T0,$MASK 18618c2ecf20Sopenharmony_ci vmovdqa $D3,0xc0-0x90(%rax) 18628c2ecf20Sopenharmony_ci vmovdqa $D4,0xe0-0x90(%rax) 18638c2ecf20Sopenharmony_ci vmovdqa $MASK,0x100-0x90(%rax) 18648c2ecf20Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 18658c2ecf20Sopenharmony_ci 18668c2ecf20Sopenharmony_ci ################################################################ 18678c2ecf20Sopenharmony_ci # load input 18688c2ecf20Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 18698c2ecf20Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 18708c2ecf20Sopenharmony_ci vinserti128 \$1,16*2($inp),$T0,$T0 18718c2ecf20Sopenharmony_ci vinserti128 \$1,16*3($inp),$T1,$T1 18728c2ecf20Sopenharmony_ci lea 16*4($inp),$inp 18738c2ecf20Sopenharmony_ci 18748c2ecf20Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 18758c2ecf20Sopenharmony_ci vpsrldq \$6,$T1,$T3 18768c2ecf20Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 18778c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T2 # 2:3 18788c2ecf20Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 18798c2ecf20Sopenharmony_ci 18808c2ecf20Sopenharmony_ci vpsrlq \$30,$T2,$T3 18818c2ecf20Sopenharmony_ci vpsrlq \$4,$T2,$T2 18828c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 18838c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 18848c2ecf20Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 18858c2ecf20Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 
18868c2ecf20Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 18878c2ecf20Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 18888c2ecf20Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 18898c2ecf20Sopenharmony_ci 18908c2ecf20Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input 18918c2ecf20Sopenharmony_ci sub \$64,$len 18928c2ecf20Sopenharmony_ci jz .Ltail_avx2$suffix 18938c2ecf20Sopenharmony_ci jmp .Loop_avx2$suffix 18948c2ecf20Sopenharmony_ci 18958c2ecf20Sopenharmony_ci.align 32 18968c2ecf20Sopenharmony_ci.Loop_avx2$suffix: 18978c2ecf20Sopenharmony_ci ################################################################ 18988c2ecf20Sopenharmony_ci # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 18998c2ecf20Sopenharmony_ci # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 19008c2ecf20Sopenharmony_ci # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 19018c2ecf20Sopenharmony_ci # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 19028c2ecf20Sopenharmony_ci # \________/\__________/ 19038c2ecf20Sopenharmony_ci ################################################################ 19048c2ecf20Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 19058c2ecf20Sopenharmony_ci vpaddq $H0,$T0,$H0 19068c2ecf20Sopenharmony_ci vmovdqa `32*0`(%rsp),$T0 # r0^4 19078c2ecf20Sopenharmony_ci vpaddq $H1,$T1,$H1 19088c2ecf20Sopenharmony_ci vmovdqa `32*1`(%rsp),$T1 # r1^4 19098c2ecf20Sopenharmony_ci vpaddq $H3,$T3,$H3 19108c2ecf20Sopenharmony_ci vmovdqa `32*3`(%rsp),$T2 # r2^4 19118c2ecf20Sopenharmony_ci vpaddq $H4,$T4,$H4 19128c2ecf20Sopenharmony_ci vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 19138c2ecf20Sopenharmony_ci vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 19148c2ecf20Sopenharmony_ci 19158c2ecf20Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 19168c2ecf20Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 19178c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 19188c2ecf20Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 19198c2ecf20Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + 
h2*5*r3 + h1*5*r4 19208c2ecf20Sopenharmony_ci # 19218c2ecf20Sopenharmony_ci # however, as h2 is "chronologically" first one available pull 19228c2ecf20Sopenharmony_ci # corresponding operations up, so it's 19238c2ecf20Sopenharmony_ci # 19248c2ecf20Sopenharmony_ci # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 19258c2ecf20Sopenharmony_ci # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 19268c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 19278c2ecf20Sopenharmony_ci # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 19288c2ecf20Sopenharmony_ci # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 19298c2ecf20Sopenharmony_ci 19308c2ecf20Sopenharmony_ci vpmuludq $H2,$T0,$D2 # d2 = h2*r0 19318c2ecf20Sopenharmony_ci vpmuludq $H2,$T1,$D3 # d3 = h2*r1 19328c2ecf20Sopenharmony_ci vpmuludq $H2,$T2,$D4 # d4 = h2*r2 19338c2ecf20Sopenharmony_ci vpmuludq $H2,$T3,$D0 # d0 = h2*s3 19348c2ecf20Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 19358c2ecf20Sopenharmony_ci 19368c2ecf20Sopenharmony_ci vpmuludq $H0,$T1,$T4 # h0*r1 19378c2ecf20Sopenharmony_ci vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 19388c2ecf20Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h0*r1 19398c2ecf20Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h1*r1 19408c2ecf20Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*r1 19418c2ecf20Sopenharmony_ci vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 19428c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h3*r1 19438c2ecf20Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h4*s1 19448c2ecf20Sopenharmony_ci vmovdqa `32*4-0x90`(%rax),$T1 # s2 19458c2ecf20Sopenharmony_ci 19468c2ecf20Sopenharmony_ci vpmuludq $H0,$T0,$T4 # h0*r0 19478c2ecf20Sopenharmony_ci vpmuludq $H1,$T0,$H2 # h1*r0 19488c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h0*r0 19498c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h1*r0 19508c2ecf20Sopenharmony_ci vpmuludq $H3,$T0,$T4 # h3*r0 19518c2ecf20Sopenharmony_ci vpmuludq $H4,$T0,$H2 # h4*r0 19528c2ecf20Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 # load input 
19538c2ecf20Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h3*r0 19548c2ecf20Sopenharmony_ci vpaddq $H2,$D4,$D4 # d4 += h4*r0 19558c2ecf20Sopenharmony_ci vinserti128 \$1,16*2($inp),$T0,$T0 19568c2ecf20Sopenharmony_ci 19578c2ecf20Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*s2 19588c2ecf20Sopenharmony_ci vpmuludq $H4,$T1,$H2 # h4*s2 19598c2ecf20Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 19608c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 19618c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h4*s2 19628c2ecf20Sopenharmony_ci vmovdqa `32*5-0x90`(%rax),$H2 # r3 19638c2ecf20Sopenharmony_ci vpmuludq $H1,$T2,$T4 # h1*r2 19648c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r2 19658c2ecf20Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h1*r2 19668c2ecf20Sopenharmony_ci vpaddq $T2,$D2,$D2 # d2 += h0*r2 19678c2ecf20Sopenharmony_ci vinserti128 \$1,16*3($inp),$T1,$T1 19688c2ecf20Sopenharmony_ci lea 16*4($inp),$inp 19698c2ecf20Sopenharmony_ci 19708c2ecf20Sopenharmony_ci vpmuludq $H1,$H2,$T4 # h1*r3 19718c2ecf20Sopenharmony_ci vpmuludq $H0,$H2,$H2 # h0*r3 19728c2ecf20Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 19738c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h1*r3 19748c2ecf20Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 19758c2ecf20Sopenharmony_ci vpmuludq $H3,$T3,$T4 # h3*s3 19768c2ecf20Sopenharmony_ci vpmuludq $H4,$T3,$H2 # h4*s3 19778c2ecf20Sopenharmony_ci vpsrldq \$6,$T1,$T3 19788c2ecf20Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h3*s3 19798c2ecf20Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h4*s3 19808c2ecf20Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 19818c2ecf20Sopenharmony_ci 19828c2ecf20Sopenharmony_ci vpmuludq $H3,$S4,$H3 # h3*s4 19838c2ecf20Sopenharmony_ci vpmuludq $H4,$S4,$H4 # h4*s4 19848c2ecf20Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 19858c2ecf20Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 19868c2ecf20Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 19878c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T3 # 2:3 19888c2ecf20Sopenharmony_ci 
vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 19898c2ecf20Sopenharmony_ci vpmuludq $H1,$S4,$H0 # h1*s4 19908c2ecf20Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 19918c2ecf20Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 19928c2ecf20Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 19938c2ecf20Sopenharmony_ci 19948c2ecf20Sopenharmony_ci ################################################################ 19958c2ecf20Sopenharmony_ci # lazy reduction (interleaved with tail of input splat) 19968c2ecf20Sopenharmony_ci 19978c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 19988c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 19998c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 20008c2ecf20Sopenharmony_ci 20018c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 20028c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 20038c2ecf20Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 20048c2ecf20Sopenharmony_ci 20058c2ecf20Sopenharmony_ci vpsrlq \$26,$H4,$D4 20068c2ecf20Sopenharmony_ci vpand $MASK,$H4,$H4 20078c2ecf20Sopenharmony_ci 20088c2ecf20Sopenharmony_ci vpsrlq \$4,$T3,$T2 20098c2ecf20Sopenharmony_ci 20108c2ecf20Sopenharmony_ci vpsrlq \$26,$H1,$D1 20118c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 20128c2ecf20Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 20138c2ecf20Sopenharmony_ci 20148c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 20158c2ecf20Sopenharmony_ci vpsllq \$2,$D4,$D4 20168c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 20178c2ecf20Sopenharmony_ci 20188c2ecf20Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 20198c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 20208c2ecf20Sopenharmony_ci 20218c2ecf20Sopenharmony_ci vpsrlq \$26,$H2,$D2 20228c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 20238c2ecf20Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 20248c2ecf20Sopenharmony_ci 20258c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # modulo-scheduled 20268c2ecf20Sopenharmony_ci vpsrlq \$30,$T3,$T3 20278c2ecf20Sopenharmony_ci 20288c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 20298c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 
20308c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 20318c2ecf20Sopenharmony_ci 20328c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 20338c2ecf20Sopenharmony_ci 20348c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 20358c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 20368c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 20378c2ecf20Sopenharmony_ci 20388c2ecf20Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 20398c2ecf20Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 20408c2ecf20Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 20418c2ecf20Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 20428c2ecf20Sopenharmony_ci 20438c2ecf20Sopenharmony_ci sub \$64,$len 20448c2ecf20Sopenharmony_ci jnz .Loop_avx2$suffix 20458c2ecf20Sopenharmony_ci 20468c2ecf20Sopenharmony_ci .byte 0x66,0x90 20478c2ecf20Sopenharmony_ci.Ltail_avx2$suffix: 20488c2ecf20Sopenharmony_ci ################################################################ 20498c2ecf20Sopenharmony_ci # while above multiplications were by r^4 in all lanes, in last 20508c2ecf20Sopenharmony_ci # iteration we multiply least significant lane by r^4 and most 20518c2ecf20Sopenharmony_ci # significant one by r, so copy of above except that references 20528c2ecf20Sopenharmony_ci # to the precomputed table are displaced by 4... 
20538c2ecf20Sopenharmony_ci 20548c2ecf20Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 20558c2ecf20Sopenharmony_ci vpaddq $H0,$T0,$H0 20568c2ecf20Sopenharmony_ci vmovdqu `32*0+4`(%rsp),$T0 # r0^4 20578c2ecf20Sopenharmony_ci vpaddq $H1,$T1,$H1 20588c2ecf20Sopenharmony_ci vmovdqu `32*1+4`(%rsp),$T1 # r1^4 20598c2ecf20Sopenharmony_ci vpaddq $H3,$T3,$H3 20608c2ecf20Sopenharmony_ci vmovdqu `32*3+4`(%rsp),$T2 # r2^4 20618c2ecf20Sopenharmony_ci vpaddq $H4,$T4,$H4 20628c2ecf20Sopenharmony_ci vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 20638c2ecf20Sopenharmony_ci vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 20648c2ecf20Sopenharmony_ci 20658c2ecf20Sopenharmony_ci vpmuludq $H2,$T0,$D2 # d2 = h2*r0 20668c2ecf20Sopenharmony_ci vpmuludq $H2,$T1,$D3 # d3 = h2*r1 20678c2ecf20Sopenharmony_ci vpmuludq $H2,$T2,$D4 # d4 = h2*r2 20688c2ecf20Sopenharmony_ci vpmuludq $H2,$T3,$D0 # d0 = h2*s3 20698c2ecf20Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 20708c2ecf20Sopenharmony_ci 20718c2ecf20Sopenharmony_ci vpmuludq $H0,$T1,$T4 # h0*r1 20728c2ecf20Sopenharmony_ci vpmuludq $H1,$T1,$H2 # h1*r1 20738c2ecf20Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h0*r1 20748c2ecf20Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h1*r1 20758c2ecf20Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*r1 20768c2ecf20Sopenharmony_ci vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 20778c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h3*r1 20788c2ecf20Sopenharmony_ci vpaddq $H2,$D0,$D0 # d0 += h4*s1 20798c2ecf20Sopenharmony_ci 20808c2ecf20Sopenharmony_ci vpmuludq $H0,$T0,$T4 # h0*r0 20818c2ecf20Sopenharmony_ci vpmuludq $H1,$T0,$H2 # h1*r0 20828c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h0*r0 20838c2ecf20Sopenharmony_ci vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 20848c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h1*r0 20858c2ecf20Sopenharmony_ci vpmuludq $H3,$T0,$T4 # h3*r0 20868c2ecf20Sopenharmony_ci vpmuludq $H4,$T0,$H2 # h4*r0 20878c2ecf20Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h3*r0 20888c2ecf20Sopenharmony_ci vpaddq 
$H2,$D4,$D4 # d4 += h4*r0 20898c2ecf20Sopenharmony_ci 20908c2ecf20Sopenharmony_ci vpmuludq $H3,$T1,$T4 # h3*s2 20918c2ecf20Sopenharmony_ci vpmuludq $H4,$T1,$H2 # h4*s2 20928c2ecf20Sopenharmony_ci vpaddq $T4,$D0,$D0 # d0 += h3*s2 20938c2ecf20Sopenharmony_ci vpaddq $H2,$D1,$D1 # d1 += h4*s2 20948c2ecf20Sopenharmony_ci vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 20958c2ecf20Sopenharmony_ci vpmuludq $H1,$T2,$T4 # h1*r2 20968c2ecf20Sopenharmony_ci vpmuludq $H0,$T2,$T2 # h0*r2 20978c2ecf20Sopenharmony_ci vpaddq $T4,$D3,$D3 # d3 += h1*r2 20988c2ecf20Sopenharmony_ci vpaddq $T2,$D2,$D2 # d2 += h0*r2 20998c2ecf20Sopenharmony_ci 21008c2ecf20Sopenharmony_ci vpmuludq $H1,$H2,$T4 # h1*r3 21018c2ecf20Sopenharmony_ci vpmuludq $H0,$H2,$H2 # h0*r3 21028c2ecf20Sopenharmony_ci vpaddq $T4,$D4,$D4 # d4 += h1*r3 21038c2ecf20Sopenharmony_ci vpaddq $H2,$D3,$D3 # d3 += h0*r3 21048c2ecf20Sopenharmony_ci vpmuludq $H3,$T3,$T4 # h3*s3 21058c2ecf20Sopenharmony_ci vpmuludq $H4,$T3,$H2 # h4*s3 21068c2ecf20Sopenharmony_ci vpaddq $T4,$D1,$D1 # d1 += h3*s3 21078c2ecf20Sopenharmony_ci vpaddq $H2,$D2,$D2 # d2 += h4*s3 21088c2ecf20Sopenharmony_ci 21098c2ecf20Sopenharmony_ci vpmuludq $H3,$S4,$H3 # h3*s4 21108c2ecf20Sopenharmony_ci vpmuludq $H4,$S4,$H4 # h4*s4 21118c2ecf20Sopenharmony_ci vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 21128c2ecf20Sopenharmony_ci vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 21138c2ecf20Sopenharmony_ci vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 21148c2ecf20Sopenharmony_ci vpmuludq $H1,$S4,$H0 # h1*s4 21158c2ecf20Sopenharmony_ci vmovdqa 64(%rcx),$MASK # .Lmask26 21168c2ecf20Sopenharmony_ci vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 21178c2ecf20Sopenharmony_ci vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 21188c2ecf20Sopenharmony_ci 21198c2ecf20Sopenharmony_ci ################################################################ 21208c2ecf20Sopenharmony_ci # horizontal addition 21218c2ecf20Sopenharmony_ci 21228c2ecf20Sopenharmony_ci vpsrldq \$8,$D1,$T1 21238c2ecf20Sopenharmony_ci vpsrldq \$8,$H2,$T2
21248c2ecf20Sopenharmony_ci vpsrldq \$8,$H3,$T3 21258c2ecf20Sopenharmony_ci vpsrldq \$8,$H4,$T4 21268c2ecf20Sopenharmony_ci vpsrldq \$8,$H0,$T0 21278c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 21288c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 21298c2ecf20Sopenharmony_ci vpaddq $T3,$H3,$H3 21308c2ecf20Sopenharmony_ci vpaddq $T4,$H4,$H4 21318c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 21328c2ecf20Sopenharmony_ci 21338c2ecf20Sopenharmony_ci vpermq \$0x2,$H3,$T3 21348c2ecf20Sopenharmony_ci vpermq \$0x2,$H4,$T4 21358c2ecf20Sopenharmony_ci vpermq \$0x2,$H0,$T0 21368c2ecf20Sopenharmony_ci vpermq \$0x2,$D1,$T1 21378c2ecf20Sopenharmony_ci vpermq \$0x2,$H2,$T2 21388c2ecf20Sopenharmony_ci vpaddq $T3,$H3,$H3 21398c2ecf20Sopenharmony_ci vpaddq $T4,$H4,$H4 21408c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 21418c2ecf20Sopenharmony_ci vpaddq $T1,$D1,$D1 21428c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 21438c2ecf20Sopenharmony_ci 21448c2ecf20Sopenharmony_ci ################################################################ 21458c2ecf20Sopenharmony_ci # lazy reduction 21468c2ecf20Sopenharmony_ci 21478c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 21488c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 21498c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 21508c2ecf20Sopenharmony_ci 21518c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 21528c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 21538c2ecf20Sopenharmony_ci vpaddq $D0,$D1,$H1 # h0 -> h1 21548c2ecf20Sopenharmony_ci 21558c2ecf20Sopenharmony_ci vpsrlq \$26,$H4,$D4 21568c2ecf20Sopenharmony_ci vpand $MASK,$H4,$H4 21578c2ecf20Sopenharmony_ci 21588c2ecf20Sopenharmony_ci vpsrlq \$26,$H1,$D1 21598c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 21608c2ecf20Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 21618c2ecf20Sopenharmony_ci 21628c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 21638c2ecf20Sopenharmony_ci vpsllq \$2,$D4,$D4 21648c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 21658c2ecf20Sopenharmony_ci 21668c2ecf20Sopenharmony_ci vpsrlq \$26,$H2,$D2 
21678c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 21688c2ecf20Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 21698c2ecf20Sopenharmony_ci 21708c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 21718c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 21728c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 21738c2ecf20Sopenharmony_ci 21748c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 21758c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 21768c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 21778c2ecf20Sopenharmony_ci 21788c2ecf20Sopenharmony_ci vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 21798c2ecf20Sopenharmony_ci vmovd %x#$H1,`4*1-48-64`($ctx) 21808c2ecf20Sopenharmony_ci vmovd %x#$H2,`4*2-48-64`($ctx) 21818c2ecf20Sopenharmony_ci vmovd %x#$H3,`4*3-48-64`($ctx) 21828c2ecf20Sopenharmony_ci vmovd %x#$H4,`4*4-48-64`($ctx) 21838c2ecf20Sopenharmony_ci___ 21848c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 21858c2ecf20Sopenharmony_ci vmovdqa -0xb0(%r10),%xmm6 21868c2ecf20Sopenharmony_ci vmovdqa -0xa0(%r10),%xmm7 21878c2ecf20Sopenharmony_ci vmovdqa -0x90(%r10),%xmm8 21888c2ecf20Sopenharmony_ci vmovdqa -0x80(%r10),%xmm9 21898c2ecf20Sopenharmony_ci vmovdqa -0x70(%r10),%xmm10 21908c2ecf20Sopenharmony_ci vmovdqa -0x60(%r10),%xmm11 21918c2ecf20Sopenharmony_ci vmovdqa -0x50(%r10),%xmm12 21928c2ecf20Sopenharmony_ci vmovdqa -0x40(%r10),%xmm13 21938c2ecf20Sopenharmony_ci vmovdqa -0x30(%r10),%xmm14 21948c2ecf20Sopenharmony_ci vmovdqa -0x20(%r10),%xmm15 21958c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 21968c2ecf20Sopenharmony_ci.Ldo_avx2_epilogue$suffix: 21978c2ecf20Sopenharmony_ci___ 21988c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 21998c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 22008c2ecf20Sopenharmony_ci.cfi_def_cfa_register %rsp 22018c2ecf20Sopenharmony_ci___ 22028c2ecf20Sopenharmony_ci$code.=<<___; 22038c2ecf20Sopenharmony_ci vzeroupper 22048c2ecf20Sopenharmony_ci RET 22058c2ecf20Sopenharmony_ci.cfi_endproc 22068c2ecf20Sopenharmony_ci___ 22078c2ecf20Sopenharmony_ciif($avx > 2 && $avx512) { 
22088c2ecf20Sopenharmony_cimy ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 22098c2ecf20Sopenharmony_cimy ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 22108c2ecf20Sopenharmony_cimy $PADBIT="%zmm30"; 22118c2ecf20Sopenharmony_ci 22128c2ecf20Sopenharmony_cimap(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 22138c2ecf20Sopenharmony_cimap(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 22148c2ecf20Sopenharmony_cimap(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 22158c2ecf20Sopenharmony_cimap(s/%y/%z/,($MASK)); 22168c2ecf20Sopenharmony_ci 22178c2ecf20Sopenharmony_ci$code.=<<___; 22188c2ecf20Sopenharmony_ci.cfi_startproc 22198c2ecf20Sopenharmony_ci.Lblocks_avx512: 22208c2ecf20Sopenharmony_ci mov \$15,%eax 22218c2ecf20Sopenharmony_ci kmovw %eax,%k2 22228c2ecf20Sopenharmony_ci___ 22238c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 22248c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 22258c2ecf20Sopenharmony_ci.cfi_def_cfa_register %r10 22268c2ecf20Sopenharmony_ci sub \$0x128,%rsp 22278c2ecf20Sopenharmony_ci___ 22288c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 22298c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 22308c2ecf20Sopenharmony_ci sub \$0x1c8,%rsp 22318c2ecf20Sopenharmony_ci vmovdqa %xmm6,-0xb0(%r10) 22328c2ecf20Sopenharmony_ci vmovdqa %xmm7,-0xa0(%r10) 22338c2ecf20Sopenharmony_ci vmovdqa %xmm8,-0x90(%r10) 22348c2ecf20Sopenharmony_ci vmovdqa %xmm9,-0x80(%r10) 22358c2ecf20Sopenharmony_ci vmovdqa %xmm10,-0x70(%r10) 22368c2ecf20Sopenharmony_ci vmovdqa %xmm11,-0x60(%r10) 22378c2ecf20Sopenharmony_ci vmovdqa %xmm12,-0x50(%r10) 22388c2ecf20Sopenharmony_ci vmovdqa %xmm13,-0x40(%r10) 22398c2ecf20Sopenharmony_ci vmovdqa %xmm14,-0x30(%r10) 22408c2ecf20Sopenharmony_ci vmovdqa %xmm15,-0x20(%r10) 22418c2ecf20Sopenharmony_ci.Ldo_avx512_body: 22428c2ecf20Sopenharmony_ci___ 22438c2ecf20Sopenharmony_ci$code.=<<___; 22448c2ecf20Sopenharmony_ci lea .Lconst(%rip),%rcx 22458c2ecf20Sopenharmony_ci lea 48+64($ctx),$ctx # size optimization 22468c2ecf20Sopenharmony_ci vmovdqa 96(%rcx),%y#$T2 # 
.Lpermd_avx2 22478c2ecf20Sopenharmony_ci 22488c2ecf20Sopenharmony_ci # expand pre-calculated table 22498c2ecf20Sopenharmony_ci vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 22508c2ecf20Sopenharmony_ci and \$-512,%rsp 22518c2ecf20Sopenharmony_ci vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 22528c2ecf20Sopenharmony_ci mov \$0x20,%rax 22538c2ecf20Sopenharmony_ci vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 22548c2ecf20Sopenharmony_ci vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 22558c2ecf20Sopenharmony_ci vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 22568c2ecf20Sopenharmony_ci vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 22578c2ecf20Sopenharmony_ci vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 22588c2ecf20Sopenharmony_ci vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 22598c2ecf20Sopenharmony_ci vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} 22608c2ecf20Sopenharmony_ci vpermd $D0,$T2,$R0 # 00003412 -> 14243444 22618c2ecf20Sopenharmony_ci vpbroadcastq 64(%rcx),$MASK # .Lmask26 22628c2ecf20Sopenharmony_ci vpermd $D1,$T2,$R1 22638c2ecf20Sopenharmony_ci vpermd $T0,$T2,$S1 22648c2ecf20Sopenharmony_ci vpermd $D2,$T2,$R2 22658c2ecf20Sopenharmony_ci vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 22668c2ecf20Sopenharmony_ci vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 22678c2ecf20Sopenharmony_ci vpermd $T1,$T2,$S2 22688c2ecf20Sopenharmony_ci vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 22698c2ecf20Sopenharmony_ci vpsrlq \$32,$R1,$T1 22708c2ecf20Sopenharmony_ci vpermd $D3,$T2,$R3 22718c2ecf20Sopenharmony_ci vmovdqa64 $S1,0x40(%rsp){%k2} 22728c2ecf20Sopenharmony_ci vpermd $T3,$T2,$S3 22738c2ecf20Sopenharmony_ci vpermd $D4,$T2,$R4 22748c2ecf20Sopenharmony_ci vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 22758c2ecf20Sopenharmony_ci vpermd $T4,$T2,$S4 22768c2ecf20Sopenharmony_ci vmovdqa64 $S2,0x80(%rsp){%k2} 22778c2ecf20Sopenharmony_ci vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 22788c2ecf20Sopenharmony_ci vmovdqa64 $S3,0xc0(%rsp){%k2} 22798c2ecf20Sopenharmony_ci vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 
22808c2ecf20Sopenharmony_ci vmovdqa64 $S4,0x100(%rsp){%k2} 22818c2ecf20Sopenharmony_ci 22828c2ecf20Sopenharmony_ci ################################################################ 22838c2ecf20Sopenharmony_ci # calculate 5th through 8th powers of the key 22848c2ecf20Sopenharmony_ci # 22858c2ecf20Sopenharmony_ci # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 22868c2ecf20Sopenharmony_ci # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 22878c2ecf20Sopenharmony_ci # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 22888c2ecf20Sopenharmony_ci # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 22898c2ecf20Sopenharmony_ci # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 22908c2ecf20Sopenharmony_ci 22918c2ecf20Sopenharmony_ci vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 22928c2ecf20Sopenharmony_ci vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 22938c2ecf20Sopenharmony_ci vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 22948c2ecf20Sopenharmony_ci vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 22958c2ecf20Sopenharmony_ci vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 22968c2ecf20Sopenharmony_ci vpsrlq \$32,$R2,$T2 22978c2ecf20Sopenharmony_ci 22988c2ecf20Sopenharmony_ci vpmuludq $T1,$S4,$M0 22998c2ecf20Sopenharmony_ci vpmuludq $T1,$R0,$M1 23008c2ecf20Sopenharmony_ci vpmuludq $T1,$R1,$M2 23018c2ecf20Sopenharmony_ci vpmuludq $T1,$R2,$M3 23028c2ecf20Sopenharmony_ci vpmuludq $T1,$R3,$M4 23038c2ecf20Sopenharmony_ci vpsrlq \$32,$R3,$T3 23048c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 23058c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r1'*r0 23068c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r1'*r1 23078c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r1'*r2 23088c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r1'*r3 23098c2ecf20Sopenharmony_ci 23108c2ecf20Sopenharmony_ci vpmuludq $T2,$S3,$M0 23118c2ecf20Sopenharmony_ci vpmuludq $T2,$S4,$M1 23128c2ecf20Sopenharmony_ci vpmuludq $T2,$R1,$M3 23138c2ecf20Sopenharmony_ci vpmuludq $T2,$R2,$M4 23148c2ecf20Sopenharmony_ci vpmuludq $T2,$R0,$M2 
23158c2ecf20Sopenharmony_ci vpsrlq \$32,$R4,$T4 23168c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 23178c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 23188c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r2'*r1 23198c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r2'*r2 23208c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r2'*r0 23218c2ecf20Sopenharmony_ci 23228c2ecf20Sopenharmony_ci vpmuludq $T3,$S2,$M0 23238c2ecf20Sopenharmony_ci vpmuludq $T3,$R0,$M3 23248c2ecf20Sopenharmony_ci vpmuludq $T3,$R1,$M4 23258c2ecf20Sopenharmony_ci vpmuludq $T3,$S3,$M1 23268c2ecf20Sopenharmony_ci vpmuludq $T3,$S4,$M2 23278c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 23288c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r3'*r0 23298c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r3'*r1 23308c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 23318c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 23328c2ecf20Sopenharmony_ci 23338c2ecf20Sopenharmony_ci vpmuludq $T4,$S4,$M3 23348c2ecf20Sopenharmony_ci vpmuludq $T4,$R0,$M4 23358c2ecf20Sopenharmony_ci vpmuludq $T4,$S1,$M0 23368c2ecf20Sopenharmony_ci vpmuludq $T4,$S2,$M1 23378c2ecf20Sopenharmony_ci vpmuludq $T4,$S3,$M2 23388c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4 23398c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += r4'*r0 23408c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1 23418c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2 23428c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3 23438c2ecf20Sopenharmony_ci 23448c2ecf20Sopenharmony_ci ################################################################ 23458c2ecf20Sopenharmony_ci # load input 23468c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),%z#$T3 23478c2ecf20Sopenharmony_ci vmovdqu64 16*4($inp),%z#$T4 23488c2ecf20Sopenharmony_ci lea 16*8($inp),$inp 23498c2ecf20Sopenharmony_ci 23508c2ecf20Sopenharmony_ci ################################################################
23518c2ecf20Sopenharmony_ci # lazy reduction 23528c2ecf20Sopenharmony_ci 23538c2ecf20Sopenharmony_ci vpsrlq \$26,$D3,$M3 23548c2ecf20Sopenharmony_ci vpandq $MASK,$D3,$D3 23558c2ecf20Sopenharmony_ci vpaddq $M3,$D4,$D4 # d3 -> d4 23568c2ecf20Sopenharmony_ci 23578c2ecf20Sopenharmony_ci vpsrlq \$26,$D0,$M0 23588c2ecf20Sopenharmony_ci vpandq $MASK,$D0,$D0 23598c2ecf20Sopenharmony_ci vpaddq $M0,$D1,$D1 # d0 -> d1 23608c2ecf20Sopenharmony_ci 23618c2ecf20Sopenharmony_ci vpsrlq \$26,$D4,$M4 23628c2ecf20Sopenharmony_ci vpandq $MASK,$D4,$D4 23638c2ecf20Sopenharmony_ci 23648c2ecf20Sopenharmony_ci vpsrlq \$26,$D1,$M1 23658c2ecf20Sopenharmony_ci vpandq $MASK,$D1,$D1 23668c2ecf20Sopenharmony_ci vpaddq $M1,$D2,$D2 # d1 -> d2 23678c2ecf20Sopenharmony_ci 23688c2ecf20Sopenharmony_ci vpaddq $M4,$D0,$D0 23698c2ecf20Sopenharmony_ci vpsllq \$2,$M4,$M4 23708c2ecf20Sopenharmony_ci vpaddq $M4,$D0,$D0 # d4 -> d0 23718c2ecf20Sopenharmony_ci 23728c2ecf20Sopenharmony_ci vpsrlq \$26,$D2,$M2 23738c2ecf20Sopenharmony_ci vpandq $MASK,$D2,$D2 23748c2ecf20Sopenharmony_ci vpaddq $M2,$D3,$D3 # d2 -> d3 23758c2ecf20Sopenharmony_ci 23768c2ecf20Sopenharmony_ci vpsrlq \$26,$D0,$M0 23778c2ecf20Sopenharmony_ci vpandq $MASK,$D0,$D0 23788c2ecf20Sopenharmony_ci vpaddq $M0,$D1,$D1 # d0 -> d1 23798c2ecf20Sopenharmony_ci 23808c2ecf20Sopenharmony_ci vpsrlq \$26,$D3,$M3 23818c2ecf20Sopenharmony_ci vpandq $MASK,$D3,$D3 23828c2ecf20Sopenharmony_ci vpaddq $M3,$D4,$D4 # d3 -> d4 23838c2ecf20Sopenharmony_ci 23848c2ecf20Sopenharmony_ci ################################################################ 23858c2ecf20Sopenharmony_ci # at this point we have 14243444 in $R0-$S4 and 05060708 in 23868c2ecf20Sopenharmony_ci # $D0-$D4, ... 23878c2ecf20Sopenharmony_ci 23888c2ecf20Sopenharmony_ci vpunpcklqdq $T4,$T3,$T0 # transpose input 23898c2ecf20Sopenharmony_ci vpunpckhqdq $T4,$T3,$T4 23908c2ecf20Sopenharmony_ci 23918c2ecf20Sopenharmony_ci # ... 
since input 64-bit lanes are ordered as 73625140, we could 23928c2ecf20Sopenharmony_ci # "vperm" it to 76543210 (here and in each loop iteration), *or* 23938c2ecf20Sopenharmony_ci # we could just flow along, hence the goal for $R0-$S4 is 23948c2ecf20Sopenharmony_ci # 1858286838784888 ... 23958c2ecf20Sopenharmony_ci 23968c2ecf20Sopenharmony_ci vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 23978c2ecf20Sopenharmony_ci mov \$0x7777,%eax 23988c2ecf20Sopenharmony_ci kmovw %eax,%k1 23998c2ecf20Sopenharmony_ci 24008c2ecf20Sopenharmony_ci vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 24018c2ecf20Sopenharmony_ci vpermd $R1,$M0,$R1 24028c2ecf20Sopenharmony_ci vpermd $R2,$M0,$R2 24038c2ecf20Sopenharmony_ci vpermd $R3,$M0,$R3 24048c2ecf20Sopenharmony_ci vpermd $R4,$M0,$R4 24058c2ecf20Sopenharmony_ci 24068c2ecf20Sopenharmony_ci vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 24078c2ecf20Sopenharmony_ci vpermd $D1,$M0,${R1}{%k1} 24088c2ecf20Sopenharmony_ci vpermd $D2,$M0,${R2}{%k1} 24098c2ecf20Sopenharmony_ci vpermd $D3,$M0,${R3}{%k1} 24108c2ecf20Sopenharmony_ci vpermd $D4,$M0,${R4}{%k1} 24118c2ecf20Sopenharmony_ci 24128c2ecf20Sopenharmony_ci vpslld \$2,$R1,$S1 # *5 24138c2ecf20Sopenharmony_ci vpslld \$2,$R2,$S2 24148c2ecf20Sopenharmony_ci vpslld \$2,$R3,$S3 24158c2ecf20Sopenharmony_ci vpslld \$2,$R4,$S4 24168c2ecf20Sopenharmony_ci vpaddd $R1,$S1,$S1 24178c2ecf20Sopenharmony_ci vpaddd $R2,$S2,$S2 24188c2ecf20Sopenharmony_ci vpaddd $R3,$S3,$S3 24198c2ecf20Sopenharmony_ci vpaddd $R4,$S4,$S4 24208c2ecf20Sopenharmony_ci 24218c2ecf20Sopenharmony_ci vpbroadcastq 32(%rcx),$PADBIT # .L129 24228c2ecf20Sopenharmony_ci 24238c2ecf20Sopenharmony_ci vpsrlq \$52,$T0,$T2 # splat input 24248c2ecf20Sopenharmony_ci vpsllq \$12,$T4,$T3 24258c2ecf20Sopenharmony_ci vporq $T3,$T2,$T2 24268c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 24278c2ecf20Sopenharmony_ci vpsrlq \$14,$T4,$T3 24288c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 24298c2ecf20Sopenharmony_ci vpandq $MASK,$T2,$T2 # 2 
24308c2ecf20Sopenharmony_ci vpandq $MASK,$T0,$T0 # 0 24318c2ecf20Sopenharmony_ci #vpandq $MASK,$T1,$T1 # 1 24328c2ecf20Sopenharmony_ci #vpandq $MASK,$T3,$T3 # 3 24338c2ecf20Sopenharmony_ci #vporq $PADBIT,$T4,$T4 # padbit, yes, always 24348c2ecf20Sopenharmony_ci 24358c2ecf20Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input 24368c2ecf20Sopenharmony_ci sub \$192,$len 24378c2ecf20Sopenharmony_ci jbe .Ltail_avx512 24388c2ecf20Sopenharmony_ci jmp .Loop_avx512 24398c2ecf20Sopenharmony_ci 24408c2ecf20Sopenharmony_ci.align 32 24418c2ecf20Sopenharmony_ci.Loop_avx512: 24428c2ecf20Sopenharmony_ci ################################################################ 24438c2ecf20Sopenharmony_ci # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 24448c2ecf20Sopenharmony_ci # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 24458c2ecf20Sopenharmony_ci # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 24468c2ecf20Sopenharmony_ci # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 24478c2ecf20Sopenharmony_ci # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 24488c2ecf20Sopenharmony_ci # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 24498c2ecf20Sopenharmony_ci # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 24508c2ecf20Sopenharmony_ci # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 24518c2ecf20Sopenharmony_ci # \________/\___________/ 24528c2ecf20Sopenharmony_ci ################################################################ 24538c2ecf20Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 24548c2ecf20Sopenharmony_ci 24558c2ecf20Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 24568c2ecf20Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 24578c2ecf20Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 24588c2ecf20Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 24598c2ecf20Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 24608c2ecf20Sopenharmony_ci # 24618c2ecf20Sopenharmony_ci # however, as h2 is "chronologically" first one available pull 24628c2ecf20Sopenharmony_ci # 
corresponding operations up, so it's 24638c2ecf20Sopenharmony_ci # 24648c2ecf20Sopenharmony_ci # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 24658c2ecf20Sopenharmony_ci # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 24668c2ecf20Sopenharmony_ci # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 24678c2ecf20Sopenharmony_ci # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 24688c2ecf20Sopenharmony_ci # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 24698c2ecf20Sopenharmony_ci 24708c2ecf20Sopenharmony_ci vpmuludq $H2,$R1,$D3 # d3 = h2*r1 24718c2ecf20Sopenharmony_ci vpaddq $H0,$T0,$H0 24728c2ecf20Sopenharmony_ci vpmuludq $H2,$R2,$D4 # d4 = h2*r2 24738c2ecf20Sopenharmony_ci vpandq $MASK,$T1,$T1 # 1 24748c2ecf20Sopenharmony_ci vpmuludq $H2,$S3,$D0 # d0 = h2*s3 24758c2ecf20Sopenharmony_ci vpandq $MASK,$T3,$T3 # 3 24768c2ecf20Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 24778c2ecf20Sopenharmony_ci vporq $PADBIT,$T4,$T4 # padbit, yes, always 24788c2ecf20Sopenharmony_ci vpmuludq $H2,$R0,$D2 # d2 = h2*r0 24798c2ecf20Sopenharmony_ci vpaddq $H1,$T1,$H1 # accumulate input 24808c2ecf20Sopenharmony_ci vpaddq $H3,$T3,$H3 24818c2ecf20Sopenharmony_ci vpaddq $H4,$T4,$H4 24828c2ecf20Sopenharmony_ci 24838c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T3 # load input 24848c2ecf20Sopenharmony_ci vmovdqu64 16*4($inp),$T4 24858c2ecf20Sopenharmony_ci lea 16*8($inp),$inp 24868c2ecf20Sopenharmony_ci vpmuludq $H0,$R3,$M3 24878c2ecf20Sopenharmony_ci vpmuludq $H0,$R4,$M4 24888c2ecf20Sopenharmony_ci vpmuludq $H0,$R0,$M0 24898c2ecf20Sopenharmony_ci vpmuludq $H0,$R1,$M1 24908c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h0*r3 24918c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h0*r4 24928c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h0*r0 24938c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h0*r1 24948c2ecf20Sopenharmony_ci 24958c2ecf20Sopenharmony_ci vpmuludq $H1,$R2,$M3 24968c2ecf20Sopenharmony_ci vpmuludq $H1,$R3,$M4 24978c2ecf20Sopenharmony_ci vpmuludq $H1,$S4,$M0 
24988c2ecf20Sopenharmony_ci vpmuludq $H0,$R2,$M2 24998c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h1*r2 25008c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h1*r3 25018c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h1*s4 25028c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h0*r2 25038c2ecf20Sopenharmony_ci 25048c2ecf20Sopenharmony_ci vpunpcklqdq $T4,$T3,$T0 # transpose input 25058c2ecf20Sopenharmony_ci vpunpckhqdq $T4,$T3,$T4 25068c2ecf20Sopenharmony_ci 25078c2ecf20Sopenharmony_ci vpmuludq $H3,$R0,$M3 25088c2ecf20Sopenharmony_ci vpmuludq $H3,$R1,$M4 25098c2ecf20Sopenharmony_ci vpmuludq $H1,$R0,$M1 25108c2ecf20Sopenharmony_ci vpmuludq $H1,$R1,$M2 25118c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h3*r0 25128c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h3*r1 25138c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h1*r0 25148c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h1*r1 25158c2ecf20Sopenharmony_ci 25168c2ecf20Sopenharmony_ci vpmuludq $H4,$S4,$M3 25178c2ecf20Sopenharmony_ci vpmuludq $H4,$R0,$M4 25188c2ecf20Sopenharmony_ci vpmuludq $H3,$S2,$M0 25198c2ecf20Sopenharmony_ci vpmuludq $H3,$S3,$M1 25208c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h4*s4 25218c2ecf20Sopenharmony_ci vpmuludq $H3,$S4,$M2 25228c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h4*r0 25238c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h3*s2 25248c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h3*s3 25258c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h3*s4 25268c2ecf20Sopenharmony_ci 25278c2ecf20Sopenharmony_ci vpmuludq $H4,$S1,$M0 25288c2ecf20Sopenharmony_ci vpmuludq $H4,$S2,$M1 25298c2ecf20Sopenharmony_ci vpmuludq $H4,$S3,$M2 25308c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 25318c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 25328c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 25338c2ecf20Sopenharmony_ci 25348c2ecf20Sopenharmony_ci ################################################################
25358c2ecf20Sopenharmony_ci # lazy reduction (interleaved with input splat) 25368c2ecf20Sopenharmony_ci 25378c2ecf20Sopenharmony_ci vpsrlq \$52,$T0,$T2 # splat input 25388c2ecf20Sopenharmony_ci vpsllq \$12,$T4,$T3 25398c2ecf20Sopenharmony_ci 25408c2ecf20Sopenharmony_ci vpsrlq \$26,$D3,$H3 25418c2ecf20Sopenharmony_ci vpandq $MASK,$D3,$D3 25428c2ecf20Sopenharmony_ci vpaddq $H3,$D4,$H4 # h3 -> h4 25438c2ecf20Sopenharmony_ci 25448c2ecf20Sopenharmony_ci vporq $T3,$T2,$T2 25458c2ecf20Sopenharmony_ci 25468c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 25478c2ecf20Sopenharmony_ci vpandq $MASK,$H0,$H0 25488c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 25498c2ecf20Sopenharmony_ci 25508c2ecf20Sopenharmony_ci vpandq $MASK,$T2,$T2 # 2 25518c2ecf20Sopenharmony_ci 25528c2ecf20Sopenharmony_ci vpsrlq \$26,$H4,$D4 25538c2ecf20Sopenharmony_ci vpandq $MASK,$H4,$H4 25548c2ecf20Sopenharmony_ci 25558c2ecf20Sopenharmony_ci vpsrlq \$26,$H1,$D1 25568c2ecf20Sopenharmony_ci vpandq $MASK,$H1,$H1 25578c2ecf20Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 25588c2ecf20Sopenharmony_ci 25598c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 25608c2ecf20Sopenharmony_ci vpsllq \$2,$D4,$D4 25618c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 25628c2ecf20Sopenharmony_ci 25638c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # modulo-scheduled 25648c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 25658c2ecf20Sopenharmony_ci 25668c2ecf20Sopenharmony_ci vpsrlq \$26,$H2,$D2 25678c2ecf20Sopenharmony_ci vpandq $MASK,$H2,$H2 25688c2ecf20Sopenharmony_ci vpaddq $D2,$D3,$H3 # h2 -> h3 25698c2ecf20Sopenharmony_ci 25708c2ecf20Sopenharmony_ci vpsrlq \$14,$T4,$T3 25718c2ecf20Sopenharmony_ci 25728c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 25738c2ecf20Sopenharmony_ci vpandq $MASK,$H0,$H0 25748c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 25758c2ecf20Sopenharmony_ci 25768c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 25778c2ecf20Sopenharmony_ci 25788c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 25798c2ecf20Sopenharmony_ci vpandq 
$MASK,$H3,$H3 25808c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 25818c2ecf20Sopenharmony_ci 25828c2ecf20Sopenharmony_ci vpandq $MASK,$T0,$T0 # 0 25838c2ecf20Sopenharmony_ci #vpandq $MASK,$T1,$T1 # 1 25848c2ecf20Sopenharmony_ci #vpandq $MASK,$T3,$T3 # 3 25858c2ecf20Sopenharmony_ci #vporq $PADBIT,$T4,$T4 # padbit, yes, always 25868c2ecf20Sopenharmony_ci 25878c2ecf20Sopenharmony_ci sub \$128,$len 25888c2ecf20Sopenharmony_ci ja .Loop_avx512 25898c2ecf20Sopenharmony_ci 25908c2ecf20Sopenharmony_ci.Ltail_avx512: 25918c2ecf20Sopenharmony_ci ################################################################ 25928c2ecf20Sopenharmony_ci # while above multiplications were by r^8 in all lanes, in last 25938c2ecf20Sopenharmony_ci # iteration we multiply least significant lane by r^8 and most 25948c2ecf20Sopenharmony_ci # significant one by r, that's why table gets shifted... 25958c2ecf20Sopenharmony_ci 25968c2ecf20Sopenharmony_ci vpsrlq \$32,$R0,$R0 # 0105020603070408 25978c2ecf20Sopenharmony_ci vpsrlq \$32,$R1,$R1 25988c2ecf20Sopenharmony_ci vpsrlq \$32,$R2,$R2 25998c2ecf20Sopenharmony_ci vpsrlq \$32,$S3,$S3 26008c2ecf20Sopenharmony_ci vpsrlq \$32,$S4,$S4 26018c2ecf20Sopenharmony_ci vpsrlq \$32,$R3,$R3 26028c2ecf20Sopenharmony_ci vpsrlq \$32,$R4,$R4 26038c2ecf20Sopenharmony_ci vpsrlq \$32,$S1,$S1 26048c2ecf20Sopenharmony_ci vpsrlq \$32,$S2,$S2 26058c2ecf20Sopenharmony_ci 26068c2ecf20Sopenharmony_ci ################################################################ 26078c2ecf20Sopenharmony_ci # load either next or last 64 bytes of input 26088c2ecf20Sopenharmony_ci lea ($inp,$len),$inp 26098c2ecf20Sopenharmony_ci 26108c2ecf20Sopenharmony_ci #vpaddq $H2,$T2,$H2 # accumulate input 26118c2ecf20Sopenharmony_ci vpaddq $H0,$T0,$H0 26128c2ecf20Sopenharmony_ci 26138c2ecf20Sopenharmony_ci vpmuludq $H2,$R1,$D3 # d3 = h2*r1 26148c2ecf20Sopenharmony_ci vpmuludq $H2,$R2,$D4 # d4 = h2*r2 26158c2ecf20Sopenharmony_ci vpmuludq $H2,$S3,$D0 # d0 = h2*s3 26168c2ecf20Sopenharmony_ci vpandq
$MASK,$T1,$T1 # 1 26178c2ecf20Sopenharmony_ci vpmuludq $H2,$S4,$D1 # d1 = h2*s4 26188c2ecf20Sopenharmony_ci vpandq $MASK,$T3,$T3 # 3 26198c2ecf20Sopenharmony_ci vpmuludq $H2,$R0,$D2 # d2 = h2*r0 26208c2ecf20Sopenharmony_ci vporq $PADBIT,$T4,$T4 # padbit, yes, always 26218c2ecf20Sopenharmony_ci vpaddq $H1,$T1,$H1 # accumulate input 26228c2ecf20Sopenharmony_ci vpaddq $H3,$T3,$H3 26238c2ecf20Sopenharmony_ci vpaddq $H4,$T4,$H4 26248c2ecf20Sopenharmony_ci 26258c2ecf20Sopenharmony_ci vmovdqu 16*0($inp),%x#$T0 26268c2ecf20Sopenharmony_ci vpmuludq $H0,$R3,$M3 26278c2ecf20Sopenharmony_ci vpmuludq $H0,$R4,$M4 26288c2ecf20Sopenharmony_ci vpmuludq $H0,$R0,$M0 26298c2ecf20Sopenharmony_ci vpmuludq $H0,$R1,$M1 26308c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h0*r3 26318c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h0*r4 26328c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h0*r0 26338c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h0*r1 26348c2ecf20Sopenharmony_ci 26358c2ecf20Sopenharmony_ci vmovdqu 16*1($inp),%x#$T1 26368c2ecf20Sopenharmony_ci vpmuludq $H1,$R2,$M3 26378c2ecf20Sopenharmony_ci vpmuludq $H1,$R3,$M4 26388c2ecf20Sopenharmony_ci vpmuludq $H1,$S4,$M0 26398c2ecf20Sopenharmony_ci vpmuludq $H0,$R2,$M2 26408c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h1*r2 26418c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h1*r3 26428c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h1*s4 26438c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h0*r2 26448c2ecf20Sopenharmony_ci 26458c2ecf20Sopenharmony_ci vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 26468c2ecf20Sopenharmony_ci vpmuludq $H3,$R0,$M3 26478c2ecf20Sopenharmony_ci vpmuludq $H3,$R1,$M4 26488c2ecf20Sopenharmony_ci vpmuludq $H1,$R0,$M1 26498c2ecf20Sopenharmony_ci vpmuludq $H1,$R1,$M2 26508c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$D3 # d3 += h3*r0 26518c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h3*r1 26528c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h1*r0 26538c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 
+= h1*r1 26548c2ecf20Sopenharmony_ci 26558c2ecf20Sopenharmony_ci vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 26568c2ecf20Sopenharmony_ci vpmuludq $H4,$S4,$M3 26578c2ecf20Sopenharmony_ci vpmuludq $H4,$R0,$M4 26588c2ecf20Sopenharmony_ci vpmuludq $H3,$S2,$M0 26598c2ecf20Sopenharmony_ci vpmuludq $H3,$S3,$M1 26608c2ecf20Sopenharmony_ci vpmuludq $H3,$S4,$M2 26618c2ecf20Sopenharmony_ci vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 26628c2ecf20Sopenharmony_ci vpaddq $M4,$D4,$D4 # d4 += h4*r0 26638c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$D0 # d0 += h3*s2 26648c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$D1 # d1 += h3*s3 26658c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$D2 # d2 += h3*s4 26668c2ecf20Sopenharmony_ci 26678c2ecf20Sopenharmony_ci vpmuludq $H4,$S1,$M0 26688c2ecf20Sopenharmony_ci vpmuludq $H4,$S2,$M1 26698c2ecf20Sopenharmony_ci vpmuludq $H4,$S3,$M2 26708c2ecf20Sopenharmony_ci vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 26718c2ecf20Sopenharmony_ci vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 26728c2ecf20Sopenharmony_ci vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 26738c2ecf20Sopenharmony_ci 26748c2ecf20Sopenharmony_ci ################################################################ 26758c2ecf20Sopenharmony_ci # horizontal addition 26768c2ecf20Sopenharmony_ci 26778c2ecf20Sopenharmony_ci mov \$1,%eax 26788c2ecf20Sopenharmony_ci vpermq \$0xb1,$H3,$D3 26798c2ecf20Sopenharmony_ci vpermq \$0xb1,$D4,$H4 26808c2ecf20Sopenharmony_ci vpermq \$0xb1,$H0,$D0 26818c2ecf20Sopenharmony_ci vpermq \$0xb1,$H1,$D1 26828c2ecf20Sopenharmony_ci vpermq \$0xb1,$H2,$D2 26838c2ecf20Sopenharmony_ci vpaddq $D3,$H3,$H3 26848c2ecf20Sopenharmony_ci vpaddq $D4,$H4,$H4 26858c2ecf20Sopenharmony_ci vpaddq $D0,$H0,$H0 26868c2ecf20Sopenharmony_ci vpaddq $D1,$H1,$H1 26878c2ecf20Sopenharmony_ci vpaddq $D2,$H2,$H2 26888c2ecf20Sopenharmony_ci 26898c2ecf20Sopenharmony_ci kmovw %eax,%k3 26908c2ecf20Sopenharmony_ci vpermq \$0x2,$H3,$D3 26918c2ecf20Sopenharmony_ci vpermq \$0x2,$H4,$D4 26928c2ecf20Sopenharmony_ci vpermq \$0x2,$H0,$D0 
26938c2ecf20Sopenharmony_ci vpermq \$0x2,$H1,$D1 26948c2ecf20Sopenharmony_ci vpermq \$0x2,$H2,$D2 26958c2ecf20Sopenharmony_ci vpaddq $D3,$H3,$H3 26968c2ecf20Sopenharmony_ci vpaddq $D4,$H4,$H4 26978c2ecf20Sopenharmony_ci vpaddq $D0,$H0,$H0 26988c2ecf20Sopenharmony_ci vpaddq $D1,$H1,$H1 26998c2ecf20Sopenharmony_ci vpaddq $D2,$H2,$H2 27008c2ecf20Sopenharmony_ci 27018c2ecf20Sopenharmony_ci vextracti64x4 \$0x1,$H3,%y#$D3 27028c2ecf20Sopenharmony_ci vextracti64x4 \$0x1,$H4,%y#$D4 27038c2ecf20Sopenharmony_ci vextracti64x4 \$0x1,$H0,%y#$D0 27048c2ecf20Sopenharmony_ci vextracti64x4 \$0x1,$H1,%y#$D1 27058c2ecf20Sopenharmony_ci vextracti64x4 \$0x1,$H2,%y#$D2 27068c2ecf20Sopenharmony_ci vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 27078c2ecf20Sopenharmony_ci vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 27088c2ecf20Sopenharmony_ci vpaddq $D0,$H0,${H0}{%k3}{z} 27098c2ecf20Sopenharmony_ci vpaddq $D1,$H1,${H1}{%k3}{z} 27108c2ecf20Sopenharmony_ci vpaddq $D2,$H2,${H2}{%k3}{z} 27118c2ecf20Sopenharmony_ci___ 27128c2ecf20Sopenharmony_cimap(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 27138c2ecf20Sopenharmony_cimap(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 27148c2ecf20Sopenharmony_ci$code.=<<___; 27158c2ecf20Sopenharmony_ci ################################################################ 27168c2ecf20Sopenharmony_ci # lazy reduction (interleaved with input splat) 27178c2ecf20Sopenharmony_ci 27188c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 27198c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 27208c2ecf20Sopenharmony_ci vpsrldq \$6,$T0,$T2 # splat input 27218c2ecf20Sopenharmony_ci vpsrldq \$6,$T1,$T3 27228c2ecf20Sopenharmony_ci vpunpckhqdq $T1,$T0,$T4 # 4 27238c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 27248c2ecf20Sopenharmony_ci 27258c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 27268c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 27278c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T2 # 2:3 27288c2ecf20Sopenharmony_ci vpunpcklqdq $T1,$T0,$T0 # 0:1 
27298c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 27308c2ecf20Sopenharmony_ci 27318c2ecf20Sopenharmony_ci vpsrlq \$26,$H4,$D4 27328c2ecf20Sopenharmony_ci vpand $MASK,$H4,$H4 27338c2ecf20Sopenharmony_ci 27348c2ecf20Sopenharmony_ci vpsrlq \$26,$H1,$D1 27358c2ecf20Sopenharmony_ci vpand $MASK,$H1,$H1 27368c2ecf20Sopenharmony_ci vpsrlq \$30,$T2,$T3 27378c2ecf20Sopenharmony_ci vpsrlq \$4,$T2,$T2 27388c2ecf20Sopenharmony_ci vpaddq $D1,$H2,$H2 # h1 -> h2 27398c2ecf20Sopenharmony_ci 27408c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 27418c2ecf20Sopenharmony_ci vpsllq \$2,$D4,$D4 27428c2ecf20Sopenharmony_ci vpsrlq \$26,$T0,$T1 27438c2ecf20Sopenharmony_ci vpsrlq \$40,$T4,$T4 # 4 27448c2ecf20Sopenharmony_ci vpaddq $D4,$H0,$H0 # h4 -> h0 27458c2ecf20Sopenharmony_ci 27468c2ecf20Sopenharmony_ci vpsrlq \$26,$H2,$D2 27478c2ecf20Sopenharmony_ci vpand $MASK,$H2,$H2 27488c2ecf20Sopenharmony_ci vpand $MASK,$T2,$T2 # 2 27498c2ecf20Sopenharmony_ci vpand $MASK,$T0,$T0 # 0 27508c2ecf20Sopenharmony_ci vpaddq $D2,$H3,$H3 # h2 -> h3 27518c2ecf20Sopenharmony_ci 27528c2ecf20Sopenharmony_ci vpsrlq \$26,$H0,$D0 27538c2ecf20Sopenharmony_ci vpand $MASK,$H0,$H0 27548c2ecf20Sopenharmony_ci vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 27558c2ecf20Sopenharmony_ci vpand $MASK,$T1,$T1 # 1 27568c2ecf20Sopenharmony_ci vpaddq $D0,$H1,$H1 # h0 -> h1 27578c2ecf20Sopenharmony_ci 27588c2ecf20Sopenharmony_ci vpsrlq \$26,$H3,$D3 27598c2ecf20Sopenharmony_ci vpand $MASK,$H3,$H3 27608c2ecf20Sopenharmony_ci vpand $MASK,$T3,$T3 # 3 27618c2ecf20Sopenharmony_ci vpor 32(%rcx),$T4,$T4 # padbit, yes, always 27628c2ecf20Sopenharmony_ci vpaddq $D3,$H4,$H4 # h3 -> h4 27638c2ecf20Sopenharmony_ci 27648c2ecf20Sopenharmony_ci lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 27658c2ecf20Sopenharmony_ci add \$64,$len 27668c2ecf20Sopenharmony_ci jnz .Ltail_avx2$suffix 27678c2ecf20Sopenharmony_ci 27688c2ecf20Sopenharmony_ci vpsubq $T2,$H2,$H2 # undo input accumulation 27698c2ecf20Sopenharmony_ci vmovd 
%x#$H0,`4*0-48-64`($ctx)# save partially reduced 27708c2ecf20Sopenharmony_ci vmovd %x#$H1,`4*1-48-64`($ctx) 27718c2ecf20Sopenharmony_ci vmovd %x#$H2,`4*2-48-64`($ctx) 27728c2ecf20Sopenharmony_ci vmovd %x#$H3,`4*3-48-64`($ctx) 27738c2ecf20Sopenharmony_ci vmovd %x#$H4,`4*4-48-64`($ctx) 27748c2ecf20Sopenharmony_ci vzeroall 27758c2ecf20Sopenharmony_ci___ 27768c2ecf20Sopenharmony_ci$code.=<<___ if ($win64); 27778c2ecf20Sopenharmony_ci movdqa -0xb0(%r10),%xmm6 27788c2ecf20Sopenharmony_ci movdqa -0xa0(%r10),%xmm7 27798c2ecf20Sopenharmony_ci movdqa -0x90(%r10),%xmm8 27808c2ecf20Sopenharmony_ci movdqa -0x80(%r10),%xmm9 27818c2ecf20Sopenharmony_ci movdqa -0x70(%r10),%xmm10 27828c2ecf20Sopenharmony_ci movdqa -0x60(%r10),%xmm11 27838c2ecf20Sopenharmony_ci movdqa -0x50(%r10),%xmm12 27848c2ecf20Sopenharmony_ci movdqa -0x40(%r10),%xmm13 27858c2ecf20Sopenharmony_ci movdqa -0x30(%r10),%xmm14 27868c2ecf20Sopenharmony_ci movdqa -0x20(%r10),%xmm15 27878c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 27888c2ecf20Sopenharmony_ci.Ldo_avx512_epilogue: 27898c2ecf20Sopenharmony_ci___ 27908c2ecf20Sopenharmony_ci$code.=<<___ if (!$win64); 27918c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 27928c2ecf20Sopenharmony_ci.cfi_def_cfa_register %rsp 27938c2ecf20Sopenharmony_ci___ 27948c2ecf20Sopenharmony_ci$code.=<<___; 27958c2ecf20Sopenharmony_ci RET 27968c2ecf20Sopenharmony_ci.cfi_endproc 27978c2ecf20Sopenharmony_ci___ 27988c2ecf20Sopenharmony_ci 27998c2ecf20Sopenharmony_ci} 28008c2ecf20Sopenharmony_ci 28018c2ecf20Sopenharmony_ci} 28028c2ecf20Sopenharmony_ci 28038c2ecf20Sopenharmony_ci&declare_function("poly1305_blocks_avx2", 32, 4); 28048c2ecf20Sopenharmony_cipoly1305_blocks_avxN(0); 28058c2ecf20Sopenharmony_ci&end_function("poly1305_blocks_avx2"); 28068c2ecf20Sopenharmony_ci 28078c2ecf20Sopenharmony_ci####################################################################### 28088c2ecf20Sopenharmony_ciif ($avx>2) { 28098c2ecf20Sopenharmony_ci# On entry we have input length divisible by 64. 
But since inner loop 28108c2ecf20Sopenharmony_ci# processes 128 bytes per iteration, cases when length is not divisible 28118c2ecf20Sopenharmony_ci# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 28128c2ecf20Sopenharmony_ci# reason stack layout is kept identical to poly1305_blocks_avx2. If not 28138c2ecf20Sopenharmony_ci# for this tail, we wouldn't have to even allocate stack frame... 28148c2ecf20Sopenharmony_ci 28158c2ecf20Sopenharmony_ciif($kernel) { 28168c2ecf20Sopenharmony_ci $code .= "#ifdef CONFIG_AS_AVX512\n"; 28178c2ecf20Sopenharmony_ci} 28188c2ecf20Sopenharmony_ci 28198c2ecf20Sopenharmony_ci&declare_function("poly1305_blocks_avx512", 32, 4); 28208c2ecf20Sopenharmony_cipoly1305_blocks_avxN(1); 28218c2ecf20Sopenharmony_ci&end_function("poly1305_blocks_avx512"); 28228c2ecf20Sopenharmony_ci 28238c2ecf20Sopenharmony_ciif ($kernel) { 28248c2ecf20Sopenharmony_ci $code .= "#endif\n"; 28258c2ecf20Sopenharmony_ci} 28268c2ecf20Sopenharmony_ci 28278c2ecf20Sopenharmony_ciif (!$kernel && $avx>3) { 28288c2ecf20Sopenharmony_ci######################################################################## 28298c2ecf20Sopenharmony_ci# VPMADD52 version using 2^44 radix. 28308c2ecf20Sopenharmony_ci# 28318c2ecf20Sopenharmony_ci# One can argue that base 2^52 would be more natural. Well, even though 28328c2ecf20Sopenharmony_ci# some operations would be more natural, one has to recognize couple of 28338c2ecf20Sopenharmony_ci# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 28348c2ecf20Sopenharmony_ci# at amount of multiply-n-accumulate operations. Secondly, it makes it 28358c2ecf20Sopenharmony_ci# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 28368c2ecf20Sopenharmony_ci# reference implementations], which means that more such operations 28378c2ecf20Sopenharmony_ci# would have to be performed in inner loop, which in turn makes critical 28388c2ecf20Sopenharmony_ci# path longer. 
# In other words, even though base 2^44 reduction might
# look less elegant, overall critical path is actually shorter...

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^44
#	unsigned __int64 s[2];		# key value*20 base 2^44
#	unsigned __int64 r[3];		# key value base 2^44
#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
#					# r^n positions reflect
#					# placement in register, not
#					# memory, R[3] is R[1]*20

########################################################################
# poly1305_init_base2_44 takes three arguments ($ctx, $inp and a table
# pointer in %rdx) and emits code that:
#   - zeroes the three hash limbs at 0/8/16($ctx);
#   - masks the 16 key bytes at $inp with 0x0ffffffc0fffffff and
#     0x0ffffffc0ffffffc (presumably the standard Poly1305 key clamp)
#     and splits them into r0 (low 44 bits), r1 (next 44 bits) and r2
#     (remaining high bits), stored at 40/48/56($ctx);
#   - stores s1 = r1*20 and s2 = r2*20 at 24/32($ctx), computed as
#     "*5" followed by "<<2";
#   - writes -1 to 64($ctx) as the "key powers not calculated yet"
#     marker; the blocks subroutines recognise it by its sign bit;
#   - stores the addresses of poly1305_blocks_vpmadd52 and
#     poly1305_emit_base2_44 through %rdx, as two 4-byte slots for
#     elf32 flavours and two 8-byte slots otherwise;
#   - returns 1 in %eax.

$code.=<<___;
.type	poly1305_init_base2_44,\@function,3
.align	32
poly1305_init_base2_44:
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

.Linit_base2_44:
	lea	poly1305_blocks_vpmadd52(%rip),%r10
	lea	poly1305_emit_base2_44(%rip),%r11

	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	mov	\$0x00000fffffffffff,%r8
	and	8($inp),%rcx
	mov	\$0x00000fffffffffff,%r9
	and	%rax,%r8
	shrd	\$44,%rcx,%rax
	mov	%r8,40($ctx)		# r0
	and	%r9,%rax
	shr	\$24,%rcx
	mov	%rax,48($ctx)		# r1
	lea	(%rax,%rax,4),%rax	# *5
	mov	%rcx,56($ctx)		# r2
	shl	\$2,%rax		# magic <<2
	lea	(%rcx,%rcx,4),%rcx	# *5
	shl	\$2,%rcx		# magic <<2
	mov	%rax,24($ctx)		# s1
	mov	%rcx,32($ctx)		# s2
	movq	\$-1,64($ctx)		# write impossible value
___
$code.=<<___ if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
	RET
.size	poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
########################################################################
# poly1305_blocks_vpmadd52 processes input one 16-byte block per loop
# iteration.  $len is converted to a block count (>>4) and $padbit is
# moved to bit 40 so that it can be OR-ed into the input limbs.
#
# The key registers are named after their qword contents, highest lane
# first, as loaded under mask %k7 (three low qwords):
#	$r2r1r0 <- 40($ctx)	= r0,r1,r2
#	$r1r0s2 <- 32($ctx)	= s2,r0,r1
#	$r0s2s1 <- 24($ctx)	= s1,s2,r0
# Mask %k1 selects the lowest qword only.
#
# Control leaves through .Lblocks_vpmadd52_4x (a label emitted by the
# 4x subroutine that follows) whenever blocks remain for that code path.

my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
# exactly one register per name
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..24));

$code.=<<___;
.type	poly1305_blocks_vpmadd52,\@function,4
.align	32
poly1305_blocks_vpmadd52:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52		# too short

	shl	\$40,$padbit
	mov	64($ctx),%r8			# peek on power of the key

	# if powers of the key are not calculated yet, process up to 3
	# blocks with this single-block subroutine, otherwise ensure that
	# length is divisible by 2 blocks and pass the rest down to next
	# subroutine...

	mov	\$3,%rax
	mov	\$1,%r10
	cmp	\$4,$len			# is input long
	cmovae	%r10,%rax
	test	%r8,%r8				# is power value impossible?
	cmovns	%r10,%rax

	and	$len,%rax			# is input of favourable length?
	jz	.Lblocks_vpmadd52_4x

	sub	%rax,$len
	mov	\$7,%r10d
	mov	\$1,%r11d
	kmovw	%r10d,%k7
	lea	.L2_44_inp_permd(%rip),%r10
	kmovw	%r11d,%k1

	vmovq	$padbit,%x#$PAD
	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
	vpermq	\$0xcf,$PAD,$PAD
	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask

	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}

	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft

	jmp	.Loop_vpmadd52

.align	32
.Loop_vpmadd52:
	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
	lea	16($inp),$inp

	vpermd	$T0,$inp_permd,$T0		# ----3210 -> --322110
	vpsrlvq	$inp_shift,$T0,$T0
	vpandq	$reduc_mask,$T0,$T0
	vporq	$PAD,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo			# accumulate input

	vpermq	\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
	vpermq	\$0b01010101,$Dlo,${H1}{%k7}{z}
	vpermq	\$0b10101010,$Dlo,${H2}{%k7}{z}

	vpxord	$Dlo,$Dlo,$Dlo
	vpxord	$Dhi,$Dhi,$Dhi

	vpmadd52luq	$r2r1r0,$H0,$Dlo
	vpmadd52huq	$r2r1r0,$H0,$Dhi

	vpmadd52luq	$r1r0s2,$H1,$Dlo
	vpmadd52huq	$r1r0s2,$H1,$Dhi

	vpmadd52luq	$r0s2s1,$H2,$Dlo
	vpmadd52huq	$r0s2s1,$H2,$Dhi

	vpsrlvq	$reduc_rght,$Dlo,$T0		# 0 in topmost qword
	vpsllvq	$reduc_left,$Dhi,$Dhi		# 0 in topmost qword
	vpandq	$reduc_mask,$Dlo,$Dlo

	vpaddq	$T0,$Dhi,$Dhi

	vpermq	\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword

	vpaddq	$Dhi,$Dlo,$Dlo			# note topmost qword :-)

	vpsrlvq	$reduc_rght,$Dlo,$T0		# 0 in topmost qword
	vpandq	$reduc_mask,$Dlo,$Dlo

	vpermq	\$0b10010011,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo

	vpermq	\$0b10010011,$Dlo,${T0}{%k1}{z}

	vpaddq	$T0,$Dlo,$Dlo
	vpsllq	\$2,$T0,$T0

	vpaddq	$T0,$Dlo,$Dlo

	dec	%rax				# len-=16
	jnz	.Loop_vpmadd52

	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value

	test	$len,$len
	jnz	.Lblocks_vpmadd52_4x

.Lno_data_vpmadd52:
	RET
.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
{
########################################################################
# As implied by its name 4x subroutine processes 4 blocks in parallel
# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
# and is handled in 256-bit %ymm registers.
30228c2ecf20Sopenharmony_ci 30238c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 30248c2ecf20Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 30258c2ecf20Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 30268c2ecf20Sopenharmony_ci 30278c2ecf20Sopenharmony_ci$code.=<<___; 30288c2ecf20Sopenharmony_ci.type poly1305_blocks_vpmadd52_4x,\@function,4 30298c2ecf20Sopenharmony_ci.align 32 30308c2ecf20Sopenharmony_cipoly1305_blocks_vpmadd52_4x: 30318c2ecf20Sopenharmony_ci shr \$4,$len 30328c2ecf20Sopenharmony_ci jz .Lno_data_vpmadd52_4x # too short 30338c2ecf20Sopenharmony_ci 30348c2ecf20Sopenharmony_ci shl \$40,$padbit 30358c2ecf20Sopenharmony_ci mov 64($ctx),%r8 # peek on power of the key 30368c2ecf20Sopenharmony_ci 30378c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x: 30388c2ecf20Sopenharmony_ci vpbroadcastq $padbit,$PAD 30398c2ecf20Sopenharmony_ci 30408c2ecf20Sopenharmony_ci vmovdqa64 .Lx_mask44(%rip),$mask44 30418c2ecf20Sopenharmony_ci mov \$5,%eax 30428c2ecf20Sopenharmony_ci vmovdqa64 .Lx_mask42(%rip),$mask42 30438c2ecf20Sopenharmony_ci kmovw %eax,%k1 # used in 2x path 30448c2ecf20Sopenharmony_ci 30458c2ecf20Sopenharmony_ci test %r8,%r8 # is power value impossible? 30468c2ecf20Sopenharmony_ci js .Linit_vpmadd52 # if it is, then init R[4] 30478c2ecf20Sopenharmony_ci 30488c2ecf20Sopenharmony_ci vmovq 0($ctx),%x#$H0 # load current hash value 30498c2ecf20Sopenharmony_ci vmovq 8($ctx),%x#$H1 30508c2ecf20Sopenharmony_ci vmovq 16($ctx),%x#$H2 30518c2ecf20Sopenharmony_ci 30528c2ecf20Sopenharmony_ci test \$3,$len # is length 4*n+2? 
30538c2ecf20Sopenharmony_ci jnz .Lblocks_vpmadd52_2x_do 30548c2ecf20Sopenharmony_ci 30558c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x_do: 30568c2ecf20Sopenharmony_ci vpbroadcastq 64($ctx),$R0 # load 4th power of the key 30578c2ecf20Sopenharmony_ci vpbroadcastq 96($ctx),$R1 30588c2ecf20Sopenharmony_ci vpbroadcastq 128($ctx),$R2 30598c2ecf20Sopenharmony_ci vpbroadcastq 160($ctx),$S1 30608c2ecf20Sopenharmony_ci 30618c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x_key_loaded: 30628c2ecf20Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 = R2*5*4 30638c2ecf20Sopenharmony_ci vpaddq $R2,$S2,$S2 30648c2ecf20Sopenharmony_ci vpsllq \$2,$S2,$S2 30658c2ecf20Sopenharmony_ci 30668c2ecf20Sopenharmony_ci test \$7,$len # is len 8*n? 30678c2ecf20Sopenharmony_ci jz .Lblocks_vpmadd52_8x 30688c2ecf20Sopenharmony_ci 30698c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 30708c2ecf20Sopenharmony_ci vmovdqu64 16*2($inp),$T3 30718c2ecf20Sopenharmony_ci lea 16*4($inp),$inp 30728c2ecf20Sopenharmony_ci 30738c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 30748c2ecf20Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 30758c2ecf20Sopenharmony_ci 30768c2ecf20Sopenharmony_ci # at this point 64-bit lanes are ordered as 3-1-2-0 30778c2ecf20Sopenharmony_ci 30788c2ecf20Sopenharmony_ci vpsrlq \$24,$T3,$T2 # splat the data 30798c2ecf20Sopenharmony_ci vporq $PAD,$T2,$T2 30808c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 30818c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T0 30828c2ecf20Sopenharmony_ci vpsrlq \$44,$T1,$T1 30838c2ecf20Sopenharmony_ci vpsllq \$20,$T3,$T3 30848c2ecf20Sopenharmony_ci vporq $T3,$T1,$T1 30858c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T1 30868c2ecf20Sopenharmony_ci 30878c2ecf20Sopenharmony_ci sub \$4,$len 30888c2ecf20Sopenharmony_ci jz .Ltail_vpmadd52_4x 30898c2ecf20Sopenharmony_ci jmp .Loop_vpmadd52_4x 30908c2ecf20Sopenharmony_ci ud2 30918c2ecf20Sopenharmony_ci 30928c2ecf20Sopenharmony_ci.align 32 30938c2ecf20Sopenharmony_ci.Linit_vpmadd52: 30948c2ecf20Sopenharmony_ci 
vmovq 24($ctx),%x#$S1 # load key 30958c2ecf20Sopenharmony_ci vmovq 56($ctx),%x#$H2 30968c2ecf20Sopenharmony_ci vmovq 32($ctx),%x#$S2 30978c2ecf20Sopenharmony_ci vmovq 40($ctx),%x#$R0 30988c2ecf20Sopenharmony_ci vmovq 48($ctx),%x#$R1 30998c2ecf20Sopenharmony_ci 31008c2ecf20Sopenharmony_ci vmovdqa $R0,$H0 31018c2ecf20Sopenharmony_ci vmovdqa $R1,$H1 31028c2ecf20Sopenharmony_ci vmovdqa $H2,$R2 31038c2ecf20Sopenharmony_ci 31048c2ecf20Sopenharmony_ci mov \$2,%eax 31058c2ecf20Sopenharmony_ci 31068c2ecf20Sopenharmony_ci.Lmul_init_vpmadd52: 31078c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 31088c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 31098c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 31108c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 31118c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 31128c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 31138c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 31148c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 31158c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 31168c2ecf20Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 31178c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 31188c2ecf20Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 31198c2ecf20Sopenharmony_ci 31208c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 31218c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 31228c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 31238c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 31248c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 31258c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 31268c2ecf20Sopenharmony_ci 31278c2ecf20Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 31288c2ecf20Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 31298c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 31308c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 31318c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 31328c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 31338c2ecf20Sopenharmony_ci 31348c2ecf20Sopenharmony_ci 
################################################################ 31358c2ecf20Sopenharmony_ci # partial reduction 31368c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 31378c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 31388c2ecf20Sopenharmony_ci vpandq $mask44,$D0lo,$H0 31398c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 31408c2ecf20Sopenharmony_ci 31418c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 31428c2ecf20Sopenharmony_ci 31438c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 31448c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 31458c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$H1 31468c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 31478c2ecf20Sopenharmony_ci 31488c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 31498c2ecf20Sopenharmony_ci 31508c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 31518c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 31528c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$H2 31538c2ecf20Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 31548c2ecf20Sopenharmony_ci 31558c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 31568c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 31578c2ecf20Sopenharmony_ci 31588c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 31598c2ecf20Sopenharmony_ci 31608c2ecf20Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 31618c2ecf20Sopenharmony_ci vpandq $mask44,$H0,$H0 31628c2ecf20Sopenharmony_ci 31638c2ecf20Sopenharmony_ci vpaddq $tmp,$H1,$H1 31648c2ecf20Sopenharmony_ci 31658c2ecf20Sopenharmony_ci dec %eax 31668c2ecf20Sopenharmony_ci jz .Ldone_init_vpmadd52 31678c2ecf20Sopenharmony_ci 31688c2ecf20Sopenharmony_ci vpunpcklqdq $R1,$H1,$R1 # 1,2 31698c2ecf20Sopenharmony_ci vpbroadcastq %x#$H1,%x#$H1 # 2,2 31708c2ecf20Sopenharmony_ci vpunpcklqdq $R2,$H2,$R2 31718c2ecf20Sopenharmony_ci vpbroadcastq %x#$H2,%x#$H2 31728c2ecf20Sopenharmony_ci vpunpcklqdq $R0,$H0,$R0 31738c2ecf20Sopenharmony_ci vpbroadcastq %x#$H0,%x#$H0 31748c2ecf20Sopenharmony_ci 31758c2ecf20Sopenharmony_ci vpsllq \$2,$R1,$S1 # S1 = R1*5*4 31768c2ecf20Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 
= R2*5*4 31778c2ecf20Sopenharmony_ci vpaddq $R1,$S1,$S1 31788c2ecf20Sopenharmony_ci vpaddq $R2,$S2,$S2 31798c2ecf20Sopenharmony_ci vpsllq \$2,$S1,$S1 31808c2ecf20Sopenharmony_ci vpsllq \$2,$S2,$S2 31818c2ecf20Sopenharmony_ci 31828c2ecf20Sopenharmony_ci jmp .Lmul_init_vpmadd52 31838c2ecf20Sopenharmony_ci ud2 31848c2ecf20Sopenharmony_ci 31858c2ecf20Sopenharmony_ci.align 32 31868c2ecf20Sopenharmony_ci.Ldone_init_vpmadd52: 31878c2ecf20Sopenharmony_ci vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 31888c2ecf20Sopenharmony_ci vinserti128 \$1,%x#$R2,$H2,$R2 31898c2ecf20Sopenharmony_ci vinserti128 \$1,%x#$R0,$H0,$R0 31908c2ecf20Sopenharmony_ci 31918c2ecf20Sopenharmony_ci vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 31928c2ecf20Sopenharmony_ci vpermq \$0b11011000,$R2,$R2 31938c2ecf20Sopenharmony_ci vpermq \$0b11011000,$R0,$R0 31948c2ecf20Sopenharmony_ci 31958c2ecf20Sopenharmony_ci vpsllq \$2,$R1,$S1 # S1 = R1*5*4 31968c2ecf20Sopenharmony_ci vpaddq $R1,$S1,$S1 31978c2ecf20Sopenharmony_ci vpsllq \$2,$S1,$S1 31988c2ecf20Sopenharmony_ci 31998c2ecf20Sopenharmony_ci vmovq 0($ctx),%x#$H0 # load current hash value 32008c2ecf20Sopenharmony_ci vmovq 8($ctx),%x#$H1 32018c2ecf20Sopenharmony_ci vmovq 16($ctx),%x#$H2 32028c2ecf20Sopenharmony_ci 32038c2ecf20Sopenharmony_ci test \$3,$len # is length 4*n+2? 
32048c2ecf20Sopenharmony_ci jnz .Ldone_init_vpmadd52_2x 32058c2ecf20Sopenharmony_ci 32068c2ecf20Sopenharmony_ci vmovdqu64 $R0,64($ctx) # save key powers 32078c2ecf20Sopenharmony_ci vpbroadcastq %x#$R0,$R0 # broadcast 4th power 32088c2ecf20Sopenharmony_ci vmovdqu64 $R1,96($ctx) 32098c2ecf20Sopenharmony_ci vpbroadcastq %x#$R1,$R1 32108c2ecf20Sopenharmony_ci vmovdqu64 $R2,128($ctx) 32118c2ecf20Sopenharmony_ci vpbroadcastq %x#$R2,$R2 32128c2ecf20Sopenharmony_ci vmovdqu64 $S1,160($ctx) 32138c2ecf20Sopenharmony_ci vpbroadcastq %x#$S1,$S1 32148c2ecf20Sopenharmony_ci 32158c2ecf20Sopenharmony_ci jmp .Lblocks_vpmadd52_4x_key_loaded 32168c2ecf20Sopenharmony_ci ud2 32178c2ecf20Sopenharmony_ci 32188c2ecf20Sopenharmony_ci.align 32 32198c2ecf20Sopenharmony_ci.Ldone_init_vpmadd52_2x: 32208c2ecf20Sopenharmony_ci vmovdqu64 $R0,64($ctx) # save key powers 32218c2ecf20Sopenharmony_ci vpsrldq \$8,$R0,$R0 # 0-1-0-2 32228c2ecf20Sopenharmony_ci vmovdqu64 $R1,96($ctx) 32238c2ecf20Sopenharmony_ci vpsrldq \$8,$R1,$R1 32248c2ecf20Sopenharmony_ci vmovdqu64 $R2,128($ctx) 32258c2ecf20Sopenharmony_ci vpsrldq \$8,$R2,$R2 32268c2ecf20Sopenharmony_ci vmovdqu64 $S1,160($ctx) 32278c2ecf20Sopenharmony_ci vpsrldq \$8,$S1,$S1 32288c2ecf20Sopenharmony_ci jmp .Lblocks_vpmadd52_2x_key_loaded 32298c2ecf20Sopenharmony_ci ud2 32308c2ecf20Sopenharmony_ci 32318c2ecf20Sopenharmony_ci.align 32 32328c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_2x_do: 32338c2ecf20Sopenharmony_ci vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 32348c2ecf20Sopenharmony_ci vmovdqu64 160+8($ctx),${S1}{%k1}{z} 32358c2ecf20Sopenharmony_ci vmovdqu64 64+8($ctx),${R0}{%k1}{z} 32368c2ecf20Sopenharmony_ci vmovdqu64 96+8($ctx),${R1}{%k1}{z} 32378c2ecf20Sopenharmony_ci 32388c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_2x_key_loaded: 32398c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 32408c2ecf20Sopenharmony_ci vpxorq $T3,$T3,$T3 32418c2ecf20Sopenharmony_ci lea 16*2($inp),$inp 32428c2ecf20Sopenharmony_ci 
32438c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 32448c2ecf20Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 32458c2ecf20Sopenharmony_ci 32468c2ecf20Sopenharmony_ci # at this point 64-bit lanes are ordered as x-1-x-0 32478c2ecf20Sopenharmony_ci 32488c2ecf20Sopenharmony_ci vpsrlq \$24,$T3,$T2 # splat the data 32498c2ecf20Sopenharmony_ci vporq $PAD,$T2,$T2 32508c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 32518c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T0 32528c2ecf20Sopenharmony_ci vpsrlq \$44,$T1,$T1 32538c2ecf20Sopenharmony_ci vpsllq \$20,$T3,$T3 32548c2ecf20Sopenharmony_ci vporq $T3,$T1,$T1 32558c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T1 32568c2ecf20Sopenharmony_ci 32578c2ecf20Sopenharmony_ci jmp .Ltail_vpmadd52_2x 32588c2ecf20Sopenharmony_ci ud2 32598c2ecf20Sopenharmony_ci 32608c2ecf20Sopenharmony_ci.align 32 32618c2ecf20Sopenharmony_ci.Loop_vpmadd52_4x: 32628c2ecf20Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 32638c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 32648c2ecf20Sopenharmony_ci vpaddq $T1,$H1,$H1 32658c2ecf20Sopenharmony_ci 32668c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 32678c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 32688c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 32698c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 32708c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 32718c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 32728c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 32738c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 32748c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 32758c2ecf20Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 32768c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 32778c2ecf20Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 32788c2ecf20Sopenharmony_ci 32798c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 32808c2ecf20Sopenharmony_ci vmovdqu64 16*2($inp),$T3 32818c2ecf20Sopenharmony_ci lea 16*4($inp),$inp 32828c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 
32838c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 32848c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 32858c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 32868c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 32878c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 32888c2ecf20Sopenharmony_ci 32898c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 32908c2ecf20Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 32918c2ecf20Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 32928c2ecf20Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 32938c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 32948c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 32958c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 32968c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 32978c2ecf20Sopenharmony_ci 32988c2ecf20Sopenharmony_ci ################################################################ 32998c2ecf20Sopenharmony_ci # partial reduction (interleaved with data splat) 33008c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 33018c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 33028c2ecf20Sopenharmony_ci vpandq $mask44,$D0lo,$H0 33038c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 33048c2ecf20Sopenharmony_ci 33058c2ecf20Sopenharmony_ci vpsrlq \$24,$T3,$T2 33068c2ecf20Sopenharmony_ci vporq $PAD,$T2,$T2 33078c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 33088c2ecf20Sopenharmony_ci 33098c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 33108c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 33118c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$H1 33128c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 33138c2ecf20Sopenharmony_ci 33148c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T0 33158c2ecf20Sopenharmony_ci vpsrlq \$44,$T1,$T1 33168c2ecf20Sopenharmony_ci vpsllq \$20,$T3,$T3 33178c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 33188c2ecf20Sopenharmony_ci 33198c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 33208c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 33218c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$H2 33228c2ecf20Sopenharmony_ci 
vpaddq $tmp,$D2hi,$D2hi 33238c2ecf20Sopenharmony_ci 33248c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 33258c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 33268c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 33278c2ecf20Sopenharmony_ci 33288c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 33298c2ecf20Sopenharmony_ci vporq $T3,$T1,$T1 33308c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T1 33318c2ecf20Sopenharmony_ci 33328c2ecf20Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 33338c2ecf20Sopenharmony_ci vpandq $mask44,$H0,$H0 33348c2ecf20Sopenharmony_ci 33358c2ecf20Sopenharmony_ci vpaddq $tmp,$H1,$H1 33368c2ecf20Sopenharmony_ci 33378c2ecf20Sopenharmony_ci sub \$4,$len # len-=64 33388c2ecf20Sopenharmony_ci jnz .Loop_vpmadd52_4x 33398c2ecf20Sopenharmony_ci 33408c2ecf20Sopenharmony_ci.Ltail_vpmadd52_4x: 33418c2ecf20Sopenharmony_ci vmovdqu64 128($ctx),$R2 # load all key powers 33428c2ecf20Sopenharmony_ci vmovdqu64 160($ctx),$S1 33438c2ecf20Sopenharmony_ci vmovdqu64 64($ctx),$R0 33448c2ecf20Sopenharmony_ci vmovdqu64 96($ctx),$R1 33458c2ecf20Sopenharmony_ci 33468c2ecf20Sopenharmony_ci.Ltail_vpmadd52_2x: 33478c2ecf20Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 = R2*5*4 33488c2ecf20Sopenharmony_ci vpaddq $R2,$S2,$S2 33498c2ecf20Sopenharmony_ci vpsllq \$2,$S2,$S2 33508c2ecf20Sopenharmony_ci 33518c2ecf20Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 33528c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 33538c2ecf20Sopenharmony_ci vpaddq $T1,$H1,$H1 33548c2ecf20Sopenharmony_ci 33558c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 33568c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 33578c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 33588c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 33598c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 33608c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S2,$D1lo 33618c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 33628c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 33638c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 
33648c2ecf20Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 33658c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 33668c2ecf20Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 33678c2ecf20Sopenharmony_ci 33688c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 33698c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 33708c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 33718c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 33728c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 33738c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 33748c2ecf20Sopenharmony_ci 33758c2ecf20Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 33768c2ecf20Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 33778c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 33788c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 33798c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 33808c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 33818c2ecf20Sopenharmony_ci 33828c2ecf20Sopenharmony_ci ################################################################ 33838c2ecf20Sopenharmony_ci # horizontal addition 33848c2ecf20Sopenharmony_ci 33858c2ecf20Sopenharmony_ci mov \$1,%eax 33868c2ecf20Sopenharmony_ci kmovw %eax,%k1 33878c2ecf20Sopenharmony_ci vpsrldq \$8,$D0lo,$T0 33888c2ecf20Sopenharmony_ci vpsrldq \$8,$D0hi,$H0 33898c2ecf20Sopenharmony_ci vpsrldq \$8,$D1lo,$T1 33908c2ecf20Sopenharmony_ci vpsrldq \$8,$D1hi,$H1 33918c2ecf20Sopenharmony_ci vpaddq $T0,$D0lo,$D0lo 33928c2ecf20Sopenharmony_ci vpaddq $H0,$D0hi,$D0hi 33938c2ecf20Sopenharmony_ci vpsrldq \$8,$D2lo,$T2 33948c2ecf20Sopenharmony_ci vpsrldq \$8,$D2hi,$H2 33958c2ecf20Sopenharmony_ci vpaddq $T1,$D1lo,$D1lo 33968c2ecf20Sopenharmony_ci vpaddq $H1,$D1hi,$D1hi 33978c2ecf20Sopenharmony_ci vpermq \$0x2,$D0lo,$T0 33988c2ecf20Sopenharmony_ci vpermq \$0x2,$D0hi,$H0 33998c2ecf20Sopenharmony_ci vpaddq $T2,$D2lo,$D2lo 34008c2ecf20Sopenharmony_ci vpaddq $H2,$D2hi,$D2hi 34018c2ecf20Sopenharmony_ci 34028c2ecf20Sopenharmony_ci vpermq \$0x2,$D1lo,$T1 34038c2ecf20Sopenharmony_ci vpermq \$0x2,$D1hi,$H1 
34048c2ecf20Sopenharmony_ci vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 34058c2ecf20Sopenharmony_ci vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 34068c2ecf20Sopenharmony_ci vpermq \$0x2,$D2lo,$T2 34078c2ecf20Sopenharmony_ci vpermq \$0x2,$D2hi,$H2 34088c2ecf20Sopenharmony_ci vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 34098c2ecf20Sopenharmony_ci vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 34108c2ecf20Sopenharmony_ci vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 34118c2ecf20Sopenharmony_ci vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 34128c2ecf20Sopenharmony_ci 34138c2ecf20Sopenharmony_ci ################################################################ 34148c2ecf20Sopenharmony_ci # partial reduction 34158c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 34168c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 34178c2ecf20Sopenharmony_ci vpandq $mask44,$D0lo,$H0 34188c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 34198c2ecf20Sopenharmony_ci 34208c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 34218c2ecf20Sopenharmony_ci 34228c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 34238c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 34248c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$H1 34258c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 34268c2ecf20Sopenharmony_ci 34278c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 34288c2ecf20Sopenharmony_ci 34298c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 34308c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 34318c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$H2 34328c2ecf20Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 34338c2ecf20Sopenharmony_ci 34348c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 34358c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 34368c2ecf20Sopenharmony_ci 34378c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 34388c2ecf20Sopenharmony_ci 34398c2ecf20Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 34408c2ecf20Sopenharmony_ci vpandq $mask44,$H0,$H0 34418c2ecf20Sopenharmony_ci 34428c2ecf20Sopenharmony_ci vpaddq $tmp,$H1,$H1 34438c2ecf20Sopenharmony_ci # at this point $len is 34448c2ecf20Sopenharmony_ci # either 4*n+2 
or 0... 34458c2ecf20Sopenharmony_ci sub \$2,$len # len-=32 34468c2ecf20Sopenharmony_ci ja .Lblocks_vpmadd52_4x_do 34478c2ecf20Sopenharmony_ci 34488c2ecf20Sopenharmony_ci vmovq %x#$H0,0($ctx) 34498c2ecf20Sopenharmony_ci vmovq %x#$H1,8($ctx) 34508c2ecf20Sopenharmony_ci vmovq %x#$H2,16($ctx) 34518c2ecf20Sopenharmony_ci vzeroall 34528c2ecf20Sopenharmony_ci 34538c2ecf20Sopenharmony_ci.Lno_data_vpmadd52_4x: 34548c2ecf20Sopenharmony_ci RET 34558c2ecf20Sopenharmony_ci.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 34568c2ecf20Sopenharmony_ci___ 34578c2ecf20Sopenharmony_ci} 34588c2ecf20Sopenharmony_ci{ 34598c2ecf20Sopenharmony_ci######################################################################## 34608c2ecf20Sopenharmony_ci# As implied by its name 8x subroutine processes 8 blocks in parallel... 34618c2ecf20Sopenharmony_ci# This is intermediate version, as it's used only in cases when input 34628c2ecf20Sopenharmony_ci# length is either 8*n, 8*n+1 or 8*n+2... 34638c2ecf20Sopenharmony_ci 34648c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 34658c2ecf20Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 34668c2ecf20Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 34678c2ecf20Sopenharmony_cimy ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 34688c2ecf20Sopenharmony_ci 34698c2ecf20Sopenharmony_ci$code.=<<___; 34708c2ecf20Sopenharmony_ci.type poly1305_blocks_vpmadd52_8x,\@function,4 34718c2ecf20Sopenharmony_ci.align 32 34728c2ecf20Sopenharmony_cipoly1305_blocks_vpmadd52_8x: 34738c2ecf20Sopenharmony_ci shr \$4,$len 34748c2ecf20Sopenharmony_ci jz .Lno_data_vpmadd52_8x # too short 34758c2ecf20Sopenharmony_ci 34768c2ecf20Sopenharmony_ci shl \$40,$padbit 34778c2ecf20Sopenharmony_ci mov 64($ctx),%r8 # peek on power of the key 34788c2ecf20Sopenharmony_ci 34798c2ecf20Sopenharmony_ci vmovdqa64 .Lx_mask44(%rip),$mask44 34808c2ecf20Sopenharmony_ci vmovdqa64 
.Lx_mask42(%rip),$mask42 34818c2ecf20Sopenharmony_ci 34828c2ecf20Sopenharmony_ci test %r8,%r8 # is power value impossible? 34838c2ecf20Sopenharmony_ci js .Linit_vpmadd52 # if it is, then init R[4] 34848c2ecf20Sopenharmony_ci 34858c2ecf20Sopenharmony_ci vmovq 0($ctx),%x#$H0 # load current hash value 34868c2ecf20Sopenharmony_ci vmovq 8($ctx),%x#$H1 34878c2ecf20Sopenharmony_ci vmovq 16($ctx),%x#$H2 34888c2ecf20Sopenharmony_ci 34898c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_8x: 34908c2ecf20Sopenharmony_ci ################################################################ 34918c2ecf20Sopenharmony_ci # first we calculate more key powers 34928c2ecf20Sopenharmony_ci 34938c2ecf20Sopenharmony_ci vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 34948c2ecf20Sopenharmony_ci vmovdqu64 160($ctx),$S1 34958c2ecf20Sopenharmony_ci vmovdqu64 64($ctx),$R0 34968c2ecf20Sopenharmony_ci vmovdqu64 96($ctx),$R1 34978c2ecf20Sopenharmony_ci 34988c2ecf20Sopenharmony_ci vpsllq \$2,$R2,$S2 # S2 = R2*5*4 34998c2ecf20Sopenharmony_ci vpaddq $R2,$S2,$S2 35008c2ecf20Sopenharmony_ci vpsllq \$2,$S2,$S2 35018c2ecf20Sopenharmony_ci 35028c2ecf20Sopenharmony_ci vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 35038c2ecf20Sopenharmony_ci vpbroadcastq %x#$R0,$RR0 35048c2ecf20Sopenharmony_ci vpbroadcastq %x#$R1,$RR1 35058c2ecf20Sopenharmony_ci 35068c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 35078c2ecf20Sopenharmony_ci vpmadd52luq $RR2,$S1,$D0lo 35088c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 35098c2ecf20Sopenharmony_ci vpmadd52huq $RR2,$S1,$D0hi 35108c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 35118c2ecf20Sopenharmony_ci vpmadd52luq $RR2,$S2,$D1lo 35128c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 35138c2ecf20Sopenharmony_ci vpmadd52huq $RR2,$S2,$D1hi 35148c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 35158c2ecf20Sopenharmony_ci vpmadd52luq $RR2,$R0,$D2lo 35168c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 35178c2ecf20Sopenharmony_ci vpmadd52huq $RR2,$R0,$D2hi 35188c2ecf20Sopenharmony_ci
35198c2ecf20Sopenharmony_ci vpmadd52luq $RR0,$R0,$D0lo 35208c2ecf20Sopenharmony_ci vpmadd52huq $RR0,$R0,$D0hi 35218c2ecf20Sopenharmony_ci vpmadd52luq $RR0,$R1,$D1lo 35228c2ecf20Sopenharmony_ci vpmadd52huq $RR0,$R1,$D1hi 35238c2ecf20Sopenharmony_ci vpmadd52luq $RR0,$R2,$D2lo 35248c2ecf20Sopenharmony_ci vpmadd52huq $RR0,$R2,$D2hi 35258c2ecf20Sopenharmony_ci 35268c2ecf20Sopenharmony_ci vpmadd52luq $RR1,$S2,$D0lo 35278c2ecf20Sopenharmony_ci vpmadd52huq $RR1,$S2,$D0hi 35288c2ecf20Sopenharmony_ci vpmadd52luq $RR1,$R0,$D1lo 35298c2ecf20Sopenharmony_ci vpmadd52huq $RR1,$R0,$D1hi 35308c2ecf20Sopenharmony_ci vpmadd52luq $RR1,$R1,$D2lo 35318c2ecf20Sopenharmony_ci vpmadd52huq $RR1,$R1,$D2hi 35328c2ecf20Sopenharmony_ci 35338c2ecf20Sopenharmony_ci ################################################################ 35348c2ecf20Sopenharmony_ci # partial reduction 35358c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 35368c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 35378c2ecf20Sopenharmony_ci vpandq $mask44,$D0lo,$RR0 35388c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 35398c2ecf20Sopenharmony_ci 35408c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 35418c2ecf20Sopenharmony_ci 35428c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 35438c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 35448c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$RR1 35458c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 35468c2ecf20Sopenharmony_ci 35478c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 35488c2ecf20Sopenharmony_ci 35498c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 35508c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 35518c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$RR2 35528c2ecf20Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 35538c2ecf20Sopenharmony_ci 35548c2ecf20Sopenharmony_ci vpaddq $D2hi,$RR0,$RR0 35558c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 35568c2ecf20Sopenharmony_ci 35578c2ecf20Sopenharmony_ci vpaddq $D2hi,$RR0,$RR0 35588c2ecf20Sopenharmony_ci 35598c2ecf20Sopenharmony_ci vpsrlq \$44,$RR0,$tmp # additional 
step 35608c2ecf20Sopenharmony_ci vpandq $mask44,$RR0,$RR0 35618c2ecf20Sopenharmony_ci 35628c2ecf20Sopenharmony_ci vpaddq $tmp,$RR1,$RR1 35638c2ecf20Sopenharmony_ci 35648c2ecf20Sopenharmony_ci ################################################################ 35658c2ecf20Sopenharmony_ci # At this point Rx holds 1324 powers, RRx - 5768, and the goal 35668c2ecf20Sopenharmony_ci # is 15263748, which reflects how data is loaded... 35678c2ecf20Sopenharmony_ci 35688c2ecf20Sopenharmony_ci vpunpcklqdq $R2,$RR2,$T2 # 3748 35698c2ecf20Sopenharmony_ci vpunpckhqdq $R2,$RR2,$R2 # 1526 35708c2ecf20Sopenharmony_ci vpunpcklqdq $R0,$RR0,$T0 35718c2ecf20Sopenharmony_ci vpunpckhqdq $R0,$RR0,$R0 35728c2ecf20Sopenharmony_ci vpunpcklqdq $R1,$RR1,$T1 35738c2ecf20Sopenharmony_ci vpunpckhqdq $R1,$RR1,$R1 35748c2ecf20Sopenharmony_ci___ 35758c2ecf20Sopenharmony_ci######## switch to %zmm 35768c2ecf20Sopenharmony_cimap(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 35778c2ecf20Sopenharmony_cimap(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 35788c2ecf20Sopenharmony_cimap(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 35798c2ecf20Sopenharmony_cimap(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 35808c2ecf20Sopenharmony_ci 35818c2ecf20Sopenharmony_ci$code.=<<___; 35828c2ecf20Sopenharmony_ci vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 35838c2ecf20Sopenharmony_ci vshufi64x2 \$0x44,$R0,$T0,$RR0 35848c2ecf20Sopenharmony_ci vshufi64x2 \$0x44,$R1,$T1,$RR1 35858c2ecf20Sopenharmony_ci 35868c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 35878c2ecf20Sopenharmony_ci vmovdqu64 16*4($inp),$T3 35888c2ecf20Sopenharmony_ci lea 16*8($inp),$inp 35898c2ecf20Sopenharmony_ci 35908c2ecf20Sopenharmony_ci vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 35918c2ecf20Sopenharmony_ci vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 35928c2ecf20Sopenharmony_ci vpaddq $RR2,$SS2,$SS2 35938c2ecf20Sopenharmony_ci vpaddq $RR1,$SS1,$SS1 35948c2ecf20Sopenharmony_ci vpsllq \$2,$SS2,$SS2 35958c2ecf20Sopenharmony_ci vpsllq \$2,$SS1,$SS1 
35968c2ecf20Sopenharmony_ci 35978c2ecf20Sopenharmony_ci vpbroadcastq $padbit,$PAD 35988c2ecf20Sopenharmony_ci vpbroadcastq %x#$mask44,$mask44 35998c2ecf20Sopenharmony_ci vpbroadcastq %x#$mask42,$mask42 36008c2ecf20Sopenharmony_ci 36018c2ecf20Sopenharmony_ci vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 36028c2ecf20Sopenharmony_ci vpbroadcastq %x#$SS2,$S2 36038c2ecf20Sopenharmony_ci vpbroadcastq %x#$RR0,$R0 36048c2ecf20Sopenharmony_ci vpbroadcastq %x#$RR1,$R1 36058c2ecf20Sopenharmony_ci vpbroadcastq %x#$RR2,$R2 36068c2ecf20Sopenharmony_ci 36078c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 36088c2ecf20Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 36098c2ecf20Sopenharmony_ci 36108c2ecf20Sopenharmony_ci # at this point 64-bit lanes are ordered as 73625140 36118c2ecf20Sopenharmony_ci 36128c2ecf20Sopenharmony_ci vpsrlq \$24,$T3,$T2 # splat the data 36138c2ecf20Sopenharmony_ci vporq $PAD,$T2,$T2 36148c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 36158c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T0 36168c2ecf20Sopenharmony_ci vpsrlq \$44,$T1,$T1 36178c2ecf20Sopenharmony_ci vpsllq \$20,$T3,$T3 36188c2ecf20Sopenharmony_ci vporq $T3,$T1,$T1 36198c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T1 36208c2ecf20Sopenharmony_ci 36218c2ecf20Sopenharmony_ci sub \$8,$len 36228c2ecf20Sopenharmony_ci jz .Ltail_vpmadd52_8x 36238c2ecf20Sopenharmony_ci jmp .Loop_vpmadd52_8x 36248c2ecf20Sopenharmony_ci 36258c2ecf20Sopenharmony_ci.align 32 36268c2ecf20Sopenharmony_ci.Loop_vpmadd52_8x: 36278c2ecf20Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 36288c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 36298c2ecf20Sopenharmony_ci vpaddq $T1,$H1,$H1 36308c2ecf20Sopenharmony_ci 36318c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 36328c2ecf20Sopenharmony_ci vpmadd52luq $H2,$S1,$D0lo 36338c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 36348c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S1,$D0hi 36358c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 36368c2ecf20Sopenharmony_ci 
vpmadd52luq $H2,$S2,$D1lo 36378c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 36388c2ecf20Sopenharmony_ci vpmadd52huq $H2,$S2,$D1hi 36398c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 36408c2ecf20Sopenharmony_ci vpmadd52luq $H2,$R0,$D2lo 36418c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 36428c2ecf20Sopenharmony_ci vpmadd52huq $H2,$R0,$D2hi 36438c2ecf20Sopenharmony_ci 36448c2ecf20Sopenharmony_ci vmovdqu64 16*0($inp),$T2 # load data 36458c2ecf20Sopenharmony_ci vmovdqu64 16*4($inp),$T3 36468c2ecf20Sopenharmony_ci lea 16*8($inp),$inp 36478c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R0,$D0lo 36488c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R0,$D0hi 36498c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R1,$D1lo 36508c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R1,$D1hi 36518c2ecf20Sopenharmony_ci vpmadd52luq $H0,$R2,$D2lo 36528c2ecf20Sopenharmony_ci vpmadd52huq $H0,$R2,$D2hi 36538c2ecf20Sopenharmony_ci 36548c2ecf20Sopenharmony_ci vpunpcklqdq $T3,$T2,$T1 # transpose data 36558c2ecf20Sopenharmony_ci vpunpckhqdq $T3,$T2,$T3 36568c2ecf20Sopenharmony_ci vpmadd52luq $H1,$S2,$D0lo 36578c2ecf20Sopenharmony_ci vpmadd52huq $H1,$S2,$D0hi 36588c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R0,$D1lo 36598c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R0,$D1hi 36608c2ecf20Sopenharmony_ci vpmadd52luq $H1,$R1,$D2lo 36618c2ecf20Sopenharmony_ci vpmadd52huq $H1,$R1,$D2hi 36628c2ecf20Sopenharmony_ci 36638c2ecf20Sopenharmony_ci ################################################################ 36648c2ecf20Sopenharmony_ci # partial reduction (interleaved with data splat) 36658c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 36668c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 36678c2ecf20Sopenharmony_ci vpandq $mask44,$D0lo,$H0 36688c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 36698c2ecf20Sopenharmony_ci 36708c2ecf20Sopenharmony_ci vpsrlq \$24,$T3,$T2 36718c2ecf20Sopenharmony_ci vporq $PAD,$T2,$T2 36728c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 36738c2ecf20Sopenharmony_ci 36748c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 
36758c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 36768c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$H1 36778c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 36788c2ecf20Sopenharmony_ci 36798c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T0 36808c2ecf20Sopenharmony_ci vpsrlq \$44,$T1,$T1 36818c2ecf20Sopenharmony_ci vpsllq \$20,$T3,$T3 36828c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 36838c2ecf20Sopenharmony_ci 36848c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 36858c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 36868c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$H2 36878c2ecf20Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 36888c2ecf20Sopenharmony_ci 36898c2ecf20Sopenharmony_ci vpaddq $T2,$H2,$H2 # accumulate input 36908c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 36918c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 36928c2ecf20Sopenharmony_ci 36938c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 36948c2ecf20Sopenharmony_ci vporq $T3,$T1,$T1 36958c2ecf20Sopenharmony_ci vpandq $mask44,$T1,$T1 36968c2ecf20Sopenharmony_ci 36978c2ecf20Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 36988c2ecf20Sopenharmony_ci vpandq $mask44,$H0,$H0 36998c2ecf20Sopenharmony_ci 37008c2ecf20Sopenharmony_ci vpaddq $tmp,$H1,$H1 37018c2ecf20Sopenharmony_ci 37028c2ecf20Sopenharmony_ci sub \$8,$len # len-=128 37038c2ecf20Sopenharmony_ci jnz .Loop_vpmadd52_8x 37048c2ecf20Sopenharmony_ci 37058c2ecf20Sopenharmony_ci.Ltail_vpmadd52_8x: 37068c2ecf20Sopenharmony_ci #vpaddq $T2,$H2,$H2 # accumulate input 37078c2ecf20Sopenharmony_ci vpaddq $T0,$H0,$H0 37088c2ecf20Sopenharmony_ci vpaddq $T1,$H1,$H1 37098c2ecf20Sopenharmony_ci 37108c2ecf20Sopenharmony_ci vpxorq $D0lo,$D0lo,$D0lo 37118c2ecf20Sopenharmony_ci vpmadd52luq $H2,$SS1,$D0lo 37128c2ecf20Sopenharmony_ci vpxorq $D0hi,$D0hi,$D0hi 37138c2ecf20Sopenharmony_ci vpmadd52huq $H2,$SS1,$D0hi 37148c2ecf20Sopenharmony_ci vpxorq $D1lo,$D1lo,$D1lo 37158c2ecf20Sopenharmony_ci vpmadd52luq $H2,$SS2,$D1lo 37168c2ecf20Sopenharmony_ci vpxorq $D1hi,$D1hi,$D1hi 
37178c2ecf20Sopenharmony_ci vpmadd52huq $H2,$SS2,$D1hi 37188c2ecf20Sopenharmony_ci vpxorq $D2lo,$D2lo,$D2lo 37198c2ecf20Sopenharmony_ci vpmadd52luq $H2,$RR0,$D2lo 37208c2ecf20Sopenharmony_ci vpxorq $D2hi,$D2hi,$D2hi 37218c2ecf20Sopenharmony_ci vpmadd52huq $H2,$RR0,$D2hi 37228c2ecf20Sopenharmony_ci 37238c2ecf20Sopenharmony_ci vpmadd52luq $H0,$RR0,$D0lo 37248c2ecf20Sopenharmony_ci vpmadd52huq $H0,$RR0,$D0hi 37258c2ecf20Sopenharmony_ci vpmadd52luq $H0,$RR1,$D1lo 37268c2ecf20Sopenharmony_ci vpmadd52huq $H0,$RR1,$D1hi 37278c2ecf20Sopenharmony_ci vpmadd52luq $H0,$RR2,$D2lo 37288c2ecf20Sopenharmony_ci vpmadd52huq $H0,$RR2,$D2hi 37298c2ecf20Sopenharmony_ci 37308c2ecf20Sopenharmony_ci vpmadd52luq $H1,$SS2,$D0lo 37318c2ecf20Sopenharmony_ci vpmadd52huq $H1,$SS2,$D0hi 37328c2ecf20Sopenharmony_ci vpmadd52luq $H1,$RR0,$D1lo 37338c2ecf20Sopenharmony_ci vpmadd52huq $H1,$RR0,$D1hi 37348c2ecf20Sopenharmony_ci vpmadd52luq $H1,$RR1,$D2lo 37358c2ecf20Sopenharmony_ci vpmadd52huq $H1,$RR1,$D2hi 37368c2ecf20Sopenharmony_ci 37378c2ecf20Sopenharmony_ci ################################################################ 37388c2ecf20Sopenharmony_ci # horizontal addition 37398c2ecf20Sopenharmony_ci 37408c2ecf20Sopenharmony_ci mov \$1,%eax 37418c2ecf20Sopenharmony_ci kmovw %eax,%k1 37428c2ecf20Sopenharmony_ci vpsrldq \$8,$D0lo,$T0 37438c2ecf20Sopenharmony_ci vpsrldq \$8,$D0hi,$H0 37448c2ecf20Sopenharmony_ci vpsrldq \$8,$D1lo,$T1 37458c2ecf20Sopenharmony_ci vpsrldq \$8,$D1hi,$H1 37468c2ecf20Sopenharmony_ci vpaddq $T0,$D0lo,$D0lo 37478c2ecf20Sopenharmony_ci vpaddq $H0,$D0hi,$D0hi 37488c2ecf20Sopenharmony_ci vpsrldq \$8,$D2lo,$T2 37498c2ecf20Sopenharmony_ci vpsrldq \$8,$D2hi,$H2 37508c2ecf20Sopenharmony_ci vpaddq $T1,$D1lo,$D1lo 37518c2ecf20Sopenharmony_ci vpaddq $H1,$D1hi,$D1hi 37528c2ecf20Sopenharmony_ci vpermq \$0x2,$D0lo,$T0 37538c2ecf20Sopenharmony_ci vpermq \$0x2,$D0hi,$H0 37548c2ecf20Sopenharmony_ci vpaddq $T2,$D2lo,$D2lo 37558c2ecf20Sopenharmony_ci vpaddq $H2,$D2hi,$D2hi 
37568c2ecf20Sopenharmony_ci 37578c2ecf20Sopenharmony_ci vpermq \$0x2,$D1lo,$T1 37588c2ecf20Sopenharmony_ci vpermq \$0x2,$D1hi,$H1 37598c2ecf20Sopenharmony_ci vpaddq $T0,$D0lo,$D0lo 37608c2ecf20Sopenharmony_ci vpaddq $H0,$D0hi,$D0hi 37618c2ecf20Sopenharmony_ci vpermq \$0x2,$D2lo,$T2 37628c2ecf20Sopenharmony_ci vpermq \$0x2,$D2hi,$H2 37638c2ecf20Sopenharmony_ci vpaddq $T1,$D1lo,$D1lo 37648c2ecf20Sopenharmony_ci vpaddq $H1,$D1hi,$D1hi 37658c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D0lo,%y#$T0 37668c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D0hi,%y#$H0 37678c2ecf20Sopenharmony_ci vpaddq $T2,$D2lo,$D2lo 37688c2ecf20Sopenharmony_ci vpaddq $H2,$D2hi,$D2hi 37698c2ecf20Sopenharmony_ci 37708c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D1lo,%y#$T1 37718c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D1hi,%y#$H1 37728c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D2lo,%y#$T2 37738c2ecf20Sopenharmony_ci vextracti64x4 \$1,$D2hi,%y#$H2 37748c2ecf20Sopenharmony_ci___ 37758c2ecf20Sopenharmony_ci######## switch back to %ymm 37768c2ecf20Sopenharmony_cimap(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 37778c2ecf20Sopenharmony_cimap(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 37788c2ecf20Sopenharmony_cimap(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 37798c2ecf20Sopenharmony_ci 37808c2ecf20Sopenharmony_ci$code.=<<___; 37818c2ecf20Sopenharmony_ci vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 37828c2ecf20Sopenharmony_ci vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 37838c2ecf20Sopenharmony_ci vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 37848c2ecf20Sopenharmony_ci vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 37858c2ecf20Sopenharmony_ci vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 37868c2ecf20Sopenharmony_ci vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 37878c2ecf20Sopenharmony_ci 37888c2ecf20Sopenharmony_ci ################################################################ 37898c2ecf20Sopenharmony_ci # partial reduction 37908c2ecf20Sopenharmony_ci vpsrlq \$44,$D0lo,$tmp 37918c2ecf20Sopenharmony_ci vpsllq \$8,$D0hi,$D0hi 37928c2ecf20Sopenharmony_ci vpandq 
$mask44,$D0lo,$H0 37938c2ecf20Sopenharmony_ci vpaddq $tmp,$D0hi,$D0hi 37948c2ecf20Sopenharmony_ci 37958c2ecf20Sopenharmony_ci vpaddq $D0hi,$D1lo,$D1lo 37968c2ecf20Sopenharmony_ci 37978c2ecf20Sopenharmony_ci vpsrlq \$44,$D1lo,$tmp 37988c2ecf20Sopenharmony_ci vpsllq \$8,$D1hi,$D1hi 37998c2ecf20Sopenharmony_ci vpandq $mask44,$D1lo,$H1 38008c2ecf20Sopenharmony_ci vpaddq $tmp,$D1hi,$D1hi 38018c2ecf20Sopenharmony_ci 38028c2ecf20Sopenharmony_ci vpaddq $D1hi,$D2lo,$D2lo 38038c2ecf20Sopenharmony_ci 38048c2ecf20Sopenharmony_ci vpsrlq \$42,$D2lo,$tmp 38058c2ecf20Sopenharmony_ci vpsllq \$10,$D2hi,$D2hi 38068c2ecf20Sopenharmony_ci vpandq $mask42,$D2lo,$H2 38078c2ecf20Sopenharmony_ci vpaddq $tmp,$D2hi,$D2hi 38088c2ecf20Sopenharmony_ci 38098c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 38108c2ecf20Sopenharmony_ci vpsllq \$2,$D2hi,$D2hi 38118c2ecf20Sopenharmony_ci 38128c2ecf20Sopenharmony_ci vpaddq $D2hi,$H0,$H0 38138c2ecf20Sopenharmony_ci 38148c2ecf20Sopenharmony_ci vpsrlq \$44,$H0,$tmp # additional step 38158c2ecf20Sopenharmony_ci vpandq $mask44,$H0,$H0 38168c2ecf20Sopenharmony_ci 38178c2ecf20Sopenharmony_ci vpaddq $tmp,$H1,$H1 38188c2ecf20Sopenharmony_ci 38198c2ecf20Sopenharmony_ci ################################################################ 38208c2ecf20Sopenharmony_ci 38218c2ecf20Sopenharmony_ci vmovq %x#$H0,0($ctx) 38228c2ecf20Sopenharmony_ci vmovq %x#$H1,8($ctx) 38238c2ecf20Sopenharmony_ci vmovq %x#$H2,16($ctx) 38248c2ecf20Sopenharmony_ci vzeroall 38258c2ecf20Sopenharmony_ci 38268c2ecf20Sopenharmony_ci.Lno_data_vpmadd52_8x: 38278c2ecf20Sopenharmony_ci RET 38288c2ecf20Sopenharmony_ci.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 38298c2ecf20Sopenharmony_ci___ 38308c2ecf20Sopenharmony_ci} 38318c2ecf20Sopenharmony_ci$code.=<<___; 38328c2ecf20Sopenharmony_ci.type poly1305_emit_base2_44,\@function,3 38338c2ecf20Sopenharmony_ci.align 32 38348c2ecf20Sopenharmony_cipoly1305_emit_base2_44: 38358c2ecf20Sopenharmony_ci mov 0($ctx),%r8 # load hash value 
38368c2ecf20Sopenharmony_ci mov 8($ctx),%r9 38378c2ecf20Sopenharmony_ci mov 16($ctx),%r10 38388c2ecf20Sopenharmony_ci 38398c2ecf20Sopenharmony_ci mov %r9,%rax 38408c2ecf20Sopenharmony_ci shr \$20,%r9 38418c2ecf20Sopenharmony_ci shl \$44,%rax 38428c2ecf20Sopenharmony_ci mov %r10,%rcx 38438c2ecf20Sopenharmony_ci shr \$40,%r10 38448c2ecf20Sopenharmony_ci shl \$24,%rcx 38458c2ecf20Sopenharmony_ci 38468c2ecf20Sopenharmony_ci add %rax,%r8 38478c2ecf20Sopenharmony_ci adc %rcx,%r9 38488c2ecf20Sopenharmony_ci adc \$0,%r10 38498c2ecf20Sopenharmony_ci 38508c2ecf20Sopenharmony_ci mov %r8,%rax 38518c2ecf20Sopenharmony_ci add \$5,%r8 # compare to modulus: h+5 >= 2^130 iff h >= 2^130-5 38528c2ecf20Sopenharmony_ci mov %r9,%rcx 38538c2ecf20Sopenharmony_ci adc \$0,%r9 38548c2ecf20Sopenharmony_ci adc \$0,%r10 38558c2ecf20Sopenharmony_ci shr \$2,%r10 # did 130-bit value overflow? 38568c2ecf20Sopenharmony_ci cmovnz %r8,%rax # if so, take h+5 (equals h-(2^130-5) mod 2^128) 38578c2ecf20Sopenharmony_ci cmovnz %r9,%rcx 38588c2ecf20Sopenharmony_ci 38598c2ecf20Sopenharmony_ci add 0($nonce),%rax # accumulate nonce 38608c2ecf20Sopenharmony_ci adc 8($nonce),%rcx 38618c2ecf20Sopenharmony_ci mov %rax,0($mac) # write result (16 bytes at mac) 38628c2ecf20Sopenharmony_ci mov %rcx,8($mac) 38638c2ecf20Sopenharmony_ci 38648c2ecf20Sopenharmony_ci RET 38658c2ecf20Sopenharmony_ci.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 38668c2ecf20Sopenharmony_ci___ 38678c2ecf20Sopenharmony_ci} } } 38688c2ecf20Sopenharmony_ci} 38698c2ecf20Sopenharmony_ci 38708c2ecf20Sopenharmony_ciif (!$kernel) 38718c2ecf20Sopenharmony_ci{ # chacha20-poly1305 helpers 38728c2ecf20Sopenharmony_cimy ($out,$inp,$otp,$len)=$win64 ?
("%rcx","%rdx","%r8", "%r9") : # Win64 order 38738c2ecf20Sopenharmony_ci ("%rdi","%rsi","%rdx","%rcx"); # Unix order 38748c2ecf20Sopenharmony_ci$code.=<<___; 38758c2ecf20Sopenharmony_ci.globl xor128_encrypt_n_pad 38768c2ecf20Sopenharmony_ci.type xor128_encrypt_n_pad,\@abi-omnipotent 38778c2ecf20Sopenharmony_ci.align 16 38788c2ecf20Sopenharmony_cixor128_encrypt_n_pad: 38798c2ecf20Sopenharmony_ci sub $otp,$inp # inp and out become offsets from otp; only otp advances below 38808c2ecf20Sopenharmony_ci sub $otp,$out 38818c2ecf20Sopenharmony_ci mov $len,%r10 # put len aside 38828c2ecf20Sopenharmony_ci shr \$4,$len # len / 16 38838c2ecf20Sopenharmony_ci jz .Ltail_enc # NOTE(review): len == 0 also lands in .Ltail_enc with r10 == 0, so dec r10 wraps instead of reaching zero - callers presumably never pass 0; confirm 38848c2ecf20Sopenharmony_ci nop 38858c2ecf20Sopenharmony_ci.Loop_enc_xmm: 38868c2ecf20Sopenharmony_ci movdqu ($inp,$otp),%xmm0 38878c2ecf20Sopenharmony_ci pxor ($otp),%xmm0 38888c2ecf20Sopenharmony_ci movdqu %xmm0,($out,$otp) 38898c2ecf20Sopenharmony_ci movdqa %xmm0,($otp) # xor result also overwrites otp; aligned access, so otp must be 16-byte aligned 38908c2ecf20Sopenharmony_ci lea 16($otp),$otp 38918c2ecf20Sopenharmony_ci dec $len 38928c2ecf20Sopenharmony_ci jnz .Loop_enc_xmm 38938c2ecf20Sopenharmony_ci 38948c2ecf20Sopenharmony_ci and \$15,%r10 # len % 16 38958c2ecf20Sopenharmony_ci jz .Ldone_enc 38968c2ecf20Sopenharmony_ci 38978c2ecf20Sopenharmony_ci.Ltail_enc: 38988c2ecf20Sopenharmony_ci mov \$16,$len 38998c2ecf20Sopenharmony_ci sub %r10,$len # number of zero bytes needed to fill the last 16-byte block 39008c2ecf20Sopenharmony_ci xor %eax,%eax 39018c2ecf20Sopenharmony_ci.Loop_enc_byte: 39028c2ecf20Sopenharmony_ci mov ($inp,$otp),%al 39038c2ecf20Sopenharmony_ci xor ($otp),%al 39048c2ecf20Sopenharmony_ci mov %al,($out,$otp) 39058c2ecf20Sopenharmony_ci mov %al,($otp) 39068c2ecf20Sopenharmony_ci lea 1($otp),$otp 39078c2ecf20Sopenharmony_ci dec %r10 39088c2ecf20Sopenharmony_ci jnz .Loop_enc_byte 39098c2ecf20Sopenharmony_ci 39108c2ecf20Sopenharmony_ci xor %eax,%eax 39118c2ecf20Sopenharmony_ci.Loop_enc_pad: 39128c2ecf20Sopenharmony_ci mov %al,($otp) # zero-fill the rest of the block at otp 39138c2ecf20Sopenharmony_ci lea 1($otp),$otp 39148c2ecf20Sopenharmony_ci dec $len 39158c2ecf20Sopenharmony_ci jnz .Loop_enc_pad 39168c2ecf20Sopenharmony_ci
39178c2ecf20Sopenharmony_ci.Ldone_enc: 39188c2ecf20Sopenharmony_ci mov $otp,%rax 39198c2ecf20Sopenharmony_ci RET 39208c2ecf20Sopenharmony_ci.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 39218c2ecf20Sopenharmony_ci 39228c2ecf20Sopenharmony_ci.globl xor128_decrypt_n_pad 39238c2ecf20Sopenharmony_ci.type xor128_decrypt_n_pad,\@abi-omnipotent 39248c2ecf20Sopenharmony_ci.align 16 39258c2ecf20Sopenharmony_cixor128_decrypt_n_pad: 39268c2ecf20Sopenharmony_ci sub $otp,$inp 39278c2ecf20Sopenharmony_ci sub $otp,$out 39288c2ecf20Sopenharmony_ci mov $len,%r10 # put len aside 39298c2ecf20Sopenharmony_ci shr \$4,$len # len / 16 39308c2ecf20Sopenharmony_ci jz .Ltail_dec # NOTE(review): len == 0 also lands in .Ltail_dec with r10 == 0, so dec r10 wraps instead of reaching zero - callers presumably never pass 0; confirm 39318c2ecf20Sopenharmony_ci nop 39328c2ecf20Sopenharmony_ci.Loop_dec_xmm: 39338c2ecf20Sopenharmony_ci movdqu ($inp,$otp),%xmm0 39348c2ecf20Sopenharmony_ci movdqa ($otp),%xmm1 39358c2ecf20Sopenharmony_ci pxor %xmm0,%xmm1 39368c2ecf20Sopenharmony_ci movdqu %xmm1,($out,$otp) 39378c2ecf20Sopenharmony_ci movdqa %xmm0,($otp) # otp receives the input block, not the xor result (unlike the encrypt helper) 39388c2ecf20Sopenharmony_ci lea 16($otp),$otp 39398c2ecf20Sopenharmony_ci dec $len 39408c2ecf20Sopenharmony_ci jnz .Loop_dec_xmm 39418c2ecf20Sopenharmony_ci 39428c2ecf20Sopenharmony_ci pxor %xmm1,%xmm1 # clear the last xor result from xmm1 39438c2ecf20Sopenharmony_ci and \$15,%r10 # len % 16 39448c2ecf20Sopenharmony_ci jz .Ldone_dec 39458c2ecf20Sopenharmony_ci 39468c2ecf20Sopenharmony_ci.Ltail_dec: 39478c2ecf20Sopenharmony_ci mov \$16,$len 39488c2ecf20Sopenharmony_ci sub %r10,$len # number of zero bytes needed to fill the last 16-byte block 39498c2ecf20Sopenharmony_ci xor %eax,%eax 39508c2ecf20Sopenharmony_ci xor %r11d,%r11d 39518c2ecf20Sopenharmony_ci.Loop_dec_byte: 39528c2ecf20Sopenharmony_ci mov ($inp,$otp),%r11b 39538c2ecf20Sopenharmony_ci mov ($otp),%al 39548c2ecf20Sopenharmony_ci xor %r11b,%al 39558c2ecf20Sopenharmony_ci mov %al,($out,$otp) 39568c2ecf20Sopenharmony_ci mov %r11b,($otp) # input byte replaces the otp byte 39578c2ecf20Sopenharmony_ci lea 1($otp),$otp 39588c2ecf20Sopenharmony_ci dec %r10 39598c2ecf20Sopenharmony_ci jnz .Loop_dec_byte 39608c2ecf20Sopenharmony_ci 39618c2ecf20Sopenharmony_ci xor %eax,%eax
.Loop_dec_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_dec_pad

.Ldone_dec:
	mov	$otp,%rax
	RET
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}

# Win64 structured-exception-handling (SEH) support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two language-specific handlers are emitted; both share .Lcommon_seh_tail:
#
#   se_handler  - if context->Rip lies in [HandlerData[0], HandlerData[1])
#                 (both are image-relative, hence the ImageBase add), it
#                 takes context->Rsp, steps over 48 bytes and writes the six
#                 saved registers (rbx, rbp, r12-r15) found there back into
#                 the CONTEXT record.
#   avx_handler - same range check, but the frame is located through
#                 context->R11 and 20 quadwords are copied from 0x50 past it
#                 to offset 512 of the CONTEXT record (Xmm6 onwards).
#
# The common tail stores the recovered Rsp/Rsi/Rdi into the CONTEXT record,
# copies the record (154 quadwords) to disp->ContextRecord, calls
# RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
if ($win64) {
# Handler arguments in Win64 calling-convention order.  Only $context and
# $disp are referenced by the assembly below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	RET
.size	avx_handler,.-avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_init_x86_64
	.rva	.LSEH_end_poly1305_init_x86_64
	.rva	.LSEH_info_poly1305_init_x86_64

	.rva	.LSEH_begin_poly1305_blocks_x86_64
	.rva	.LSEH_end_poly1305_blocks_x86_64
	.rva	.LSEH_info_poly1305_blocks_x86_64

	.rva	.LSEH_begin_poly1305_emit_x86_64
	.rva	.LSEH_end_poly1305_emit_x86_64
	.rva	.LSEH_info_poly1305_emit_x86_64
___
# .pdata entries are (begin, end, info) RVA triples.  poly1305_blocks_avx is
# split into three consecutive ranges, each with its own .xdata record.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
# Same three-way split for poly1305_blocks_avx2.
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_poly1305_blocks_avx512
	.rva	.LSEH_end_poly1305_blocks_avx512
	.rva	.LSEH_info_poly1305_blocks_avx512
___
# .xdata records: a 4-byte header, the handler RVA, then the two RVAs the
# handlers read as HandlerData[0] (prologue label) and HandlerData[1]
# (epilogue label).
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_poly1305_init_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64

.LSEH_info_poly1305_blocks_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
42018c2ecf20Sopenharmony_ci .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 42028c2ecf20Sopenharmony_ci___ 42038c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>1); 42048c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_1: 42058c2ecf20Sopenharmony_ci .byte 9,0,0,0 42068c2ecf20Sopenharmony_ci .rva se_handler 42078c2ecf20Sopenharmony_ci .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 42088c2ecf20Sopenharmony_ci 42098c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_2: 42108c2ecf20Sopenharmony_ci .byte 9,0,0,0 42118c2ecf20Sopenharmony_ci .rva se_handler 42128c2ecf20Sopenharmony_ci .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 42138c2ecf20Sopenharmony_ci 42148c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_3: 42158c2ecf20Sopenharmony_ci .byte 9,0,0,0 42168c2ecf20Sopenharmony_ci .rva avx_handler 42178c2ecf20Sopenharmony_ci .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 42188c2ecf20Sopenharmony_ci___ 42198c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>2); 42208c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx512: 42218c2ecf20Sopenharmony_ci .byte 9,0,0,0 42228c2ecf20Sopenharmony_ci .rva avx_handler 42238c2ecf20Sopenharmony_ci .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 42248c2ecf20Sopenharmony_ci___ 42258c2ecf20Sopenharmony_ci} 42268c2ecf20Sopenharmony_ci 42278c2ecf20Sopenharmony_ciopen SELF,$0; 42288c2ecf20Sopenharmony_ciwhile(<SELF>) { 42298c2ecf20Sopenharmony_ci next if (/^#!/); 42308c2ecf20Sopenharmony_ci last if (!s/^#/\/\// and !/^$/); 42318c2ecf20Sopenharmony_ci print; 42328c2ecf20Sopenharmony_ci} 42338c2ecf20Sopenharmony_ciclose SELF; 42348c2ecf20Sopenharmony_ci 42358c2ecf20Sopenharmony_ciforeach (split('\n',$code)) { 42368c2ecf20Sopenharmony_ci s/\`([^\`]*)\`/eval($1)/ge; 42378c2ecf20Sopenharmony_ci s/%r([a-z]+)#d/%e$1/g; 42388c2ecf20Sopenharmony_ci s/%r([0-9]+)#d/%r$1d/g; 42398c2ecf20Sopenharmony_ci s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 42408c2ecf20Sopenharmony_ci 
42418c2ecf20Sopenharmony_ci if ($kernel) { 42428c2ecf20Sopenharmony_ci s/(^\.type.*),[0-9]+$/\1/; 42438c2ecf20Sopenharmony_ci s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; 42448c2ecf20Sopenharmony_ci next if /^\.cfi.*/; 42458c2ecf20Sopenharmony_ci } 42468c2ecf20Sopenharmony_ci 42478c2ecf20Sopenharmony_ci print $_,"\n"; 42488c2ecf20Sopenharmony_ci} 42498c2ecf20Sopenharmony_ciclose STDOUT; 4250