#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

# Command line: [flavour] [output].  When the first argument contains a
# dot it is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64:  the flavour names nasm/masm/mingw64 or the output ends in ".asm".
# $kernel: neither a flavour nor an output file was given.  In this mode
#          the translator pipe below is not opened and the SYM_FUNC_*
#          macros are emitted instead of raw .globl/.type directives.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

# $avx is a capability level derived from the assembler version.  The code
# further down tests it as: non-zero -> AVX code is generated, >1 -> the
# AVX2 dispatch is generated, >3 -> the base 2^44 dispatch is generated.
if (!$kernel) {
	# Look for the perlasm translator next to this script first, then in
	# ../../perlasm relative to it.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything printed to STDOUT from here on is piped through the
	# translator.  Fail loudly if the pipe cannot be set up instead of
	# carrying on with an unopened handle.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# GNU assembler, probed through the C compiler driver.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	# NASM, only consulted for win64 targets when nothing was detected yet.
	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	# Microsoft ml64.
	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	# clang / LLVM integrated assembler.
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

# declare_function($name, $align, $nargs)
#
# Appends the opening boilerplate of an assembly function to $code.
#
#   $name   symbol name of the function
#   $align  alignment passed to the .align directive
#   $nargs  argument count; only used in the non-kernel .type directive
#
# In kernel mode this emits ".align", SYM_FUNC_START($name) and a local
# ".L$name" label.  Otherwise it emits .globl/.type/.align followed by
# the "$name:" label.
#
# No prototype is declared: the sub takes three arguments, so an empty
# "()" prototype would reject any call not written with a leading "&".
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}

# end_function($name)
#
# Appends the closing boilerplate of an assembly function to $code:
# SYM_FUNC_END($name) in kernel mode, a ".size" directive otherwise.
#
# No prototype is declared: the sub takes one argument, so an empty "()"
# prototype would reject any call not written with a leading "&".
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size   $name,.-$name\n";
	}
}

# Kernel builds pull in the linkage macros used by declare_function() and
# end_function().
$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

# Constant tables referenced by the vector code paths; only emitted when
# the assembler supports AVX at all.  Kernel builds place them in .rodata.
# The backtick expressions are left in the text as-is here; they are not
# evaluated by these statements.
if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
# Identification string, emitted for non-kernel builds only.
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Register assignments for the scalar code.
#   $ctx,$inp,$len,$padbit  the four argument registers of *_blocks
#   $mac,$nonce             reuse $inp,$len as the *_emit arguments
#   $d1..$d3                product accumulators; note that $d3 is %rdi,
#                           i.e. the same register as $ctx
#   $r0,$r1,$s1             key words and s1 = r1 + (r1 >> 2)
#   $h0..$h2                hash accumulator
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# poly1305_iteration()
#
# Appends one multiply-and-reduce step to $code.  Takes no arguments; the
# registers used are the lexical aliases declared above.  Because $d3 is
# %rdi, the emitted code overwrites $ctx, so callers that still need the
# context pointer have to save it around this sequence.
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	 mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	 mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	 mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	 mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	 mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
# Non-kernel builds reference the capability vector and export the three
# scalar entry points as hidden globals.
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
# poly1305_init_x86_64: zeroes h[0..2] in the context.  With a zero key
# pointer ($inp) it returns right away with %eax still 0; otherwise it
# stores the masked key words at 24($ctx) and 32($ctx) and returns 1 in
# %eax.
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
# Non-kernel builds additionally pick the blocks/emit implementations at
# run time from OPENSSL_ia32cap_P, in %r10/%r11 ...
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___	if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___	if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
# ... and store the two selected pointers through the third argument
# (%rdx): 64-bit slots normally, 32-bit slots for the elf32 flavour.
$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_init_x86_64");

# poly1305_blocks_x86_64: scalar block function.  $len is shifted right
# by 4 to obtain the number of 16-byte blocks (returning at once when that
# is zero); every loop pass adds one input block plus $padbit to h and
# runs poly1305_iteration.  $ctx is pushed last and reloaded from 0(%rsp)
# after the loop because the iteration uses %rdi as $d3.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

# poly1305_emit_x86_64: loads h from the context, adds 5 and uses the
# result instead of h when bit 2 of the top word ends up set.  It then
# adds the 128-bit value at $nonce and writes 16 bytes to $mac ($mac and
# $nonce are the second and third argument registers).
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
# Everything below is generated only when the assembler supports AVX.
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

# Vector register aliases: %xmm0 .. %xmm15 in declaration order.
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

# __poly1305_block: poly1305_iteration wrapped in push/pop of $ctx, since
# the iteration overwrites %rdi (it doubles as $d3).
#
# __poly1305_init_avx: starts from h = r and calls __poly1305_block three
# times to obtain r^2, r^3 and r^4.  After each step the value is split
# into base 2^26 digits and stored, together with the 5x multiples of
# digits 1..4, into the 16-byte-stride table addressed relative to $ctx
# (biased by 48+64 for the duration of the routine).  Column offsets are
# +0 for r^2, +4 for r^1, +8 for r^4 and +12 for r^3.  It reads $r0, $r1
# and, through the iteration, $s1 -- presumably already loaded by the
# caller; confirm against the call sites.
$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push $ctx
___
	&poly1305_iteration();
$code.=<<___;
	pop $ctx
	RET
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
	push %rbp
	mov %rsp,%rbp
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+12-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+12-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^4

	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+8-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+8-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)

	lea	-48-64($ctx),$ctx	# size [de-]optimization
	pop %rbp
	RET
.size	__poly1305_init_avx,.-__poly1305_init_avx
___

6018c2ecf20Sopenharmony_ci&declare_function("poly1305_blocks_avx", 32, 4);
6028c2ecf20Sopenharmony_ci$code.=<<___;
6038c2ecf20Sopenharmony_ci.cfi_startproc
6048c2ecf20Sopenharmony_ci	mov	20($ctx),%r8d		# is_base2_26
6058c2ecf20Sopenharmony_ci	cmp	\$128,$len
6068c2ecf20Sopenharmony_ci	jae	.Lblocks_avx
6078c2ecf20Sopenharmony_ci	test	%r8d,%r8d
6088c2ecf20Sopenharmony_ci	jz	.Lblocks
6098c2ecf20Sopenharmony_ci
6108c2ecf20Sopenharmony_ci.Lblocks_avx:
6118c2ecf20Sopenharmony_ci	and	\$-16,$len
6128c2ecf20Sopenharmony_ci	jz	.Lno_data_avx
6138c2ecf20Sopenharmony_ci
6148c2ecf20Sopenharmony_ci	vzeroupper
6158c2ecf20Sopenharmony_ci
6168c2ecf20Sopenharmony_ci	test	%r8d,%r8d
6178c2ecf20Sopenharmony_ci	jz	.Lbase2_64_avx
6188c2ecf20Sopenharmony_ci
6198c2ecf20Sopenharmony_ci	test	\$31,$len
6208c2ecf20Sopenharmony_ci	jz	.Leven_avx
6218c2ecf20Sopenharmony_ci
6228c2ecf20Sopenharmony_ci	push	%rbp
6238c2ecf20Sopenharmony_ci.cfi_push	%rbp
6248c2ecf20Sopenharmony_ci	mov 	%rsp,%rbp
6258c2ecf20Sopenharmony_ci	push	%rbx
6268c2ecf20Sopenharmony_ci.cfi_push	%rbx
6278c2ecf20Sopenharmony_ci	push	%r12
6288c2ecf20Sopenharmony_ci.cfi_push	%r12
6298c2ecf20Sopenharmony_ci	push	%r13
6308c2ecf20Sopenharmony_ci.cfi_push	%r13
6318c2ecf20Sopenharmony_ci	push	%r14
6328c2ecf20Sopenharmony_ci.cfi_push	%r14
6338c2ecf20Sopenharmony_ci	push	%r15
6348c2ecf20Sopenharmony_ci.cfi_push	%r15
6358c2ecf20Sopenharmony_ci.Lblocks_avx_body:
6368c2ecf20Sopenharmony_ci
6378c2ecf20Sopenharmony_ci	mov	$len,%r15		# reassign $len
6388c2ecf20Sopenharmony_ci
6398c2ecf20Sopenharmony_ci	mov	0($ctx),$d1		# load hash value
6408c2ecf20Sopenharmony_ci	mov	8($ctx),$d2
6418c2ecf20Sopenharmony_ci	mov	16($ctx),$h2#d
6428c2ecf20Sopenharmony_ci
6438c2ecf20Sopenharmony_ci	mov	24($ctx),$r0		# load r
6448c2ecf20Sopenharmony_ci	mov	32($ctx),$s1
6458c2ecf20Sopenharmony_ci
6468c2ecf20Sopenharmony_ci	################################# base 2^26 -> base 2^64
6478c2ecf20Sopenharmony_ci	mov	$d1#d,$h0#d
6488c2ecf20Sopenharmony_ci	and	\$`-1*(1<<31)`,$d1
6498c2ecf20Sopenharmony_ci	mov	$d2,$r1			# borrow $r1
6508c2ecf20Sopenharmony_ci	mov	$d2#d,$h1#d
6518c2ecf20Sopenharmony_ci	and	\$`-1*(1<<31)`,$d2
6528c2ecf20Sopenharmony_ci
6538c2ecf20Sopenharmony_ci	shr	\$6,$d1
6548c2ecf20Sopenharmony_ci	shl	\$52,$r1
6558c2ecf20Sopenharmony_ci	add	$d1,$h0
6568c2ecf20Sopenharmony_ci	shr	\$12,$h1
6578c2ecf20Sopenharmony_ci	shr	\$18,$d2
6588c2ecf20Sopenharmony_ci	add	$r1,$h0
6598c2ecf20Sopenharmony_ci	adc	$d2,$h1
6608c2ecf20Sopenharmony_ci
6618c2ecf20Sopenharmony_ci	mov	$h2,$d1
6628c2ecf20Sopenharmony_ci	shl	\$40,$d1
6638c2ecf20Sopenharmony_ci	shr	\$24,$h2
6648c2ecf20Sopenharmony_ci	add	$d1,$h1
6658c2ecf20Sopenharmony_ci	adc	\$0,$h2			# can be partially reduced...
6668c2ecf20Sopenharmony_ci
6678c2ecf20Sopenharmony_ci	mov	\$-4,$d2		# ... so reduce
6688c2ecf20Sopenharmony_ci	mov	$h2,$d1
6698c2ecf20Sopenharmony_ci	and	$h2,$d2
6708c2ecf20Sopenharmony_ci	shr	\$2,$d1
6718c2ecf20Sopenharmony_ci	and	\$3,$h2
6728c2ecf20Sopenharmony_ci	add	$d2,$d1			# =*5
6738c2ecf20Sopenharmony_ci	add	$d1,$h0
6748c2ecf20Sopenharmony_ci	adc	\$0,$h1
6758c2ecf20Sopenharmony_ci	adc	\$0,$h2
6768c2ecf20Sopenharmony_ci
6778c2ecf20Sopenharmony_ci	mov	$s1,$r1
6788c2ecf20Sopenharmony_ci	mov	$s1,%rax
6798c2ecf20Sopenharmony_ci	shr	\$2,$s1
6808c2ecf20Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
6818c2ecf20Sopenharmony_ci
6828c2ecf20Sopenharmony_ci	add	0($inp),$h0		# accumulate input
6838c2ecf20Sopenharmony_ci	adc	8($inp),$h1
6848c2ecf20Sopenharmony_ci	lea	16($inp),$inp
6858c2ecf20Sopenharmony_ci	adc	$padbit,$h2
6868c2ecf20Sopenharmony_ci
6878c2ecf20Sopenharmony_ci	call	__poly1305_block
6888c2ecf20Sopenharmony_ci
6898c2ecf20Sopenharmony_ci	test	$padbit,$padbit		# if $padbit is zero,
6908c2ecf20Sopenharmony_ci	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
6918c2ecf20Sopenharmony_ci
6928c2ecf20Sopenharmony_ci	################################# base 2^64 -> base 2^26
6938c2ecf20Sopenharmony_ci	mov	$h0,%rax
6948c2ecf20Sopenharmony_ci	mov	$h0,%rdx
6958c2ecf20Sopenharmony_ci	shr	\$52,$h0
6968c2ecf20Sopenharmony_ci	mov	$h1,$r0
6978c2ecf20Sopenharmony_ci	mov	$h1,$r1
6988c2ecf20Sopenharmony_ci	shr	\$26,%rdx
6998c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
7008c2ecf20Sopenharmony_ci	shl	\$12,$r0
7018c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
7028c2ecf20Sopenharmony_ci	shr	\$14,$h1
7038c2ecf20Sopenharmony_ci	or	$r0,$h0
7048c2ecf20Sopenharmony_ci	shl	\$24,$h2
7058c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
7068c2ecf20Sopenharmony_ci	shr	\$40,$r1
7078c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
7088c2ecf20Sopenharmony_ci	or	$r1,$h2			# h[4]
7098c2ecf20Sopenharmony_ci
7108c2ecf20Sopenharmony_ci	sub	\$16,%r15
7118c2ecf20Sopenharmony_ci	jz	.Lstore_base2_26_avx
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci	vmovd	%rax#d,$H0
7148c2ecf20Sopenharmony_ci	vmovd	%rdx#d,$H1
7158c2ecf20Sopenharmony_ci	vmovd	$h0#d,$H2
7168c2ecf20Sopenharmony_ci	vmovd	$h1#d,$H3
7178c2ecf20Sopenharmony_ci	vmovd	$h2#d,$H4
7188c2ecf20Sopenharmony_ci	jmp	.Lproceed_avx
7198c2ecf20Sopenharmony_ci
7208c2ecf20Sopenharmony_ci.align	32
7218c2ecf20Sopenharmony_ci.Lstore_base2_64_avx:
7228c2ecf20Sopenharmony_ci	mov	$h0,0($ctx)
7238c2ecf20Sopenharmony_ci	mov	$h1,8($ctx)
7248c2ecf20Sopenharmony_ci	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
7258c2ecf20Sopenharmony_ci	jmp	.Ldone_avx
7268c2ecf20Sopenharmony_ci
7278c2ecf20Sopenharmony_ci.align	16
7288c2ecf20Sopenharmony_ci.Lstore_base2_26_avx:
7298c2ecf20Sopenharmony_ci	mov	%rax#d,0($ctx)		# store hash value base 2^26
7308c2ecf20Sopenharmony_ci	mov	%rdx#d,4($ctx)
7318c2ecf20Sopenharmony_ci	mov	$h0#d,8($ctx)
7328c2ecf20Sopenharmony_ci	mov	$h1#d,12($ctx)
7338c2ecf20Sopenharmony_ci	mov	$h2#d,16($ctx)
7348c2ecf20Sopenharmony_ci.align	16
7358c2ecf20Sopenharmony_ci.Ldone_avx:
7368c2ecf20Sopenharmony_ci	pop 		%r15
7378c2ecf20Sopenharmony_ci.cfi_restore	%r15
7388c2ecf20Sopenharmony_ci	pop 		%r14
7398c2ecf20Sopenharmony_ci.cfi_restore	%r14
7408c2ecf20Sopenharmony_ci	pop 		%r13
7418c2ecf20Sopenharmony_ci.cfi_restore	%r13
7428c2ecf20Sopenharmony_ci	pop 		%r12
7438c2ecf20Sopenharmony_ci.cfi_restore	%r12
7448c2ecf20Sopenharmony_ci	pop 		%rbx
7458c2ecf20Sopenharmony_ci.cfi_restore	%rbx
7468c2ecf20Sopenharmony_ci	pop 		%rbp
7478c2ecf20Sopenharmony_ci.cfi_restore	%rbp
7488c2ecf20Sopenharmony_ci.Lno_data_avx:
7498c2ecf20Sopenharmony_ci.Lblocks_avx_epilogue:
7508c2ecf20Sopenharmony_ci	RET
7518c2ecf20Sopenharmony_ci.cfi_endproc
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci.align	32
7548c2ecf20Sopenharmony_ci.Lbase2_64_avx:			# hash in ctx is three base 2^64 words: convert it to base 2^26, then join .Ldo_avx
7558c2ecf20Sopenharmony_ci.cfi_startproc
7568c2ecf20Sopenharmony_ci	push	%rbp
7578c2ecf20Sopenharmony_ci.cfi_push	%rbp
7588c2ecf20Sopenharmony_ci	mov 	%rsp,%rbp		# frame pointer
7598c2ecf20Sopenharmony_ci	push	%rbx			# rbx and r12-r15 are preserved here and popped again at .Lproceed_avx
7608c2ecf20Sopenharmony_ci.cfi_push	%rbx
7618c2ecf20Sopenharmony_ci	push	%r12
7628c2ecf20Sopenharmony_ci.cfi_push	%r12
7638c2ecf20Sopenharmony_ci	push	%r13
7648c2ecf20Sopenharmony_ci.cfi_push	%r13
7658c2ecf20Sopenharmony_ci	push	%r14
7668c2ecf20Sopenharmony_ci.cfi_push	%r14
7678c2ecf20Sopenharmony_ci	push	%r15
7688c2ecf20Sopenharmony_ci.cfi_push	%r15
7698c2ecf20Sopenharmony_ci.Lbase2_64_avx_body:
7708c2ecf20Sopenharmony_ci
7718c2ecf20Sopenharmony_ci	mov	$len,%r15		# reassign $len
7728c2ecf20Sopenharmony_ci
7738c2ecf20Sopenharmony_ci	mov	24($ctx),$r0		# load r
7748c2ecf20Sopenharmony_ci	mov	32($ctx),$s1		# second word of r, parked in the s1 register for now
7758c2ecf20Sopenharmony_ci
7768c2ecf20Sopenharmony_ci	mov	0($ctx),$h0		# load hash value
7778c2ecf20Sopenharmony_ci	mov	8($ctx),$h1
7788c2ecf20Sopenharmony_ci	mov	16($ctx),$h2#d
7798c2ecf20Sopenharmony_ci
7808c2ecf20Sopenharmony_ci	mov	$s1,$r1			# move the second word of r to its own register
7818c2ecf20Sopenharmony_ci	mov	$s1,%rax		# NOTE(review): extra copy in rax, presumably an input of __poly1305_block; confirm there
7828c2ecf20Sopenharmony_ci	shr	\$2,$s1
7838c2ecf20Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
7848c2ecf20Sopenharmony_ci
7858c2ecf20Sopenharmony_ci	test	\$31,$len		# is the length a multiple of 32?
7868c2ecf20Sopenharmony_ci	jz	.Linit_avx		# yes: skip the single 16-byte block step below
7878c2ecf20Sopenharmony_ci
7888c2ecf20Sopenharmony_ci	add	0($inp),$h0		# accumulate input
7898c2ecf20Sopenharmony_ci	adc	8($inp),$h1
7908c2ecf20Sopenharmony_ci	lea	16($inp),$inp		# advance past the 16 bytes just added (lea leaves the carry flag alone)
7918c2ecf20Sopenharmony_ci	adc	$padbit,$h2		# carry out of the middle word plus padbit go into the top word
7928c2ecf20Sopenharmony_ci	sub	\$16,%r15		# 16 bytes consumed from the saved length
7938c2ecf20Sopenharmony_ci
7948c2ecf20Sopenharmony_ci	call	__poly1305_block	# NOTE(review): defined outside this chunk; presumably multiplies the hash by r and reduces
7958c2ecf20Sopenharmony_ci
7968c2ecf20Sopenharmony_ci.Linit_avx:
7978c2ecf20Sopenharmony_ci	################################# base 2^64 -> base 2^26
7988c2ecf20Sopenharmony_ci	mov	$h0,%rax		# two working copies of the low word
7998c2ecf20Sopenharmony_ci	mov	$h0,%rdx
8008c2ecf20Sopenharmony_ci	shr	\$52,$h0		# keep the top 12 bits of the low word
8018c2ecf20Sopenharmony_ci	mov	$h1,$d1			# two working copies of the middle word
8028c2ecf20Sopenharmony_ci	mov	$h1,$d2
8038c2ecf20Sopenharmony_ci	shr	\$26,%rdx		# bits 26 and up of the low word
8048c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
8058c2ecf20Sopenharmony_ci	shl	\$12,$d1		# middle word shifted up to sit above those 12 bits
8068c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
8078c2ecf20Sopenharmony_ci	shr	\$14,$h1		# bits 14 and up of the middle word
8088c2ecf20Sopenharmony_ci	or	$d1,$h0			# join the 12 low-word bits with the middle-word bits
8098c2ecf20Sopenharmony_ci	shl	\$24,$h2		# top word supplies bits 24 and up of h[4]
8108c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
8118c2ecf20Sopenharmony_ci	shr	\$40,$d2		# top 24 bits of the middle word
8128c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
8138c2ecf20Sopenharmony_ci	or	$d2,$h2			# h[4]
8148c2ecf20Sopenharmony_ci
8158c2ecf20Sopenharmony_ci	vmovd	%rax#d,$H0
8168c2ecf20Sopenharmony_ci	vmovd	%rdx#d,$H1
8178c2ecf20Sopenharmony_ci	vmovd	$h0#d,$H2
8188c2ecf20Sopenharmony_ci	vmovd	$h1#d,$H3
8198c2ecf20Sopenharmony_ci	vmovd	$h2#d,$H4
8208c2ecf20Sopenharmony_ci	movl	\$1,20($ctx)		# set is_base2_26
8218c2ecf20Sopenharmony_ci
8228c2ecf20Sopenharmony_ci	call	__poly1305_init_avx	# NOTE(review): defined outside this chunk; presumably fills the r-power table that .Ldo_avx reads from ctx
8238c2ecf20Sopenharmony_ci
8248c2ecf20Sopenharmony_ci.Lproceed_avx:				# also entered by a jmp from the code just before .Lstore_base2_64_avx
8258c2ecf20Sopenharmony_ci	mov	%r15,$len		# hand the remaining length back
8268c2ecf20Sopenharmony_ci	pop 		%r15		# restore in reverse push order
8278c2ecf20Sopenharmony_ci.cfi_restore	%r15
8288c2ecf20Sopenharmony_ci	pop 		%r14
8298c2ecf20Sopenharmony_ci.cfi_restore	%r14
8308c2ecf20Sopenharmony_ci	pop 		%r13
8318c2ecf20Sopenharmony_ci.cfi_restore	%r13
8328c2ecf20Sopenharmony_ci	pop 		%r12
8338c2ecf20Sopenharmony_ci.cfi_restore	%r12
8348c2ecf20Sopenharmony_ci	pop 		%rbx
8358c2ecf20Sopenharmony_ci.cfi_restore	%rbx
8368c2ecf20Sopenharmony_ci	pop 		%rbp
8378c2ecf20Sopenharmony_ci.cfi_restore	%rbp
8388c2ecf20Sopenharmony_ci.Lbase2_64_avx_epilogue:
8398c2ecf20Sopenharmony_ci	jmp	.Ldo_avx		# stack is balanced again; continue in the vector code
8408c2ecf20Sopenharmony_ci.cfi_endproc
8418c2ecf20Sopenharmony_ci
8428c2ecf20Sopenharmony_ci.align	32
8438c2ecf20Sopenharmony_ci.Leven_avx:
8448c2ecf20Sopenharmony_ci.cfi_startproc
8458c2ecf20Sopenharmony_ci	vmovd		4*0($ctx),$H0		# load hash value
8468c2ecf20Sopenharmony_ci	vmovd		4*1($ctx),$H1
8478c2ecf20Sopenharmony_ci	vmovd		4*2($ctx),$H2
8488c2ecf20Sopenharmony_ci	vmovd		4*3($ctx),$H3
8498c2ecf20Sopenharmony_ci	vmovd		4*4($ctx),$H4
8508c2ecf20Sopenharmony_ci
8518c2ecf20Sopenharmony_ci.Ldo_avx:
8528c2ecf20Sopenharmony_ci___
8538c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
8548c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10
8558c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%r10
8568c2ecf20Sopenharmony_ci	and		\$-32,%rsp
8578c2ecf20Sopenharmony_ci	sub		\$-8,%rsp
8588c2ecf20Sopenharmony_ci	lea		-0x58(%rsp),%r11
8598c2ecf20Sopenharmony_ci	sub		\$0x178,%rsp
8608c2ecf20Sopenharmony_ci___
8618c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
8628c2ecf20Sopenharmony_ci	lea		-0xf8(%rsp),%r11
8638c2ecf20Sopenharmony_ci	sub		\$0x218,%rsp
8648c2ecf20Sopenharmony_ci	vmovdqa		%xmm6,0x50(%r11)
8658c2ecf20Sopenharmony_ci	vmovdqa		%xmm7,0x60(%r11)
8668c2ecf20Sopenharmony_ci	vmovdqa		%xmm8,0x70(%r11)
8678c2ecf20Sopenharmony_ci	vmovdqa		%xmm9,0x80(%r11)
8688c2ecf20Sopenharmony_ci	vmovdqa		%xmm10,0x90(%r11)
8698c2ecf20Sopenharmony_ci	vmovdqa		%xmm11,0xa0(%r11)
8708c2ecf20Sopenharmony_ci	vmovdqa		%xmm12,0xb0(%r11)
8718c2ecf20Sopenharmony_ci	vmovdqa		%xmm13,0xc0(%r11)
8728c2ecf20Sopenharmony_ci	vmovdqa		%xmm14,0xd0(%r11)
8738c2ecf20Sopenharmony_ci	vmovdqa		%xmm15,0xe0(%r11)
8748c2ecf20Sopenharmony_ci.Ldo_avx_body:
8758c2ecf20Sopenharmony_ci___
8768c2ecf20Sopenharmony_ci$code.=<<___;
8778c2ecf20Sopenharmony_ci	sub		\$64,$len
8788c2ecf20Sopenharmony_ci	lea		-32($inp),%rax
8798c2ecf20Sopenharmony_ci	cmovc		%rax,$inp
8808c2ecf20Sopenharmony_ci
8818c2ecf20Sopenharmony_ci	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
8828c2ecf20Sopenharmony_ci	lea		`16*3+64`($ctx),$ctx	# size optimization
8838c2ecf20Sopenharmony_ci	lea		.Lconst(%rip),%rcx
8848c2ecf20Sopenharmony_ci
8858c2ecf20Sopenharmony_ci	################################################################
8868c2ecf20Sopenharmony_ci	# load input
8878c2ecf20Sopenharmony_ci	vmovdqu		16*2($inp),$T0
8888c2ecf20Sopenharmony_ci	vmovdqu		16*3($inp),$T1
8898c2ecf20Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
8908c2ecf20Sopenharmony_ci
8918c2ecf20Sopenharmony_ci	vpsrldq		\$6,$T0,$T2		# splat input
8928c2ecf20Sopenharmony_ci	vpsrldq		\$6,$T1,$T3
8938c2ecf20Sopenharmony_ci	vpunpckhqdq	$T1,$T0,$T4		# 4
8948c2ecf20Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
8958c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T3		# 2:3
8968c2ecf20Sopenharmony_ci
8978c2ecf20Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
8988c2ecf20Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
8998c2ecf20Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
9008c2ecf20Sopenharmony_ci	vpsrlq		\$4,$T3,$T2
9018c2ecf20Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
9028c2ecf20Sopenharmony_ci	vpsrlq		\$30,$T3,$T3
9038c2ecf20Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
9048c2ecf20Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
9058c2ecf20Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
9068c2ecf20Sopenharmony_ci
9078c2ecf20Sopenharmony_ci	jbe		.Lskip_loop_avx
9088c2ecf20Sopenharmony_ci
9098c2ecf20Sopenharmony_ci	# expand and copy pre-calculated table to stack
9108c2ecf20Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),$D1
9118c2ecf20Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),$D2
9128c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
9138c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
9148c2ecf20Sopenharmony_ci	vmovdqa		$D3,-0x90(%r11)
9158c2ecf20Sopenharmony_ci	vmovdqa		$D0,0x00(%rsp)
9168c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D4
9178c2ecf20Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),$D0
9188c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
9198c2ecf20Sopenharmony_ci	vmovdqa		$D4,-0x80(%r11)
9208c2ecf20Sopenharmony_ci	vmovdqa		$D1,0x10(%rsp)
9218c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D3
9228c2ecf20Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),$D1
9238c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
9248c2ecf20Sopenharmony_ci	vmovdqa		$D3,-0x70(%r11)
9258c2ecf20Sopenharmony_ci	vmovdqa		$D2,0x20(%rsp)
9268c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D0,$D4
9278c2ecf20Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),$D2
9288c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D0,$D0
9298c2ecf20Sopenharmony_ci	vmovdqa		$D4,-0x60(%r11)
9308c2ecf20Sopenharmony_ci	vmovdqa		$D0,0x30(%rsp)
9318c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D3
9328c2ecf20Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),$D0
9338c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
9348c2ecf20Sopenharmony_ci	vmovdqa		$D3,-0x50(%r11)
9358c2ecf20Sopenharmony_ci	vmovdqa		$D1,0x40(%rsp)
9368c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D4
9378c2ecf20Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),$D1
9388c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
9398c2ecf20Sopenharmony_ci	vmovdqa		$D4,-0x40(%r11)
9408c2ecf20Sopenharmony_ci	vmovdqa		$D2,0x50(%rsp)
9418c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D0,$D3
9428c2ecf20Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),$D2
9438c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D0,$D0
9448c2ecf20Sopenharmony_ci	vmovdqa		$D3,-0x30(%r11)
9458c2ecf20Sopenharmony_ci	vmovdqa		$D0,0x60(%rsp)
9468c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D4
9478c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
9488c2ecf20Sopenharmony_ci	vmovdqa		$D4,-0x20(%r11)
9498c2ecf20Sopenharmony_ci	vmovdqa		$D1,0x70(%rsp)
9508c2ecf20Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D3
9518c2ecf20Sopenharmony_ci	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
9528c2ecf20Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
9538c2ecf20Sopenharmony_ci	vmovdqa		$D3,-0x10(%r11)
9548c2ecf20Sopenharmony_ci	vmovdqa		$D2,0x80(%rsp)
9558c2ecf20Sopenharmony_ci
9568c2ecf20Sopenharmony_ci	jmp		.Loop_avx
9578c2ecf20Sopenharmony_ci
9588c2ecf20Sopenharmony_ci.align	32
9598c2ecf20Sopenharmony_ci.Loop_avx:
9608c2ecf20Sopenharmony_ci	################################################################
9618c2ecf20Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
9628c2ecf20Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
9638c2ecf20Sopenharmony_ci	#   \___________________/
9648c2ecf20Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
9658c2ecf20Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
9668c2ecf20Sopenharmony_ci	#   \___________________/ \____________________/
9678c2ecf20Sopenharmony_ci	#
9688c2ecf20Sopenharmony_ci	# Note that we start with inp[2:3]*r^2. This is because it
9698c2ecf20Sopenharmony_ci	# doesn't depend on reduction in previous iteration.
9708c2ecf20Sopenharmony_ci	################################################################
9718c2ecf20Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
9728c2ecf20Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
9738c2ecf20Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
9748c2ecf20Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
9758c2ecf20Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
9768c2ecf20Sopenharmony_ci	#
9778c2ecf20Sopenharmony_ci	# though note that $Tx and $Hx are "reversed" in this section,
9788c2ecf20Sopenharmony_ci	# and $D4 is preloaded with r0^2...
9798c2ecf20Sopenharmony_ci
9808c2ecf20Sopenharmony_ci	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
9818c2ecf20Sopenharmony_ci	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
9828c2ecf20Sopenharmony_ci	  vmovdqa	$H2,0x20(%r11)				# offload hash
9838c2ecf20Sopenharmony_ci	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
9848c2ecf20Sopenharmony_ci	 vmovdqa	0x10(%rsp),$H2		# r1^2
9858c2ecf20Sopenharmony_ci	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
9868c2ecf20Sopenharmony_ci	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
9878c2ecf20Sopenharmony_ci
9888c2ecf20Sopenharmony_ci	  vmovdqa	$H0,0x00(%r11)				#
9898c2ecf20Sopenharmony_ci	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
9908c2ecf20Sopenharmony_ci	  vmovdqa	$H1,0x10(%r11)				#
9918c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H2,$H1		# h3*r1
9928c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
9938c2ecf20Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
9948c2ecf20Sopenharmony_ci	  vmovdqa	$H3,0x30(%r11)				#
9958c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H2,$H0		# h2*r1
9968c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H2,$H1		# h1*r1
9978c2ecf20Sopenharmony_ci	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
9988c2ecf20Sopenharmony_ci	 vmovdqa	0x30(%rsp),$H3		# r2^2
9998c2ecf20Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
10008c2ecf20Sopenharmony_ci	  vmovdqa	$H4,0x40(%r11)				#
10018c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r1
10028c2ecf20Sopenharmony_ci	 vpmuludq	$T2,$H3,$H0		# h2*r2
10038c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
10048c2ecf20Sopenharmony_ci
10058c2ecf20Sopenharmony_ci	 vmovdqa	0x40(%rsp),$H4		# s2^2
10068c2ecf20Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
10078c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H3,$H1		# h1*r2
10088c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H3,$H3		# h0*r2
10098c2ecf20Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
10108c2ecf20Sopenharmony_ci	 vmovdqa	0x50(%rsp),$H2		# r3^2
10118c2ecf20Sopenharmony_ci	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
10128c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H4,$H0		# h4*s2
10138c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H4,$H4		# h3*s2
10148c2ecf20Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
10158c2ecf20Sopenharmony_ci	 vmovdqa	0x60(%rsp),$H3		# s3^2
10168c2ecf20Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
10178c2ecf20Sopenharmony_ci
10188c2ecf20Sopenharmony_ci	 vmovdqa	0x80(%rsp),$H4		# s4^2
10198c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H2,$H1		# h1*r3
10208c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r3
10218c2ecf20Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
10228c2ecf20Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
10238c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H3,$H0		# h4*s3
10248c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H3,$H1		# h3*s3
10258c2ecf20Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
10268c2ecf20Sopenharmony_ci	 vmovdqu	16*0($inp),$H0				# load input
10278c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
10288c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H3,$H3		# h2*s3
10298c2ecf20Sopenharmony_ci	 vpmuludq	$T2,$H4,$T2		# h2*s4
10308c2ecf20Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
10318c2ecf20Sopenharmony_ci
10328c2ecf20Sopenharmony_ci	 vmovdqu	16*1($inp),$H1				#
10338c2ecf20Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
10348c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H4,$T3		# h3*s4
10358c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H4,$T4		# h4*s4
10368c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$H0,$H2				# splat input
10378c2ecf20Sopenharmony_ci	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
10388c2ecf20Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
10398c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$H1,$H3				#
10408c2ecf20Sopenharmony_ci	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
10418c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H4,$T0		# h1*s4
10428c2ecf20Sopenharmony_ci	 vpunpckhqdq	$H1,$H0,$H4		# 4
10438c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
10448c2ecf20Sopenharmony_ci	 vmovdqa	-0x90(%r11),$T4		# r0^4
10458c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
10468c2ecf20Sopenharmony_ci
10478c2ecf20Sopenharmony_ci	vpunpcklqdq	$H1,$H0,$H0		# 0:1
10488c2ecf20Sopenharmony_ci	vpunpcklqdq	$H3,$H2,$H3		# 2:3
10498c2ecf20Sopenharmony_ci
10508c2ecf20Sopenharmony_ci	#vpsrlq		\$40,$H4,$H4		# 4
10518c2ecf20Sopenharmony_ci	vpsrldq		\$`40/8`,$H4,$H4	# 4
10528c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$H1
10538c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0		# 0
10548c2ecf20Sopenharmony_ci	vpsrlq		\$4,$H3,$H2
10558c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1		# 1
10568c2ecf20Sopenharmony_ci	vpand		0(%rcx),$H4,$H4		# .Lmask24
10578c2ecf20Sopenharmony_ci	vpsrlq		\$30,$H3,$H3
10588c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2		# 2
10598c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3		# 3
10608c2ecf20Sopenharmony_ci	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
10618c2ecf20Sopenharmony_ci
10628c2ecf20Sopenharmony_ci	vpaddq		0x00(%r11),$H0,$H0	# add hash value
10638c2ecf20Sopenharmony_ci	vpaddq		0x10(%r11),$H1,$H1
10648c2ecf20Sopenharmony_ci	vpaddq		0x20(%r11),$H2,$H2
10658c2ecf20Sopenharmony_ci	vpaddq		0x30(%r11),$H3,$H3
10668c2ecf20Sopenharmony_ci	vpaddq		0x40(%r11),$H4,$H4
10678c2ecf20Sopenharmony_ci
10688c2ecf20Sopenharmony_ci	lea		16*2($inp),%rax
10698c2ecf20Sopenharmony_ci	lea		16*4($inp),$inp
10708c2ecf20Sopenharmony_ci	sub		\$64,$len
10718c2ecf20Sopenharmony_ci	cmovc		%rax,$inp
10728c2ecf20Sopenharmony_ci
10738c2ecf20Sopenharmony_ci	################################################################
10748c2ecf20Sopenharmony_ci	# Now we accumulate (inp[0:1]+hash)*r^4
10758c2ecf20Sopenharmony_ci	################################################################
10768c2ecf20Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
10778c2ecf20Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
10788c2ecf20Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
10798c2ecf20Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
10808c2ecf20Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
10818c2ecf20Sopenharmony_ci
10828c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T4,$T0		# h0*r0
10838c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T4,$T1		# h1*r0
10848c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0,$D0
10858c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1
10868c2ecf20Sopenharmony_ci	 vmovdqa	-0x80(%r11),$T2		# r1^4
10878c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T4,$T0		# h2*r0
10888c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T4,$T1		# h3*r0
10898c2ecf20Sopenharmony_ci	vpaddq		$T0,$D2,$D2
10908c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3
10918c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T4,$T4		# h4*r0
10928c2ecf20Sopenharmony_ci	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
10938c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4
10948c2ecf20Sopenharmony_ci
10958c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
10968c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T2,$T1		# h2*r1
10978c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T2,$T0		# h3*r1
10988c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
10998c2ecf20Sopenharmony_ci	 vmovdqa	-0x60(%r11),$T3		# r2^4
11008c2ecf20Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
11018c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T2,$T1		# h1*r1
11028c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r1
11038c2ecf20Sopenharmony_ci	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
11048c2ecf20Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
11058c2ecf20Sopenharmony_ci
11068c2ecf20Sopenharmony_ci	 vmovdqa	-0x50(%r11),$T4		# s2^4
11078c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T3,$T0		# h2*r2
11088c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T3,$T1		# h1*r2
11098c2ecf20Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
11108c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
11118c2ecf20Sopenharmony_ci	 vmovdqa	-0x40(%r11),$T2		# r3^4
11128c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T3,$T3		# h0*r2
11138c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T4,$T0		# h4*s2
11148c2ecf20Sopenharmony_ci	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
11158c2ecf20Sopenharmony_ci	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
11168c2ecf20Sopenharmony_ci	 vmovdqa	-0x30(%r11),$T3		# s3^4
11178c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T4,$T4		# h3*s2
11188c2ecf20Sopenharmony_ci	 vpmuludq	$H1,$T2,$T1		# h1*r3
11198c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
11208c2ecf20Sopenharmony_ci
11218c2ecf20Sopenharmony_ci	 vmovdqa	-0x10(%r11),$T4		# s4^4
11228c2ecf20Sopenharmony_ci	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
11238c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r3
11248c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T3,$T0		# h4*s3
11258c2ecf20Sopenharmony_ci	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
11268c2ecf20Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
11278c2ecf20Sopenharmony_ci	 vmovdqu	16*2($inp),$T0				# load input
11288c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T3,$T2		# h3*s3
11298c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T3,$T3		# h2*s3
11308c2ecf20Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
11318c2ecf20Sopenharmony_ci	 vmovdqu	16*3($inp),$T1				#
11328c2ecf20Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
11338c2ecf20Sopenharmony_ci
11348c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T4,$H2		# h2*s4
11358c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T4,$H3		# h3*s4
11368c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2				# splat input
11378c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
11388c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T4,$H4		# h4*s4
11398c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3				#
11408c2ecf20Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
11418c2ecf20Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
11428c2ecf20Sopenharmony_ci	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
11438c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T4,$H0		# h1*s4
11448c2ecf20Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
11458c2ecf20Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
11468c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
11478c2ecf20Sopenharmony_ci
11488c2ecf20Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
11498c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T3		# 2:3
11508c2ecf20Sopenharmony_ci
11518c2ecf20Sopenharmony_ci	#vpsrlq		\$40,$T4,$T4		# 4
11528c2ecf20Sopenharmony_ci	vpsrldq		\$`40/8`,$T4,$T4	# 4
11538c2ecf20Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
11548c2ecf20Sopenharmony_ci	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
11558c2ecf20Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
11568c2ecf20Sopenharmony_ci	vpsrlq		\$4,$T3,$T2
11578c2ecf20Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
11588c2ecf20Sopenharmony_ci	vpand		0(%rcx),$T4,$T4		# .Lmask24
11598c2ecf20Sopenharmony_ci	vpsrlq		\$30,$T3,$T3
11608c2ecf20Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
11618c2ecf20Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
11628c2ecf20Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
11638c2ecf20Sopenharmony_ci
11648c2ecf20Sopenharmony_ci	################################################################
11658c2ecf20Sopenharmony_ci	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
11668c2ecf20Sopenharmony_ci	# and P. Schwabe
11678c2ecf20Sopenharmony_ci
11688c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
11698c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
11708c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
11718c2ecf20Sopenharmony_ci
11728c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
11738c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
11748c2ecf20Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
11758c2ecf20Sopenharmony_ci
11768c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H4,$D0
11778c2ecf20Sopenharmony_ci	vpand		$MASK,$H4,$H4
11788c2ecf20Sopenharmony_ci
11798c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
11808c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1
11818c2ecf20Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
11828c2ecf20Sopenharmony_ci
11838c2ecf20Sopenharmony_ci	vpaddq		$D0,$H0,$H0
11848c2ecf20Sopenharmony_ci	vpsllq		\$2,$D0,$D0
11858c2ecf20Sopenharmony_ci	vpaddq		$D0,$H0,$H0		# h4 -> h0
11868c2ecf20Sopenharmony_ci
11878c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
11888c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2
11898c2ecf20Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
11928c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
11938c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
11948c2ecf20Sopenharmony_ci
11958c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
11968c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
11978c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
11988c2ecf20Sopenharmony_ci
11998c2ecf20Sopenharmony_ci	ja		.Loop_avx
12008c2ecf20Sopenharmony_ci
12018c2ecf20Sopenharmony_ci.Lskip_loop_avx:
12028c2ecf20Sopenharmony_ci	################################################################
12038c2ecf20Sopenharmony_ci	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
12048c2ecf20Sopenharmony_ci
12058c2ecf20Sopenharmony_ci	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
12068c2ecf20Sopenharmony_ci	add		\$32,$len
12078c2ecf20Sopenharmony_ci	jnz		.Long_tail_avx
12088c2ecf20Sopenharmony_ci
12098c2ecf20Sopenharmony_ci	vpaddq		$H2,$T2,$T2
12108c2ecf20Sopenharmony_ci	vpaddq		$H0,$T0,$T0
12118c2ecf20Sopenharmony_ci	vpaddq		$H1,$T1,$T1
12128c2ecf20Sopenharmony_ci	vpaddq		$H3,$T3,$T3
12138c2ecf20Sopenharmony_ci	vpaddq		$H4,$T4,$T4
12148c2ecf20Sopenharmony_ci
12158c2ecf20Sopenharmony_ci.Long_tail_avx:
12168c2ecf20Sopenharmony_ci	vmovdqa		$H2,0x20(%r11)
12178c2ecf20Sopenharmony_ci	vmovdqa		$H0,0x00(%r11)
12188c2ecf20Sopenharmony_ci	vmovdqa		$H1,0x10(%r11)
12198c2ecf20Sopenharmony_ci	vmovdqa		$H3,0x30(%r11)
12208c2ecf20Sopenharmony_ci	vmovdqa		$H4,0x40(%r11)
12218c2ecf20Sopenharmony_ci
12228c2ecf20Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
12238c2ecf20Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
12248c2ecf20Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
12258c2ecf20Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
12268c2ecf20Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
12278c2ecf20Sopenharmony_ci
12288c2ecf20Sopenharmony_ci	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
12298c2ecf20Sopenharmony_ci	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
12308c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
12318c2ecf20Sopenharmony_ci	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
12328c2ecf20Sopenharmony_ci	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
12338c2ecf20Sopenharmony_ci	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
12348c2ecf20Sopenharmony_ci
12358c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H2,$H0		# h3*r1
12368c2ecf20Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
12378c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
12388c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H2,$H1		# h2*r1
12398c2ecf20Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
12408c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
12418c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H2,$H0		# h1*r1
12428c2ecf20Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
12438c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r1
12448c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
12458c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H3,$H3		# h4*s1
12468c2ecf20Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
12478c2ecf20Sopenharmony_ci
12488c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
12498c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H4,$H1		# h2*r2
12508c2ecf20Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
12518c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H4,$H0		# h1*r2
12528c2ecf20Sopenharmony_ci	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
12538c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
12548c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H4,$H4		# h0*r2
12558c2ecf20Sopenharmony_ci	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
12568c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H2,$H1		# h4*s2
12578c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
12588c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
12598c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H2,$H2		# h3*s2
12608c2ecf20Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
12618c2ecf20Sopenharmony_ci
12628c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H3,$H0		# h1*r3
12638c2ecf20Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
12648c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H3,$H3		# h0*r3
12658c2ecf20Sopenharmony_ci	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
12668c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
12678c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H4,$H1		# h4*s3
12688c2ecf20Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
12698c2ecf20Sopenharmony_ci	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
12708c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H4,$H0		# h3*s3
12718c2ecf20Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
12728c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H4,$H4		# h2*s3
12738c2ecf20Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
12748c2ecf20Sopenharmony_ci
12758c2ecf20Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r4
12768c2ecf20Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
12778c2ecf20Sopenharmony_ci	vpmuludq	$T4,$H3,$H1		# h4*s4
12788c2ecf20Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
12798c2ecf20Sopenharmony_ci	vpmuludq	$T3,$H3,$H0		# h3*s4
12808c2ecf20Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
12818c2ecf20Sopenharmony_ci	vpmuludq	$T2,$H3,$H1		# h2*s4
12828c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
12838c2ecf20Sopenharmony_ci	vpmuludq	$T1,$H3,$H3		# h1*s4
12848c2ecf20Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
12858c2ecf20Sopenharmony_ci
12868c2ecf20Sopenharmony_ci	jz		.Lshort_tail_avx
12878c2ecf20Sopenharmony_ci
12888c2ecf20Sopenharmony_ci	vmovdqu		16*0($inp),$H0		# load input
12898c2ecf20Sopenharmony_ci	vmovdqu		16*1($inp),$H1
12908c2ecf20Sopenharmony_ci
12918c2ecf20Sopenharmony_ci	vpsrldq		\$6,$H0,$H2		# splat input
12928c2ecf20Sopenharmony_ci	vpsrldq		\$6,$H1,$H3
12938c2ecf20Sopenharmony_ci	vpunpckhqdq	$H1,$H0,$H4		# 4
12948c2ecf20Sopenharmony_ci	vpunpcklqdq	$H1,$H0,$H0		# 0:1
12958c2ecf20Sopenharmony_ci	vpunpcklqdq	$H3,$H2,$H3		# 2:3
12968c2ecf20Sopenharmony_ci
12978c2ecf20Sopenharmony_ci	vpsrlq		\$40,$H4,$H4		# 4
12988c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$H1
12998c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0		# 0
13008c2ecf20Sopenharmony_ci	vpsrlq		\$4,$H3,$H2
13018c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1		# 1
13028c2ecf20Sopenharmony_ci	vpsrlq		\$30,$H3,$H3
13038c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2		# 2
13048c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3		# 3
13058c2ecf20Sopenharmony_ci	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
13068c2ecf20Sopenharmony_ci
13078c2ecf20Sopenharmony_ci	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
13088c2ecf20Sopenharmony_ci	vpaddq		0x00(%r11),$H0,$H0
13098c2ecf20Sopenharmony_ci	vpaddq		0x10(%r11),$H1,$H1
13108c2ecf20Sopenharmony_ci	vpaddq		0x20(%r11),$H2,$H2
13118c2ecf20Sopenharmony_ci	vpaddq		0x30(%r11),$H3,$H3
13128c2ecf20Sopenharmony_ci	vpaddq		0x40(%r11),$H4,$H4
13138c2ecf20Sopenharmony_ci
13148c2ecf20Sopenharmony_ci	################################################################
13158c2ecf20Sopenharmony_ci	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
13168c2ecf20Sopenharmony_ci
13178c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T4,$T0		# h0*r0
13188c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
13198c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T4,$T1		# h1*r0
13208c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
13218c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T4,$T0		# h2*r0
13228c2ecf20Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
13238c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
13248c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T4,$T1		# h3*r0
13258c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
13268c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T4,$T4		# h4*r0
13278c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
13288c2ecf20Sopenharmony_ci
13298c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T2,$T0		# h3*r1
13308c2ecf20Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
13318c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
13328c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T2,$T1		# h2*r1
13338c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
13348c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
13358c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T2,$T0		# h1*r1
13368c2ecf20Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
13378c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r1
13388c2ecf20Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
13398c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T3,$T3		# h4*s1
13408c2ecf20Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
13418c2ecf20Sopenharmony_ci
13428c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
13438c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T4,$T1		# h2*r2
13448c2ecf20Sopenharmony_ci	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
13458c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T4,$T0		# h1*r2
13468c2ecf20Sopenharmony_ci	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
13478c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
13488c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T4,$T4		# h0*r2
13498c2ecf20Sopenharmony_ci	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
13508c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T2,$T1		# h4*s2
13518c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
13528c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
13538c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T2,$T2		# h3*s2
13548c2ecf20Sopenharmony_ci	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
13558c2ecf20Sopenharmony_ci
13568c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T3,$T0		# h1*r3
13578c2ecf20Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
13588c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T3,$T3		# h0*r3
13598c2ecf20Sopenharmony_ci	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
13608c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
13618c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T4,$T1		# h4*s3
13628c2ecf20Sopenharmony_ci	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
13638c2ecf20Sopenharmony_ci	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
13648c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T4,$T0		# h3*s3
13658c2ecf20Sopenharmony_ci	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
13668c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T4,$T4		# h2*s3
13678c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
13688c2ecf20Sopenharmony_ci
13698c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r4
13708c2ecf20Sopenharmony_ci	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
13718c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T3,$T1		# h4*s4
13728c2ecf20Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
13738c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T3,$T0		# h3*s4
13748c2ecf20Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
13758c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T3,$T1		# h2*s4
13768c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
13778c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T3,$T3		# h1*s4
13788c2ecf20Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
13798c2ecf20Sopenharmony_ci
13808c2ecf20Sopenharmony_ci.Lshort_tail_avx:
13818c2ecf20Sopenharmony_ci	################################################################
13828c2ecf20Sopenharmony_ci	# horizontal addition
13838c2ecf20Sopenharmony_ci
13848c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D4,$T4
13858c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D3,$T3
13868c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1,$T1
13878c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D0,$T0
13888c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D2,$T2
13898c2ecf20Sopenharmony_ci	vpaddq		$T3,$D3,$D3
13908c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4
13918c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0,$D0
13928c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1
13938c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2,$D2
13948c2ecf20Sopenharmony_ci
13958c2ecf20Sopenharmony_ci	################################################################
13968c2ecf20Sopenharmony_ci	# lazy reduction
13978c2ecf20Sopenharmony_ci
13988c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
13998c2ecf20Sopenharmony_ci	vpand		$MASK,$D3,$D3
14008c2ecf20Sopenharmony_ci	vpaddq		$H3,$D4,$D4		# h3 -> h4
14018c2ecf20Sopenharmony_ci
14028c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D0,$H0
14038c2ecf20Sopenharmony_ci	vpand		$MASK,$D0,$D0
14048c2ecf20Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# h0 -> h1
14058c2ecf20Sopenharmony_ci
14068c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D4,$H4
14078c2ecf20Sopenharmony_ci	vpand		$MASK,$D4,$D4
14088c2ecf20Sopenharmony_ci
14098c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D1,$H1
14108c2ecf20Sopenharmony_ci	vpand		$MASK,$D1,$D1
14118c2ecf20Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# h1 -> h2
14128c2ecf20Sopenharmony_ci
14138c2ecf20Sopenharmony_ci	vpaddq		$H4,$D0,$D0
14148c2ecf20Sopenharmony_ci	vpsllq		\$2,$H4,$H4
14158c2ecf20Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# h4 -> h0
14168c2ecf20Sopenharmony_ci
14178c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D2,$H2
14188c2ecf20Sopenharmony_ci	vpand		$MASK,$D2,$D2
14198c2ecf20Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# h2 -> h3
14208c2ecf20Sopenharmony_ci
14218c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D0,$H0
14228c2ecf20Sopenharmony_ci	vpand		$MASK,$D0,$D0
14238c2ecf20Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# h0 -> h1
14248c2ecf20Sopenharmony_ci
14258c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
14268c2ecf20Sopenharmony_ci	vpand		$MASK,$D3,$D3
14278c2ecf20Sopenharmony_ci	vpaddq		$H3,$D4,$D4		# h3 -> h4
14288c2ecf20Sopenharmony_ci
14298c2ecf20Sopenharmony_ci	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
14308c2ecf20Sopenharmony_ci	vmovd		$D1,`4*1-48-64`($ctx)
14318c2ecf20Sopenharmony_ci	vmovd		$D2,`4*2-48-64`($ctx)
14328c2ecf20Sopenharmony_ci	vmovd		$D3,`4*3-48-64`($ctx)
14338c2ecf20Sopenharmony_ci	vmovd		$D4,`4*4-48-64`($ctx)
14348c2ecf20Sopenharmony_ci___
14358c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
14368c2ecf20Sopenharmony_ci	vmovdqa		0x50(%r11),%xmm6
14378c2ecf20Sopenharmony_ci	vmovdqa		0x60(%r11),%xmm7
14388c2ecf20Sopenharmony_ci	vmovdqa		0x70(%r11),%xmm8
14398c2ecf20Sopenharmony_ci	vmovdqa		0x80(%r11),%xmm9
14408c2ecf20Sopenharmony_ci	vmovdqa		0x90(%r11),%xmm10
14418c2ecf20Sopenharmony_ci	vmovdqa		0xa0(%r11),%xmm11
14428c2ecf20Sopenharmony_ci	vmovdqa		0xb0(%r11),%xmm12
14438c2ecf20Sopenharmony_ci	vmovdqa		0xc0(%r11),%xmm13
14448c2ecf20Sopenharmony_ci	vmovdqa		0xd0(%r11),%xmm14
14458c2ecf20Sopenharmony_ci	vmovdqa		0xe0(%r11),%xmm15
14468c2ecf20Sopenharmony_ci	lea		0xf8(%r11),%rsp
14478c2ecf20Sopenharmony_ci.Ldo_avx_epilogue:
14488c2ecf20Sopenharmony_ci___
14498c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
14508c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp
14518c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%rsp
14528c2ecf20Sopenharmony_ci___
14538c2ecf20Sopenharmony_ci$code.=<<___;
14548c2ecf20Sopenharmony_ci	vzeroupper
14558c2ecf20Sopenharmony_ci	RET
14568c2ecf20Sopenharmony_ci.cfi_endproc
14578c2ecf20Sopenharmony_ci___
14588c2ecf20Sopenharmony_ci&end_function("poly1305_blocks_avx");
14598c2ecf20Sopenharmony_ci
# poly1305_emit_avx -- finalize the accumulated hash and store the 16-byte tag.
#
# Operands ($ctx, $nonce, $mac are Perl variables bound to registers
# elsewhere in this file):
#   $ctx   - hash state.  The 32-bit word at offset 20 is the is_base2_26
#            flag; when it is set, offsets 0/4/8/12/16 hold the hash as five
#            32-bit limbs in base 2^26.
#   $nonce - two 64-bit words that are added to the reduced hash.
#   $mac   - destination; two 64-bit words (16 bytes) are written.
#
# Flow of the generated code:
#   1. If is_base2_26 is zero, branch to .Lemit (defined outside this block;
#      presumably the base 2^64 emit path -- confirm).
#   2. Load the five limbs with 32-bit moves (which zero-extend on x86_64)
#      and repack them into h0:h1:h2 = %r8:%r9:%r10 using
#        h = l0 + l1<<26 + l2<<52 + l3<<78 + l4<<104,
#      propagating carries with adc.
#   3. The value may be only partially reduced, so fold the bits of h2 above
#      bit 1 back into h0: (h2 >> 2) + (h2 & -4) equals 5*(h2 >> 2), and h2
#      keeps only its low two bits.
#   4. Compute h+5 next to h and shift the resulting h2 right by 2.  A
#      non-zero result means h+5 carried out of 130 bits, i.e. h >= 2^130-5,
#      and cmovnz then selects the low 128 bits of h+5; otherwise h is kept.
#   5. Add the 128-bit nonce (add/adc; the carry out of the high word is
#      dropped) and store the two result words to $mac.
#
# Clobbers %rax, %rcx, %r8, %r9, %r10, %r11 and the flags.
#
# NOTE(review): the meaning of the trailing declare_function arguments
# (32, 3) is not visible here -- presumably alignment and argument count;
# verify against the helper's definition.
14608c2ecf20Sopenharmony_ci&declare_function("poly1305_emit_avx", 32, 3);
14618c2ecf20Sopenharmony_ci$code.=<<___;
14628c2ecf20Sopenharmony_ci	cmpl	\$0,20($ctx)	# is_base2_26?
14638c2ecf20Sopenharmony_ci	je	.Lemit
14648c2ecf20Sopenharmony_ci
14658c2ecf20Sopenharmony_ci	mov	0($ctx),%eax	# load hash value base 2^26
14668c2ecf20Sopenharmony_ci	mov	4($ctx),%ecx
14678c2ecf20Sopenharmony_ci	mov	8($ctx),%r8d
14688c2ecf20Sopenharmony_ci	mov	12($ctx),%r11d
14698c2ecf20Sopenharmony_ci	mov	16($ctx),%r10d
14708c2ecf20Sopenharmony_ci
14718c2ecf20Sopenharmony_ci	shl	\$26,%rcx	# base 2^26 -> base 2^64
14728c2ecf20Sopenharmony_ci	mov	%r8,%r9
14738c2ecf20Sopenharmony_ci	shl	\$52,%r8
14748c2ecf20Sopenharmony_ci	add	%rcx,%rax
14758c2ecf20Sopenharmony_ci	shr	\$12,%r9
14768c2ecf20Sopenharmony_ci	add	%rax,%r8	# h0
14778c2ecf20Sopenharmony_ci	adc	\$0,%r9
14788c2ecf20Sopenharmony_ci
14798c2ecf20Sopenharmony_ci	shl	\$14,%r11
14808c2ecf20Sopenharmony_ci	mov	%r10,%rax
14818c2ecf20Sopenharmony_ci	shr	\$24,%r10
14828c2ecf20Sopenharmony_ci	add	%r11,%r9
14838c2ecf20Sopenharmony_ci	shl	\$40,%rax
14848c2ecf20Sopenharmony_ci	add	%rax,%r9	# h1
14858c2ecf20Sopenharmony_ci	adc	\$0,%r10	# h2
14868c2ecf20Sopenharmony_ci
14878c2ecf20Sopenharmony_ci	mov	%r10,%rax	# could be partially reduced, so reduce
14888c2ecf20Sopenharmony_ci	mov	%r10,%rcx
14898c2ecf20Sopenharmony_ci	and	\$3,%r10
14908c2ecf20Sopenharmony_ci	shr	\$2,%rax
14918c2ecf20Sopenharmony_ci	and	\$-4,%rcx
14928c2ecf20Sopenharmony_ci	add	%rcx,%rax
14938c2ecf20Sopenharmony_ci	add	%rax,%r8
14948c2ecf20Sopenharmony_ci	adc	\$0,%r9
14958c2ecf20Sopenharmony_ci	adc	\$0,%r10
14968c2ecf20Sopenharmony_ci
14978c2ecf20Sopenharmony_ci	mov	%r8,%rax
14988c2ecf20Sopenharmony_ci	add	\$5,%r8		# compare to modulus
14998c2ecf20Sopenharmony_ci	mov	%r9,%rcx
15008c2ecf20Sopenharmony_ci	adc	\$0,%r9
15018c2ecf20Sopenharmony_ci	adc	\$0,%r10
15028c2ecf20Sopenharmony_ci	shr	\$2,%r10	# did 130-bit value overflow?
15038c2ecf20Sopenharmony_ci	cmovnz	%r8,%rax
15048c2ecf20Sopenharmony_ci	cmovnz	%r9,%rcx
15058c2ecf20Sopenharmony_ci
15068c2ecf20Sopenharmony_ci	add	0($nonce),%rax	# accumulate nonce
15078c2ecf20Sopenharmony_ci	adc	8($nonce),%rcx
15088c2ecf20Sopenharmony_ci	mov	%rax,0($mac)	# write result
15098c2ecf20Sopenharmony_ci	mov	%rcx,8($mac)
15108c2ecf20Sopenharmony_ci
15118c2ecf20Sopenharmony_ci	RET
15128c2ecf20Sopenharmony_ci___
15138c2ecf20Sopenharmony_ci&end_function("poly1305_emit_avx");
15148c2ecf20Sopenharmony_ci
15158c2ecf20Sopenharmony_ciif ($avx>1) {
15168c2ecf20Sopenharmony_ci
15178c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
15188c2ecf20Sopenharmony_ci    map("%ymm$_",(0..15));
15198c2ecf20Sopenharmony_cimy $S4=$MASK;
15208c2ecf20Sopenharmony_ci
15218c2ecf20Sopenharmony_cisub poly1305_blocks_avxN {
15228c2ecf20Sopenharmony_ci	my ($avx512) = @_;
15238c2ecf20Sopenharmony_ci	my $suffix = $avx512 ? "_avx512" : "";
15248c2ecf20Sopenharmony_ci$code.=<<___;
15258c2ecf20Sopenharmony_ci.cfi_startproc
15268c2ecf20Sopenharmony_ci	mov	20($ctx),%r8d		# is_base2_26
15278c2ecf20Sopenharmony_ci	cmp	\$128,$len
15288c2ecf20Sopenharmony_ci	jae	.Lblocks_avx2$suffix
15298c2ecf20Sopenharmony_ci	test	%r8d,%r8d
15308c2ecf20Sopenharmony_ci	jz	.Lblocks
15318c2ecf20Sopenharmony_ci
15328c2ecf20Sopenharmony_ci.Lblocks_avx2$suffix:
15338c2ecf20Sopenharmony_ci	and	\$-16,$len
15348c2ecf20Sopenharmony_ci	jz	.Lno_data_avx2$suffix
15358c2ecf20Sopenharmony_ci
15368c2ecf20Sopenharmony_ci	vzeroupper
15378c2ecf20Sopenharmony_ci
15388c2ecf20Sopenharmony_ci	test	%r8d,%r8d
15398c2ecf20Sopenharmony_ci	jz	.Lbase2_64_avx2$suffix
15408c2ecf20Sopenharmony_ci
15418c2ecf20Sopenharmony_ci	test	\$63,$len
15428c2ecf20Sopenharmony_ci	jz	.Leven_avx2$suffix
15438c2ecf20Sopenharmony_ci
15448c2ecf20Sopenharmony_ci	push	%rbp
15458c2ecf20Sopenharmony_ci.cfi_push	%rbp
15468c2ecf20Sopenharmony_ci	mov 	%rsp,%rbp
15478c2ecf20Sopenharmony_ci	push	%rbx
15488c2ecf20Sopenharmony_ci.cfi_push	%rbx
15498c2ecf20Sopenharmony_ci	push	%r12
15508c2ecf20Sopenharmony_ci.cfi_push	%r12
15518c2ecf20Sopenharmony_ci	push	%r13
15528c2ecf20Sopenharmony_ci.cfi_push	%r13
15538c2ecf20Sopenharmony_ci	push	%r14
15548c2ecf20Sopenharmony_ci.cfi_push	%r14
15558c2ecf20Sopenharmony_ci	push	%r15
15568c2ecf20Sopenharmony_ci.cfi_push	%r15
15578c2ecf20Sopenharmony_ci.Lblocks_avx2_body$suffix:
15588c2ecf20Sopenharmony_ci
15598c2ecf20Sopenharmony_ci	mov	$len,%r15		# reassign $len
15608c2ecf20Sopenharmony_ci
15618c2ecf20Sopenharmony_ci	mov	0($ctx),$d1		# load hash value
15628c2ecf20Sopenharmony_ci	mov	8($ctx),$d2
15638c2ecf20Sopenharmony_ci	mov	16($ctx),$h2#d
15648c2ecf20Sopenharmony_ci
15658c2ecf20Sopenharmony_ci	mov	24($ctx),$r0		# load r
15668c2ecf20Sopenharmony_ci	mov	32($ctx),$s1
15678c2ecf20Sopenharmony_ci
15688c2ecf20Sopenharmony_ci	################################# base 2^26 -> base 2^64
15698c2ecf20Sopenharmony_ci	mov	$d1#d,$h0#d
15708c2ecf20Sopenharmony_ci	and	\$`-1*(1<<31)`,$d1
15718c2ecf20Sopenharmony_ci	mov	$d2,$r1			# borrow $r1
15728c2ecf20Sopenharmony_ci	mov	$d2#d,$h1#d
15738c2ecf20Sopenharmony_ci	and	\$`-1*(1<<31)`,$d2
15748c2ecf20Sopenharmony_ci
15758c2ecf20Sopenharmony_ci	shr	\$6,$d1
15768c2ecf20Sopenharmony_ci	shl	\$52,$r1
15778c2ecf20Sopenharmony_ci	add	$d1,$h0
15788c2ecf20Sopenharmony_ci	shr	\$12,$h1
15798c2ecf20Sopenharmony_ci	shr	\$18,$d2
15808c2ecf20Sopenharmony_ci	add	$r1,$h0
15818c2ecf20Sopenharmony_ci	adc	$d2,$h1
15828c2ecf20Sopenharmony_ci
15838c2ecf20Sopenharmony_ci	mov	$h2,$d1
15848c2ecf20Sopenharmony_ci	shl	\$40,$d1
15858c2ecf20Sopenharmony_ci	shr	\$24,$h2
15868c2ecf20Sopenharmony_ci	add	$d1,$h1
15878c2ecf20Sopenharmony_ci	adc	\$0,$h2			# can be partially reduced...
15888c2ecf20Sopenharmony_ci
15898c2ecf20Sopenharmony_ci	mov	\$-4,$d2		# ... so reduce
15908c2ecf20Sopenharmony_ci	mov	$h2,$d1
15918c2ecf20Sopenharmony_ci	and	$h2,$d2
15928c2ecf20Sopenharmony_ci	shr	\$2,$d1
15938c2ecf20Sopenharmony_ci	and	\$3,$h2
15948c2ecf20Sopenharmony_ci	add	$d2,$d1			# =*5
15958c2ecf20Sopenharmony_ci	add	$d1,$h0
15968c2ecf20Sopenharmony_ci	adc	\$0,$h1
15978c2ecf20Sopenharmony_ci	adc	\$0,$h2
15988c2ecf20Sopenharmony_ci
15998c2ecf20Sopenharmony_ci	mov	$s1,$r1
16008c2ecf20Sopenharmony_ci	mov	$s1,%rax
16018c2ecf20Sopenharmony_ci	shr	\$2,$s1
16028c2ecf20Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
16038c2ecf20Sopenharmony_ci
16048c2ecf20Sopenharmony_ci.Lbase2_26_pre_avx2$suffix:
16058c2ecf20Sopenharmony_ci	add	0($inp),$h0		# accumulate input
16068c2ecf20Sopenharmony_ci	adc	8($inp),$h1
16078c2ecf20Sopenharmony_ci	lea	16($inp),$inp
16088c2ecf20Sopenharmony_ci	adc	$padbit,$h2
16098c2ecf20Sopenharmony_ci	sub	\$16,%r15
16108c2ecf20Sopenharmony_ci
16118c2ecf20Sopenharmony_ci	call	__poly1305_block
16128c2ecf20Sopenharmony_ci	mov	$r1,%rax
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_ci	test	\$63,%r15
16158c2ecf20Sopenharmony_ci	jnz	.Lbase2_26_pre_avx2$suffix
16168c2ecf20Sopenharmony_ci
16178c2ecf20Sopenharmony_ci	test	$padbit,$padbit		# if $padbit is zero,
16188c2ecf20Sopenharmony_ci	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
16198c2ecf20Sopenharmony_ci
16208c2ecf20Sopenharmony_ci	################################# base 2^64 -> base 2^26
16218c2ecf20Sopenharmony_ci	mov	$h0,%rax
16228c2ecf20Sopenharmony_ci	mov	$h0,%rdx
16238c2ecf20Sopenharmony_ci	shr	\$52,$h0
16248c2ecf20Sopenharmony_ci	mov	$h1,$r0
16258c2ecf20Sopenharmony_ci	mov	$h1,$r1
16268c2ecf20Sopenharmony_ci	shr	\$26,%rdx
16278c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
16288c2ecf20Sopenharmony_ci	shl	\$12,$r0
16298c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
16308c2ecf20Sopenharmony_ci	shr	\$14,$h1
16318c2ecf20Sopenharmony_ci	or	$r0,$h0
16328c2ecf20Sopenharmony_ci	shl	\$24,$h2
16338c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
16348c2ecf20Sopenharmony_ci	shr	\$40,$r1
16358c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
16368c2ecf20Sopenharmony_ci	or	$r1,$h2			# h[4]
16378c2ecf20Sopenharmony_ci
16388c2ecf20Sopenharmony_ci	test	%r15,%r15
16398c2ecf20Sopenharmony_ci	jz	.Lstore_base2_26_avx2$suffix
16408c2ecf20Sopenharmony_ci
16418c2ecf20Sopenharmony_ci	vmovd	%rax#d,%x#$H0
16428c2ecf20Sopenharmony_ci	vmovd	%rdx#d,%x#$H1
16438c2ecf20Sopenharmony_ci	vmovd	$h0#d,%x#$H2
16448c2ecf20Sopenharmony_ci	vmovd	$h1#d,%x#$H3
16458c2ecf20Sopenharmony_ci	vmovd	$h2#d,%x#$H4
16468c2ecf20Sopenharmony_ci	jmp	.Lproceed_avx2$suffix
16478c2ecf20Sopenharmony_ci
16488c2ecf20Sopenharmony_ci.align	32
16498c2ecf20Sopenharmony_ci.Lstore_base2_64_avx2$suffix:
16508c2ecf20Sopenharmony_ci	mov	$h0,0($ctx)
16518c2ecf20Sopenharmony_ci	mov	$h1,8($ctx)
16528c2ecf20Sopenharmony_ci	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
16538c2ecf20Sopenharmony_ci	jmp	.Ldone_avx2$suffix
16548c2ecf20Sopenharmony_ci
16558c2ecf20Sopenharmony_ci.align	16
16568c2ecf20Sopenharmony_ci.Lstore_base2_26_avx2$suffix:
16578c2ecf20Sopenharmony_ci	mov	%rax#d,0($ctx)		# store hash value base 2^26
16588c2ecf20Sopenharmony_ci	mov	%rdx#d,4($ctx)
16598c2ecf20Sopenharmony_ci	mov	$h0#d,8($ctx)
16608c2ecf20Sopenharmony_ci	mov	$h1#d,12($ctx)
16618c2ecf20Sopenharmony_ci	mov	$h2#d,16($ctx)
16628c2ecf20Sopenharmony_ci.align	16
16638c2ecf20Sopenharmony_ci.Ldone_avx2$suffix:
16648c2ecf20Sopenharmony_ci	pop 		%r15
16658c2ecf20Sopenharmony_ci.cfi_restore	%r15
16668c2ecf20Sopenharmony_ci	pop 		%r14
16678c2ecf20Sopenharmony_ci.cfi_restore	%r14
16688c2ecf20Sopenharmony_ci	pop 		%r13
16698c2ecf20Sopenharmony_ci.cfi_restore	%r13
16708c2ecf20Sopenharmony_ci	pop 		%r12
16718c2ecf20Sopenharmony_ci.cfi_restore	%r12
16728c2ecf20Sopenharmony_ci	pop 		%rbx
16738c2ecf20Sopenharmony_ci.cfi_restore	%rbx
16748c2ecf20Sopenharmony_ci	pop 		%rbp
16758c2ecf20Sopenharmony_ci.cfi_restore 	%rbp
16768c2ecf20Sopenharmony_ci.Lno_data_avx2$suffix:
16778c2ecf20Sopenharmony_ci.Lblocks_avx2_epilogue$suffix:
16788c2ecf20Sopenharmony_ci	RET
16798c2ecf20Sopenharmony_ci.cfi_endproc
16808c2ecf20Sopenharmony_ci
16818c2ecf20Sopenharmony_ci.align	32
16828c2ecf20Sopenharmony_ci.Lbase2_64_avx2$suffix:
16838c2ecf20Sopenharmony_ci.cfi_startproc
16848c2ecf20Sopenharmony_ci	push	%rbp
16858c2ecf20Sopenharmony_ci.cfi_push	%rbp
16868c2ecf20Sopenharmony_ci	mov 	%rsp,%rbp
16878c2ecf20Sopenharmony_ci	push	%rbx
16888c2ecf20Sopenharmony_ci.cfi_push	%rbx
16898c2ecf20Sopenharmony_ci	push	%r12
16908c2ecf20Sopenharmony_ci.cfi_push	%r12
16918c2ecf20Sopenharmony_ci	push	%r13
16928c2ecf20Sopenharmony_ci.cfi_push	%r13
16938c2ecf20Sopenharmony_ci	push	%r14
16948c2ecf20Sopenharmony_ci.cfi_push	%r14
16958c2ecf20Sopenharmony_ci	push	%r15
16968c2ecf20Sopenharmony_ci.cfi_push	%r15
16978c2ecf20Sopenharmony_ci.Lbase2_64_avx2_body$suffix:
16988c2ecf20Sopenharmony_ci
16998c2ecf20Sopenharmony_ci	mov	$len,%r15		# reassign $len
17008c2ecf20Sopenharmony_ci
17018c2ecf20Sopenharmony_ci	mov	24($ctx),$r0		# load r
17028c2ecf20Sopenharmony_ci	mov	32($ctx),$s1
17038c2ecf20Sopenharmony_ci
17048c2ecf20Sopenharmony_ci	mov	0($ctx),$h0		# load hash value
17058c2ecf20Sopenharmony_ci	mov	8($ctx),$h1
17068c2ecf20Sopenharmony_ci	mov	16($ctx),$h2#d
17078c2ecf20Sopenharmony_ci
17088c2ecf20Sopenharmony_ci	mov	$s1,$r1
17098c2ecf20Sopenharmony_ci	mov	$s1,%rax
17108c2ecf20Sopenharmony_ci	shr	\$2,$s1
17118c2ecf20Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
17128c2ecf20Sopenharmony_ci
17138c2ecf20Sopenharmony_ci	test	\$63,$len
17148c2ecf20Sopenharmony_ci	jz	.Linit_avx2$suffix
17158c2ecf20Sopenharmony_ci
17168c2ecf20Sopenharmony_ci.Lbase2_64_pre_avx2$suffix:
17178c2ecf20Sopenharmony_ci	add	0($inp),$h0		# accumulate input
17188c2ecf20Sopenharmony_ci	adc	8($inp),$h1
17198c2ecf20Sopenharmony_ci	lea	16($inp),$inp
17208c2ecf20Sopenharmony_ci	adc	$padbit,$h2
17218c2ecf20Sopenharmony_ci	sub	\$16,%r15
17228c2ecf20Sopenharmony_ci
17238c2ecf20Sopenharmony_ci	call	__poly1305_block
17248c2ecf20Sopenharmony_ci	mov	$r1,%rax
17258c2ecf20Sopenharmony_ci
17268c2ecf20Sopenharmony_ci	test	\$63,%r15
17278c2ecf20Sopenharmony_ci	jnz	.Lbase2_64_pre_avx2$suffix
17288c2ecf20Sopenharmony_ci
17298c2ecf20Sopenharmony_ci.Linit_avx2$suffix:
17308c2ecf20Sopenharmony_ci	################################# base 2^64 -> base 2^26
17318c2ecf20Sopenharmony_ci	mov	$h0,%rax
17328c2ecf20Sopenharmony_ci	mov	$h0,%rdx
17338c2ecf20Sopenharmony_ci	shr	\$52,$h0
17348c2ecf20Sopenharmony_ci	mov	$h1,$d1
17358c2ecf20Sopenharmony_ci	mov	$h1,$d2
17368c2ecf20Sopenharmony_ci	shr	\$26,%rdx
17378c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
17388c2ecf20Sopenharmony_ci	shl	\$12,$d1
17398c2ecf20Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
17408c2ecf20Sopenharmony_ci	shr	\$14,$h1
17418c2ecf20Sopenharmony_ci	or	$d1,$h0
17428c2ecf20Sopenharmony_ci	shl	\$24,$h2
17438c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
17448c2ecf20Sopenharmony_ci	shr	\$40,$d2
17458c2ecf20Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
17468c2ecf20Sopenharmony_ci	or	$d2,$h2			# h[4]
17478c2ecf20Sopenharmony_ci
17488c2ecf20Sopenharmony_ci	vmovd	%rax#d,%x#$H0
17498c2ecf20Sopenharmony_ci	vmovd	%rdx#d,%x#$H1
17508c2ecf20Sopenharmony_ci	vmovd	$h0#d,%x#$H2
17518c2ecf20Sopenharmony_ci	vmovd	$h1#d,%x#$H3
17528c2ecf20Sopenharmony_ci	vmovd	$h2#d,%x#$H4
17538c2ecf20Sopenharmony_ci	movl	\$1,20($ctx)		# set is_base2_26
17548c2ecf20Sopenharmony_ci
17558c2ecf20Sopenharmony_ci	call	__poly1305_init_avx
17568c2ecf20Sopenharmony_ci
17578c2ecf20Sopenharmony_ci.Lproceed_avx2$suffix:
17588c2ecf20Sopenharmony_ci	mov	%r15,$len			# restore $len
17598c2ecf20Sopenharmony_ci___
17608c2ecf20Sopenharmony_ci$code.=<<___ if (!$kernel);
17618c2ecf20Sopenharmony_ci	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
17628c2ecf20Sopenharmony_ci	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
17638c2ecf20Sopenharmony_ci___
17648c2ecf20Sopenharmony_ci$code.=<<___;
17658c2ecf20Sopenharmony_ci	pop 		%r15
17668c2ecf20Sopenharmony_ci.cfi_restore	%r15
17678c2ecf20Sopenharmony_ci	pop 		%r14
17688c2ecf20Sopenharmony_ci.cfi_restore	%r14
17698c2ecf20Sopenharmony_ci	pop 		%r13
17708c2ecf20Sopenharmony_ci.cfi_restore	%r13
17718c2ecf20Sopenharmony_ci	pop 		%r12
17728c2ecf20Sopenharmony_ci.cfi_restore	%r12
17738c2ecf20Sopenharmony_ci	pop 		%rbx
17748c2ecf20Sopenharmony_ci.cfi_restore	%rbx
17758c2ecf20Sopenharmony_ci	pop 		%rbp
17768c2ecf20Sopenharmony_ci.cfi_restore 	%rbp
17778c2ecf20Sopenharmony_ci.Lbase2_64_avx2_epilogue$suffix:
17788c2ecf20Sopenharmony_ci	jmp	.Ldo_avx2$suffix
17798c2ecf20Sopenharmony_ci.cfi_endproc
17808c2ecf20Sopenharmony_ci
17818c2ecf20Sopenharmony_ci.align	32
17828c2ecf20Sopenharmony_ci.Leven_avx2$suffix:
17838c2ecf20Sopenharmony_ci.cfi_startproc
17848c2ecf20Sopenharmony_ci___
17858c2ecf20Sopenharmony_ci$code.=<<___ if (!$kernel);
17868c2ecf20Sopenharmony_ci	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
17878c2ecf20Sopenharmony_ci___
17888c2ecf20Sopenharmony_ci$code.=<<___;
17898c2ecf20Sopenharmony_ci	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
17908c2ecf20Sopenharmony_ci	vmovd		4*1($ctx),%x#$H1
17918c2ecf20Sopenharmony_ci	vmovd		4*2($ctx),%x#$H2
17928c2ecf20Sopenharmony_ci	vmovd		4*3($ctx),%x#$H3
17938c2ecf20Sopenharmony_ci	vmovd		4*4($ctx),%x#$H4
17948c2ecf20Sopenharmony_ci
17958c2ecf20Sopenharmony_ci.Ldo_avx2$suffix:
17968c2ecf20Sopenharmony_ci___
17978c2ecf20Sopenharmony_ci$code.=<<___		if (!$kernel && $avx>2);
17988c2ecf20Sopenharmony_ci	cmp		\$512,$len
17998c2ecf20Sopenharmony_ci	jb		.Lskip_avx512
18008c2ecf20Sopenharmony_ci	and		%r11d,%r9d
18018c2ecf20Sopenharmony_ci	test		\$`1<<16`,%r9d		# check for AVX512F
18028c2ecf20Sopenharmony_ci	jnz		.Lblocks_avx512
18038c2ecf20Sopenharmony_ci.Lskip_avx512$suffix:
18048c2ecf20Sopenharmony_ci___
18058c2ecf20Sopenharmony_ci$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
18068c2ecf20Sopenharmony_ci	cmp		\$512,$len
18078c2ecf20Sopenharmony_ci	jae		.Lblocks_avx512
18088c2ecf20Sopenharmony_ci___
18098c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
18108c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10
18118c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%r10
18128c2ecf20Sopenharmony_ci	sub		\$0x128,%rsp
18138c2ecf20Sopenharmony_ci___
18148c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
18158c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10
18168c2ecf20Sopenharmony_ci	sub		\$0x1c8,%rsp
18178c2ecf20Sopenharmony_ci	vmovdqa		%xmm6,-0xb0(%r10)
18188c2ecf20Sopenharmony_ci	vmovdqa		%xmm7,-0xa0(%r10)
18198c2ecf20Sopenharmony_ci	vmovdqa		%xmm8,-0x90(%r10)
18208c2ecf20Sopenharmony_ci	vmovdqa		%xmm9,-0x80(%r10)
18218c2ecf20Sopenharmony_ci	vmovdqa		%xmm10,-0x70(%r10)
18228c2ecf20Sopenharmony_ci	vmovdqa		%xmm11,-0x60(%r10)
18238c2ecf20Sopenharmony_ci	vmovdqa		%xmm12,-0x50(%r10)
18248c2ecf20Sopenharmony_ci	vmovdqa		%xmm13,-0x40(%r10)
18258c2ecf20Sopenharmony_ci	vmovdqa		%xmm14,-0x30(%r10)
18268c2ecf20Sopenharmony_ci	vmovdqa		%xmm15,-0x20(%r10)
18278c2ecf20Sopenharmony_ci.Ldo_avx2_body$suffix:
18288c2ecf20Sopenharmony_ci___
18298c2ecf20Sopenharmony_ci$code.=<<___;
18308c2ecf20Sopenharmony_ci	lea		.Lconst(%rip),%rcx
18318c2ecf20Sopenharmony_ci	lea		48+64($ctx),$ctx	# size optimization
18328c2ecf20Sopenharmony_ci	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_ci	# expand and copy pre-calculated table to stack
18358c2ecf20Sopenharmony_ci	vmovdqu		`16*0-64`($ctx),%x#$T2
18368c2ecf20Sopenharmony_ci	and		\$-512,%rsp
18378c2ecf20Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),%x#$T3
18388c2ecf20Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),%x#$T4
18398c2ecf20Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),%x#$D0
18408c2ecf20Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),%x#$D1
18418c2ecf20Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),%x#$D2
18428c2ecf20Sopenharmony_ci	lea		0x90(%rsp),%rax		# size optimization
18438c2ecf20Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),%x#$D3
18448c2ecf20Sopenharmony_ci	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
18458c2ecf20Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),%x#$D4
18468c2ecf20Sopenharmony_ci	vpermd		$T3,$T0,$T3
18478c2ecf20Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),%x#$MASK
18488c2ecf20Sopenharmony_ci	vpermd		$T4,$T0,$T4
18498c2ecf20Sopenharmony_ci	vmovdqa		$T2,0x00(%rsp)
18508c2ecf20Sopenharmony_ci	vpermd		$D0,$T0,$D0
18518c2ecf20Sopenharmony_ci	vmovdqa		$T3,0x20-0x90(%rax)
18528c2ecf20Sopenharmony_ci	vpermd		$D1,$T0,$D1
18538c2ecf20Sopenharmony_ci	vmovdqa		$T4,0x40-0x90(%rax)
18548c2ecf20Sopenharmony_ci	vpermd		$D2,$T0,$D2
18558c2ecf20Sopenharmony_ci	vmovdqa		$D0,0x60-0x90(%rax)
18568c2ecf20Sopenharmony_ci	vpermd		$D3,$T0,$D3
18578c2ecf20Sopenharmony_ci	vmovdqa		$D1,0x80-0x90(%rax)
18588c2ecf20Sopenharmony_ci	vpermd		$D4,$T0,$D4
18598c2ecf20Sopenharmony_ci	vmovdqa		$D2,0xa0-0x90(%rax)
18608c2ecf20Sopenharmony_ci	vpermd		$MASK,$T0,$MASK
18618c2ecf20Sopenharmony_ci	vmovdqa		$D3,0xc0-0x90(%rax)
18628c2ecf20Sopenharmony_ci	vmovdqa		$D4,0xe0-0x90(%rax)
18638c2ecf20Sopenharmony_ci	vmovdqa		$MASK,0x100-0x90(%rax)
18648c2ecf20Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
18658c2ecf20Sopenharmony_ci
18668c2ecf20Sopenharmony_ci	################################################################
18678c2ecf20Sopenharmony_ci	# load input
18688c2ecf20Sopenharmony_ci	vmovdqu		16*0($inp),%x#$T0
18698c2ecf20Sopenharmony_ci	vmovdqu		16*1($inp),%x#$T1
18708c2ecf20Sopenharmony_ci	vinserti128	\$1,16*2($inp),$T0,$T0
18718c2ecf20Sopenharmony_ci	vinserti128	\$1,16*3($inp),$T1,$T1
18728c2ecf20Sopenharmony_ci	lea		16*4($inp),$inp
18738c2ecf20Sopenharmony_ci
18748c2ecf20Sopenharmony_ci	vpsrldq		\$6,$T0,$T2		# splat input
18758c2ecf20Sopenharmony_ci	vpsrldq		\$6,$T1,$T3
18768c2ecf20Sopenharmony_ci	vpunpckhqdq	$T1,$T0,$T4		# 4
18778c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T2		# 2:3
18788c2ecf20Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
18798c2ecf20Sopenharmony_ci
18808c2ecf20Sopenharmony_ci	vpsrlq		\$30,$T2,$T3
18818c2ecf20Sopenharmony_ci	vpsrlq		\$4,$T2,$T2
18828c2ecf20Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
18838c2ecf20Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
18848c2ecf20Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
18858c2ecf20Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
18868c2ecf20Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
18878c2ecf20Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
18888c2ecf20Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
18898c2ecf20Sopenharmony_ci
18908c2ecf20Sopenharmony_ci	vpaddq		$H2,$T2,$H2		# accumulate input
18918c2ecf20Sopenharmony_ci	sub		\$64,$len
18928c2ecf20Sopenharmony_ci	jz		.Ltail_avx2$suffix
18938c2ecf20Sopenharmony_ci	jmp		.Loop_avx2$suffix
18948c2ecf20Sopenharmony_ci
18958c2ecf20Sopenharmony_ci.align	32
18968c2ecf20Sopenharmony_ci.Loop_avx2$suffix:
18978c2ecf20Sopenharmony_ci	################################################################
18988c2ecf20Sopenharmony_ci	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
18998c2ecf20Sopenharmony_ci	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
19008c2ecf20Sopenharmony_ci	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
19018c2ecf20Sopenharmony_ci	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
19028c2ecf20Sopenharmony_ci	#   \________/\__________/
19038c2ecf20Sopenharmony_ci	################################################################
19048c2ecf20Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
19058c2ecf20Sopenharmony_ci	vpaddq		$H0,$T0,$H0
19068c2ecf20Sopenharmony_ci	vmovdqa		`32*0`(%rsp),$T0	# r0^4
19078c2ecf20Sopenharmony_ci	vpaddq		$H1,$T1,$H1
19088c2ecf20Sopenharmony_ci	vmovdqa		`32*1`(%rsp),$T1	# r1^4
19098c2ecf20Sopenharmony_ci	vpaddq		$H3,$T3,$H3
19108c2ecf20Sopenharmony_ci	vmovdqa		`32*3`(%rsp),$T2	# r2^4
19118c2ecf20Sopenharmony_ci	vpaddq		$H4,$T4,$H4
19128c2ecf20Sopenharmony_ci	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
19138c2ecf20Sopenharmony_ci	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
19148c2ecf20Sopenharmony_ci
19158c2ecf20Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
19168c2ecf20Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
19178c2ecf20Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
19188c2ecf20Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
19198c2ecf20Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
19208c2ecf20Sopenharmony_ci	#
19218c2ecf20Sopenharmony_ci	# however, as h2 is "chronologically" first one available pull
19228c2ecf20Sopenharmony_ci	# corresponding operations up, so it's
19238c2ecf20Sopenharmony_ci	#
19248c2ecf20Sopenharmony_ci	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
19258c2ecf20Sopenharmony_ci	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
19268c2ecf20Sopenharmony_ci	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
19278c2ecf20Sopenharmony_ci	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
19288c2ecf20Sopenharmony_ci	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
19298c2ecf20Sopenharmony_ci
19308c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
19318c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
19328c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
19338c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
19348c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
19358c2ecf20Sopenharmony_ci
19368c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T1,$T4		# h0*r1
19378c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
19388c2ecf20Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
19398c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
19408c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*r1
19418c2ecf20Sopenharmony_ci	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
19428c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
19438c2ecf20Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
19448c2ecf20Sopenharmony_ci	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
19458c2ecf20Sopenharmony_ci
19468c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T0,$T4		# h0*r0
19478c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T0,$H2		# h1*r0
19488c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
19498c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
19508c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T0,$T4		# h3*r0
19518c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T0,$H2		# h4*r0
19528c2ecf20Sopenharmony_ci	 vmovdqu	16*0($inp),%x#$T0	# load input
19538c2ecf20Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
19548c2ecf20Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
19558c2ecf20Sopenharmony_ci	 vinserti128	\$1,16*2($inp),$T0,$T0
19568c2ecf20Sopenharmony_ci
19578c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*s2
19588c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T1,$H2		# h4*s2
19598c2ecf20Sopenharmony_ci	 vmovdqu	16*1($inp),%x#$T1
19608c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
19618c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
19628c2ecf20Sopenharmony_ci	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
19638c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T2,$T4		# h1*r2
19648c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r2
19658c2ecf20Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
19668c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
19678c2ecf20Sopenharmony_ci	 vinserti128	\$1,16*3($inp),$T1,$T1
19688c2ecf20Sopenharmony_ci	 lea		16*4($inp),$inp
19698c2ecf20Sopenharmony_ci
19708c2ecf20Sopenharmony_ci	vpmuludq	$H1,$H2,$T4		# h1*r3
19718c2ecf20Sopenharmony_ci	vpmuludq	$H0,$H2,$H2		# h0*r3
19728c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2		# splat input
19738c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
19748c2ecf20Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
19758c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T3,$T4		# h3*s3
19768c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T3,$H2		# h4*s3
19778c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3
19788c2ecf20Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
19798c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
19808c2ecf20Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
19818c2ecf20Sopenharmony_ci
19828c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S4,$H3		# h3*s4
19838c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S4,$H4		# h4*s4
19848c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
19858c2ecf20Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
19868c2ecf20Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
19878c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
19888c2ecf20Sopenharmony_ci	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
19898c2ecf20Sopenharmony_ci	vpmuludq	$H1,$S4,$H0		# h1*s4
19908c2ecf20Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
19918c2ecf20Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
19928c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
19938c2ecf20Sopenharmony_ci
19948c2ecf20Sopenharmony_ci	################################################################
19958c2ecf20Sopenharmony_ci	# lazy reduction (interleaved with tail of input splat)
19968c2ecf20Sopenharmony_ci
19978c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
19988c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
19998c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
20008c2ecf20Sopenharmony_ci
20018c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
20028c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
20038c2ecf20Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
20048c2ecf20Sopenharmony_ci
20058c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
20068c2ecf20Sopenharmony_ci	vpand		$MASK,$H4,$H4
20078c2ecf20Sopenharmony_ci
20088c2ecf20Sopenharmony_ci	 vpsrlq		\$4,$T3,$T2
20098c2ecf20Sopenharmony_ci
20108c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
20118c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1
20128c2ecf20Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
20138c2ecf20Sopenharmony_ci
20148c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0
20158c2ecf20Sopenharmony_ci	vpsllq		\$2,$D4,$D4
20168c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
20178c2ecf20Sopenharmony_ci
20188c2ecf20Sopenharmony_ci	 vpand		$MASK,$T2,$T2		# 2
20198c2ecf20Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
20208c2ecf20Sopenharmony_ci
20218c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
20228c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2
20238c2ecf20Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
20248c2ecf20Sopenharmony_ci
20258c2ecf20Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
20268c2ecf20Sopenharmony_ci	 vpsrlq		\$30,$T3,$T3
20278c2ecf20Sopenharmony_ci
20288c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
20298c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
20308c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
20318c2ecf20Sopenharmony_ci
20328c2ecf20Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
20338c2ecf20Sopenharmony_ci
20348c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
20358c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
20368c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
20378c2ecf20Sopenharmony_ci
20388c2ecf20Sopenharmony_ci	 vpand		$MASK,$T0,$T0		# 0
20398c2ecf20Sopenharmony_ci	 vpand		$MASK,$T1,$T1		# 1
20408c2ecf20Sopenharmony_ci	 vpand		$MASK,$T3,$T3		# 3
20418c2ecf20Sopenharmony_ci	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
20428c2ecf20Sopenharmony_ci
20438c2ecf20Sopenharmony_ci	sub		\$64,$len
20448c2ecf20Sopenharmony_ci	jnz		.Loop_avx2$suffix
20458c2ecf20Sopenharmony_ci
20468c2ecf20Sopenharmony_ci	.byte		0x66,0x90
20478c2ecf20Sopenharmony_ci.Ltail_avx2$suffix:
20488c2ecf20Sopenharmony_ci	################################################################
20498c2ecf20Sopenharmony_ci	# while above multiplications were by r^4 in all lanes, in last
20508c2ecf20Sopenharmony_ci	# iteration we multiply least significant lane by r^4 and most
20518c2ecf20Sopenharmony_ci	# significant one by r, so copy of above except that references
20528c2ecf20Sopenharmony_ci	# to the precomputed table are displaced by 4...
20538c2ecf20Sopenharmony_ci
20548c2ecf20Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
20558c2ecf20Sopenharmony_ci	vpaddq		$H0,$T0,$H0
20568c2ecf20Sopenharmony_ci	vmovdqu		`32*0+4`(%rsp),$T0	# r0: r^4 (low lane) .. r^1 (high lane)
20578c2ecf20Sopenharmony_ci	vpaddq		$H1,$T1,$H1
20588c2ecf20Sopenharmony_ci	vmovdqu		`32*1+4`(%rsp),$T1	# r1, same per-lane powers
20598c2ecf20Sopenharmony_ci	vpaddq		$H3,$T3,$H3
20608c2ecf20Sopenharmony_ci	vmovdqu		`32*3+4`(%rsp),$T2	# r2, same per-lane powers
20618c2ecf20Sopenharmony_ci	vpaddq		$H4,$T4,$H4
20628c2ecf20Sopenharmony_ci	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3, same per-lane powers
20638c2ecf20Sopenharmony_ci	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4, same per-lane powers
20648c2ecf20Sopenharmony_ci
20658c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
20668c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
20678c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
20688c2ecf20Sopenharmony_ci	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
20698c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
20708c2ecf20Sopenharmony_ci
20718c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T1,$T4		# h0*r1
20728c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T1,$H2		# h1*r1
20738c2ecf20Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
20748c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
20758c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*r1
20768c2ecf20Sopenharmony_ci	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
20778c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
20788c2ecf20Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
20798c2ecf20Sopenharmony_ci
20808c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T0,$T4		# h0*r0
20818c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T0,$H2		# h1*r0
20828c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
20838c2ecf20Sopenharmony_ci	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
20848c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
20858c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T0,$T4		# h3*r0
20868c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T0,$H2		# h4*r0
20878c2ecf20Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
20888c2ecf20Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
20898c2ecf20Sopenharmony_ci
20908c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*s2
20918c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T1,$H2		# h4*s2
20928c2ecf20Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
20938c2ecf20Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
20948c2ecf20Sopenharmony_ci	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
20958c2ecf20Sopenharmony_ci	vpmuludq	$H1,$T2,$T4		# h1*r2
20968c2ecf20Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r2
20978c2ecf20Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
20988c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
20998c2ecf20Sopenharmony_ci
21008c2ecf20Sopenharmony_ci	vpmuludq	$H1,$H2,$T4		# h1*r3
21018c2ecf20Sopenharmony_ci	vpmuludq	$H0,$H2,$H2		# h0*r3
21028c2ecf20Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
21038c2ecf20Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
21048c2ecf20Sopenharmony_ci	vpmuludq	$H3,$T3,$T4		# h3*s3
21058c2ecf20Sopenharmony_ci	vpmuludq	$H4,$T3,$H2		# h4*s3
21068c2ecf20Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
21078c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
21088c2ecf20Sopenharmony_ci
21098c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S4,$H3		# h3*s4
21108c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S4,$H4		# h4*s4
21118c2ecf20Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
21128c2ecf20Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
21138c2ecf20Sopenharmony_ci	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
21148c2ecf20Sopenharmony_ci	vpmuludq	$H1,$S4,$H0		# h1*s4
21158c2ecf20Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
21168c2ecf20Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
21178c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
21188c2ecf20Sopenharmony_ci
21198c2ecf20Sopenharmony_ci	################################################################
21208c2ecf20Sopenharmony_ci	# horizontal addition
21218c2ecf20Sopenharmony_ci
21228c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1,$T1
21238c2ecf20Sopenharmony_ci	vpsrldq		\$8,$H2,$T2
21248c2ecf20Sopenharmony_ci	vpsrldq		\$8,$H3,$T3
21258c2ecf20Sopenharmony_ci	vpsrldq		\$8,$H4,$T4
21268c2ecf20Sopenharmony_ci	vpsrldq		\$8,$H0,$T0
21278c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1
21288c2ecf20Sopenharmony_ci	vpaddq		$T2,$H2,$H2
21298c2ecf20Sopenharmony_ci	vpaddq		$T3,$H3,$H3
21308c2ecf20Sopenharmony_ci	vpaddq		$T4,$H4,$H4
21318c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
21328c2ecf20Sopenharmony_ci
21338c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H3,$T3
21348c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H4,$T4
21358c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H0,$T0
21368c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D1,$T1
21378c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H2,$T2
21388c2ecf20Sopenharmony_ci	vpaddq		$T3,$H3,$H3
21398c2ecf20Sopenharmony_ci	vpaddq		$T4,$H4,$H4
21408c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
21418c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1,$D1
21428c2ecf20Sopenharmony_ci	vpaddq		$T2,$H2,$H2
21438c2ecf20Sopenharmony_ci
21448c2ecf20Sopenharmony_ci	################################################################
21458c2ecf20Sopenharmony_ci	# lazy reduction
21468c2ecf20Sopenharmony_ci
21478c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
21488c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
21498c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
21508c2ecf20Sopenharmony_ci
21518c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
21528c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
21538c2ecf20Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
21548c2ecf20Sopenharmony_ci
21558c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
21568c2ecf20Sopenharmony_ci	vpand		$MASK,$H4,$H4
21578c2ecf20Sopenharmony_ci
21588c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
21598c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1
21608c2ecf20Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
21618c2ecf20Sopenharmony_ci
21628c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0
21638c2ecf20Sopenharmony_ci	vpsllq		\$2,$D4,$D4
21648c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
21658c2ecf20Sopenharmony_ci
21668c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
21678c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2
21688c2ecf20Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
21698c2ecf20Sopenharmony_ci
21708c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
21718c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
21728c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
21738c2ecf20Sopenharmony_ci
21748c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
21758c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
21768c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
21778c2ecf20Sopenharmony_ci
21788c2ecf20Sopenharmony_ci	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
21798c2ecf20Sopenharmony_ci	vmovd		%x#$H1,`4*1-48-64`($ctx)
21808c2ecf20Sopenharmony_ci	vmovd		%x#$H2,`4*2-48-64`($ctx)
21818c2ecf20Sopenharmony_ci	vmovd		%x#$H3,`4*3-48-64`($ctx)
21828c2ecf20Sopenharmony_ci	vmovd		%x#$H4,`4*4-48-64`($ctx)
21838c2ecf20Sopenharmony_ci___
21848c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
21858c2ecf20Sopenharmony_ci	vmovdqa		-0xb0(%r10),%xmm6	# Win64 only: reload xmm6-xmm15 from the r10-relative save area
21868c2ecf20Sopenharmony_ci	vmovdqa		-0xa0(%r10),%xmm7
21878c2ecf20Sopenharmony_ci	vmovdqa		-0x90(%r10),%xmm8
21888c2ecf20Sopenharmony_ci	vmovdqa		-0x80(%r10),%xmm9
21898c2ecf20Sopenharmony_ci	vmovdqa		-0x70(%r10),%xmm10
21908c2ecf20Sopenharmony_ci	vmovdqa		-0x60(%r10),%xmm11
21918c2ecf20Sopenharmony_ci	vmovdqa		-0x50(%r10),%xmm12
21928c2ecf20Sopenharmony_ci	vmovdqa		-0x40(%r10),%xmm13
21938c2ecf20Sopenharmony_ci	vmovdqa		-0x30(%r10),%xmm14
21948c2ecf20Sopenharmony_ci	vmovdqa		-0x20(%r10),%xmm15
21958c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp		# rsp = r10 - 8, presumably the entry rsp (prologue not visible here, TODO confirm)
21968c2ecf20Sopenharmony_ci.Ldo_avx2_epilogue$suffix:
21978c2ecf20Sopenharmony_ci___
21988c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
21998c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp		# rsp = r10 - 8, presumably the entry rsp (prologue not visible here, TODO confirm)
22008c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%rsp
22018c2ecf20Sopenharmony_ci___
22028c2ecf20Sopenharmony_ci$code.=<<___;
22038c2ecf20Sopenharmony_ci	vzeroupper				# zero bits 128 and above of the vector registers before returning
22048c2ecf20Sopenharmony_ci	RET
22058c2ecf20Sopenharmony_ci.cfi_endproc
22068c2ecf20Sopenharmony_ci___
22078c2ecf20Sopenharmony_ciif($avx > 2 && $avx512) {
22088c2ecf20Sopenharmony_cimy ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));	# %zmm16-24: key-power limbs R0..R4 and S1..S4 (S = 5*R, computed below)
22098c2ecf20Sopenharmony_cimy ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));	# %zmm25-29: scratch for partial products and carries
22108c2ecf20Sopenharmony_cimy $PADBIT="%zmm30";	# receives the pad bit broadcast from 32(%rcx)
22118c2ecf20Sopenharmony_ci
22128c2ecf20Sopenharmony_cimap(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain (map aliases $_, so s/// rewrites each variable in place)
22138c2ecf20Sopenharmony_cimap(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
22148c2ecf20Sopenharmony_cimap(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
22158c2ecf20Sopenharmony_cimap(s/%y/%z/,($MASK));
22168c2ecf20Sopenharmony_ci
22178c2ecf20Sopenharmony_ci$code.=<<___;
22188c2ecf20Sopenharmony_ci.cfi_startproc
22198c2ecf20Sopenharmony_ci.Lblocks_avx512:
22208c2ecf20Sopenharmony_ci	mov		\$15,%eax
22218c2ecf20Sopenharmony_ci	kmovw		%eax,%k2		# k2 = 0b1111, write mask used by the masked table stores further down
22228c2ecf20Sopenharmony_ci___
22238c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
22248c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10		# r10 = rsp + 8, becomes the CFA register on the next line
22258c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%r10
22268c2ecf20Sopenharmony_ci	sub		\$0x128,%rsp		# reserve 0x128 bytes of stack
22278c2ecf20Sopenharmony_ci___
22288c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
22298c2ecf20Sopenharmony_ci	lea		8(%rsp),%r10		# r10 = rsp + 8, base address for the xmm save slots below
22308c2ecf20Sopenharmony_ci	sub		\$0x1c8,%rsp		# 0x1c8 = 0x128 + 10*16, presumably the extra room holds the ten xmm saves
22318c2ecf20Sopenharmony_ci	vmovdqa		%xmm6,-0xb0(%r10)	# save xmm6-xmm15, nonvolatile in the Windows x64 calling convention
22328c2ecf20Sopenharmony_ci	vmovdqa		%xmm7,-0xa0(%r10)
22338c2ecf20Sopenharmony_ci	vmovdqa		%xmm8,-0x90(%r10)
22348c2ecf20Sopenharmony_ci	vmovdqa		%xmm9,-0x80(%r10)
22358c2ecf20Sopenharmony_ci	vmovdqa		%xmm10,-0x70(%r10)
22368c2ecf20Sopenharmony_ci	vmovdqa		%xmm11,-0x60(%r10)
22378c2ecf20Sopenharmony_ci	vmovdqa		%xmm12,-0x50(%r10)
22388c2ecf20Sopenharmony_ci	vmovdqa		%xmm13,-0x40(%r10)
22398c2ecf20Sopenharmony_ci	vmovdqa		%xmm14,-0x30(%r10)
22408c2ecf20Sopenharmony_ci	vmovdqa		%xmm15,-0x20(%r10)
22418c2ecf20Sopenharmony_ci.Ldo_avx512_body:
22428c2ecf20Sopenharmony_ci___
22438c2ecf20Sopenharmony_ci$code.=<<___;
22448c2ecf20Sopenharmony_ci	lea		.Lconst(%rip),%rcx
22458c2ecf20Sopenharmony_ci	lea		48+64($ctx),$ctx	# size optimization
22468c2ecf20Sopenharmony_ci	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
22478c2ecf20Sopenharmony_ci
22488c2ecf20Sopenharmony_ci	# expand pre-calculated table
22498c2ecf20Sopenharmony_ci	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
22508c2ecf20Sopenharmony_ci	and		\$-512,%rsp
22518c2ecf20Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
22528c2ecf20Sopenharmony_ci	mov		\$0x20,%rax
22538c2ecf20Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
22548c2ecf20Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
22558c2ecf20Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
22568c2ecf20Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
22578c2ecf20Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
22588c2ecf20Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
22598c2ecf20Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
22608c2ecf20Sopenharmony_ci	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
22618c2ecf20Sopenharmony_ci	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
22628c2ecf20Sopenharmony_ci	vpermd		$D1,$T2,$R1
22638c2ecf20Sopenharmony_ci	vpermd		$T0,$T2,$S1
22648c2ecf20Sopenharmony_ci	vpermd		$D2,$T2,$R2
22658c2ecf20Sopenharmony_ci	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
22668c2ecf20Sopenharmony_ci	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
22678c2ecf20Sopenharmony_ci	vpermd		$T1,$T2,$S2
22688c2ecf20Sopenharmony_ci	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
22698c2ecf20Sopenharmony_ci	 vpsrlq		\$32,$R1,$T1
22708c2ecf20Sopenharmony_ci	vpermd		$D3,$T2,$R3
22718c2ecf20Sopenharmony_ci	vmovdqa64	$S1,0x40(%rsp){%k2}
22728c2ecf20Sopenharmony_ci	vpermd		$T3,$T2,$S3
22738c2ecf20Sopenharmony_ci	vpermd		$D4,$T2,$R4
22748c2ecf20Sopenharmony_ci	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
22758c2ecf20Sopenharmony_ci	vpermd		$T4,$T2,$S4
22768c2ecf20Sopenharmony_ci	vmovdqa64	$S2,0x80(%rsp){%k2}
22778c2ecf20Sopenharmony_ci	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
22788c2ecf20Sopenharmony_ci	vmovdqa64	$S3,0xc0(%rsp){%k2}
22798c2ecf20Sopenharmony_ci	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
22808c2ecf20Sopenharmony_ci	vmovdqa64	$S4,0x100(%rsp){%k2}
22818c2ecf20Sopenharmony_ci
22828c2ecf20Sopenharmony_ci	################################################################
22838c2ecf20Sopenharmony_ci	# calculate 5th through 8th powers of the key
22848c2ecf20Sopenharmony_ci	#
22858c2ecf20Sopenharmony_ci	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
22868c2ecf20Sopenharmony_ci	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
22878c2ecf20Sopenharmony_ci	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
22888c2ecf20Sopenharmony_ci	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
22898c2ecf20Sopenharmony_ci	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
22908c2ecf20Sopenharmony_ci
22918c2ecf20Sopenharmony_ci	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
22928c2ecf20Sopenharmony_ci	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
22938c2ecf20Sopenharmony_ci	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
22948c2ecf20Sopenharmony_ci	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
22958c2ecf20Sopenharmony_ci	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
22968c2ecf20Sopenharmony_ci	 vpsrlq		\$32,$R2,$T2
22978c2ecf20Sopenharmony_ci
22988c2ecf20Sopenharmony_ci	vpmuludq	$T1,$S4,$M0
22998c2ecf20Sopenharmony_ci	vpmuludq	$T1,$R0,$M1
23008c2ecf20Sopenharmony_ci	vpmuludq	$T1,$R1,$M2
23018c2ecf20Sopenharmony_ci	vpmuludq	$T1,$R2,$M3
23028c2ecf20Sopenharmony_ci	vpmuludq	$T1,$R3,$M4
23038c2ecf20Sopenharmony_ci	 vpsrlq		\$32,$R3,$T3
23048c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
23058c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
23068c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
23078c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
23088c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
23098c2ecf20Sopenharmony_ci
23108c2ecf20Sopenharmony_ci	vpmuludq	$T2,$S3,$M0
23118c2ecf20Sopenharmony_ci	vpmuludq	$T2,$S4,$M1
23128c2ecf20Sopenharmony_ci	vpmuludq	$T2,$R1,$M3
23138c2ecf20Sopenharmony_ci	vpmuludq	$T2,$R2,$M4
23148c2ecf20Sopenharmony_ci	vpmuludq	$T2,$R0,$M2
23158c2ecf20Sopenharmony_ci	 vpsrlq		\$32,$R4,$T4
23168c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
23178c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
23188c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
23198c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
23208c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
23218c2ecf20Sopenharmony_ci
23228c2ecf20Sopenharmony_ci	vpmuludq	$T3,$S2,$M0
23238c2ecf20Sopenharmony_ci	vpmuludq	$T3,$R0,$M3
23248c2ecf20Sopenharmony_ci	vpmuludq	$T3,$R1,$M4
23258c2ecf20Sopenharmony_ci	vpmuludq	$T3,$S3,$M1
23268c2ecf20Sopenharmony_ci	vpmuludq	$T3,$S4,$M2
23278c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
23288c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
23298c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
23308c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
23318c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
23328c2ecf20Sopenharmony_ci
23338c2ecf20Sopenharmony_ci	vpmuludq	$T4,$S4,$M3
23348c2ecf20Sopenharmony_ci	vpmuludq	$T4,$R0,$M4
23358c2ecf20Sopenharmony_ci	vpmuludq	$T4,$S1,$M0
23368c2ecf20Sopenharmony_ci	vpmuludq	$T4,$S2,$M1
23378c2ecf20Sopenharmony_ci	vpmuludq	$T4,$S3,$M2
23388c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
23398c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
23408c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
23418c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
23428c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
23438c2ecf20Sopenharmony_ci
23448c2ecf20Sopenharmony_ci	################################################################
23458c2ecf20Sopenharmony_ci	# load input
23468c2ecf20Sopenharmony_ci	vmovdqu64	16*0($inp),%z#$T3
23478c2ecf20Sopenharmony_ci	vmovdqu64	16*4($inp),%z#$T4
23488c2ecf20Sopenharmony_ci	lea		16*8($inp),$inp
23498c2ecf20Sopenharmony_ci
23508c2ecf20Sopenharmony_ci	################################################################
23518c2ecf20Sopenharmony_ci	# lazy reduction
23528c2ecf20Sopenharmony_ci
23538c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D3,$M3
23548c2ecf20Sopenharmony_ci	vpandq		$MASK,$D3,$D3
23558c2ecf20Sopenharmony_ci	vpaddq		$M3,$D4,$D4		# d3 -> d4
23568c2ecf20Sopenharmony_ci
23578c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D0,$M0
23588c2ecf20Sopenharmony_ci	vpandq		$MASK,$D0,$D0
23598c2ecf20Sopenharmony_ci	vpaddq		$M0,$D1,$D1		# d0 -> d1
23608c2ecf20Sopenharmony_ci
23618c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D4,$M4
23628c2ecf20Sopenharmony_ci	vpandq		$MASK,$D4,$D4
23638c2ecf20Sopenharmony_ci
23648c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D1,$M1
23658c2ecf20Sopenharmony_ci	vpandq		$MASK,$D1,$D1
23668c2ecf20Sopenharmony_ci	vpaddq		$M1,$D2,$D2		# d1 -> d2
23678c2ecf20Sopenharmony_ci
23688c2ecf20Sopenharmony_ci	vpaddq		$M4,$D0,$D0
23698c2ecf20Sopenharmony_ci	vpsllq		\$2,$M4,$M4
23708c2ecf20Sopenharmony_ci	vpaddq		$M4,$D0,$D0		# d4 -> d0
23718c2ecf20Sopenharmony_ci
23728c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D2,$M2
23738c2ecf20Sopenharmony_ci	vpandq		$MASK,$D2,$D2
23748c2ecf20Sopenharmony_ci	vpaddq		$M2,$D3,$D3		# d2 -> d3
23758c2ecf20Sopenharmony_ci
23768c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D0,$M0
23778c2ecf20Sopenharmony_ci	vpandq		$MASK,$D0,$D0
23788c2ecf20Sopenharmony_ci	vpaddq		$M0,$D1,$D1		# d0 -> d1
23798c2ecf20Sopenharmony_ci
23808c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D3,$M3
23818c2ecf20Sopenharmony_ci	vpandq		$MASK,$D3,$D3
23828c2ecf20Sopenharmony_ci	vpaddq		$M3,$D4,$D4		# d3 -> d4
23838c2ecf20Sopenharmony_ci
23848c2ecf20Sopenharmony_ci	################################################################
23858c2ecf20Sopenharmony_ci	# at this point we have 14243444 in $R0-$S4 and 05060708 in
23868c2ecf20Sopenharmony_ci	# $D0-$D4, ...
23878c2ecf20Sopenharmony_ci
23888c2ecf20Sopenharmony_ci	vpunpcklqdq	$T4,$T3,$T0	# transpose input
23898c2ecf20Sopenharmony_ci	vpunpckhqdq	$T4,$T3,$T4
23908c2ecf20Sopenharmony_ci
23918c2ecf20Sopenharmony_ci	# ... since input 64-bit lanes are ordered as 73625140, we could
23928c2ecf20Sopenharmony_ci	# "vperm" it to 76543210 (here and in each loop iteration), *or*
23938c2ecf20Sopenharmony_ci	# we could just flow along, hence the goal for $R0-$S4 is
23948c2ecf20Sopenharmony_ci	# 1858286838784888 ...
23958c2ecf20Sopenharmony_ci
23968c2ecf20Sopenharmony_ci	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
23978c2ecf20Sopenharmony_ci	mov		\$0x7777,%eax
23988c2ecf20Sopenharmony_ci	kmovw		%eax,%k1
23998c2ecf20Sopenharmony_ci
24008c2ecf20Sopenharmony_ci	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
24018c2ecf20Sopenharmony_ci	vpermd		$R1,$M0,$R1
24028c2ecf20Sopenharmony_ci	vpermd		$R2,$M0,$R2
24038c2ecf20Sopenharmony_ci	vpermd		$R3,$M0,$R3
24048c2ecf20Sopenharmony_ci	vpermd		$R4,$M0,$R4
24058c2ecf20Sopenharmony_ci
24068c2ecf20Sopenharmony_ci	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
24078c2ecf20Sopenharmony_ci	vpermd		$D1,$M0,${R1}{%k1}
24088c2ecf20Sopenharmony_ci	vpermd		$D2,$M0,${R2}{%k1}
24098c2ecf20Sopenharmony_ci	vpermd		$D3,$M0,${R3}{%k1}
24108c2ecf20Sopenharmony_ci	vpermd		$D4,$M0,${R4}{%k1}
24118c2ecf20Sopenharmony_ci
24128c2ecf20Sopenharmony_ci	vpslld		\$2,$R1,$S1		# *5
24138c2ecf20Sopenharmony_ci	vpslld		\$2,$R2,$S2
24148c2ecf20Sopenharmony_ci	vpslld		\$2,$R3,$S3
24158c2ecf20Sopenharmony_ci	vpslld		\$2,$R4,$S4
24168c2ecf20Sopenharmony_ci	vpaddd		$R1,$S1,$S1
24178c2ecf20Sopenharmony_ci	vpaddd		$R2,$S2,$S2
24188c2ecf20Sopenharmony_ci	vpaddd		$R3,$S3,$S3
24198c2ecf20Sopenharmony_ci	vpaddd		$R4,$S4,$S4
24208c2ecf20Sopenharmony_ci
24218c2ecf20Sopenharmony_ci	vpbroadcastq	32(%rcx),$PADBIT	# .L129
24228c2ecf20Sopenharmony_ci
24238c2ecf20Sopenharmony_ci	vpsrlq		\$52,$T0,$T2		# splat input
24248c2ecf20Sopenharmony_ci	vpsllq		\$12,$T4,$T3
24258c2ecf20Sopenharmony_ci	vporq		$T3,$T2,$T2
24268c2ecf20Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
24278c2ecf20Sopenharmony_ci	vpsrlq		\$14,$T4,$T3
24288c2ecf20Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
24298c2ecf20Sopenharmony_ci	vpandq		$MASK,$T2,$T2		# 2
24308c2ecf20Sopenharmony_ci	vpandq		$MASK,$T0,$T0		# 0
24318c2ecf20Sopenharmony_ci	#vpandq		$MASK,$T1,$T1		# 1
24328c2ecf20Sopenharmony_ci	#vpandq		$MASK,$T3,$T3		# 3
24338c2ecf20Sopenharmony_ci	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
24348c2ecf20Sopenharmony_ci
24358c2ecf20Sopenharmony_ci	vpaddq		$H2,$T2,$H2		# accumulate input
24368c2ecf20Sopenharmony_ci	sub		\$192,$len
24378c2ecf20Sopenharmony_ci	jbe		.Ltail_avx512
24388c2ecf20Sopenharmony_ci	jmp		.Loop_avx512
24398c2ecf20Sopenharmony_ci
24408c2ecf20Sopenharmony_ci.align	32
24418c2ecf20Sopenharmony_ci.Loop_avx512:
24428c2ecf20Sopenharmony_ci	################################################################
24438c2ecf20Sopenharmony_ci	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
24448c2ecf20Sopenharmony_ci	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
24458c2ecf20Sopenharmony_ci	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
24468c2ecf20Sopenharmony_ci	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
24478c2ecf20Sopenharmony_ci	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
24488c2ecf20Sopenharmony_ci	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
24498c2ecf20Sopenharmony_ci	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
24508c2ecf20Sopenharmony_ci	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
24518c2ecf20Sopenharmony_ci	#   \________/\___________/
24528c2ecf20Sopenharmony_ci	################################################################
24538c2ecf20Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
24548c2ecf20Sopenharmony_ci
24558c2ecf20Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
24568c2ecf20Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
24578c2ecf20Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
24588c2ecf20Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
24598c2ecf20Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
24608c2ecf20Sopenharmony_ci	#
24618c2ecf20Sopenharmony_ci	# however, as h2 is "chronologically" first one available pull
24628c2ecf20Sopenharmony_ci	# corresponding operations up, so it's
24638c2ecf20Sopenharmony_ci	#
24648c2ecf20Sopenharmony_ci	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
24658c2ecf20Sopenharmony_ci	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
24668c2ecf20Sopenharmony_ci	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
24678c2ecf20Sopenharmony_ci	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
24688c2ecf20Sopenharmony_ci	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
24698c2ecf20Sopenharmony_ci
24708c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
24718c2ecf20Sopenharmony_ci	 vpaddq		$H0,$T0,$H0
24728c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
24738c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T1,$T1		# 1
24748c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
24758c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T3,$T3		# 3
24768c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
24778c2ecf20Sopenharmony_ci	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
24788c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
24798c2ecf20Sopenharmony_ci	 vpaddq		$H1,$T1,$H1		# accumulate input
24808c2ecf20Sopenharmony_ci	 vpaddq		$H3,$T3,$H3
24818c2ecf20Sopenharmony_ci	 vpaddq		$H4,$T4,$H4
24828c2ecf20Sopenharmony_ci
24838c2ecf20Sopenharmony_ci	  vmovdqu64	16*0($inp),$T3		# load input
24848c2ecf20Sopenharmony_ci	  vmovdqu64	16*4($inp),$T4
24858c2ecf20Sopenharmony_ci	  lea		16*8($inp),$inp
24868c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R3,$M3
24878c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R4,$M4
24888c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R0,$M0
24898c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R1,$M1
24908c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
24918c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
24928c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
24938c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
24948c2ecf20Sopenharmony_ci
24958c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R2,$M3
24968c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R3,$M4
24978c2ecf20Sopenharmony_ci	vpmuludq	$H1,$S4,$M0
24988c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R2,$M2
24998c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
25008c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
25018c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
25028c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
25038c2ecf20Sopenharmony_ci
25048c2ecf20Sopenharmony_ci	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
25058c2ecf20Sopenharmony_ci	  vpunpckhqdq	$T4,$T3,$T4
25068c2ecf20Sopenharmony_ci
25078c2ecf20Sopenharmony_ci	vpmuludq	$H3,$R0,$M3
25088c2ecf20Sopenharmony_ci	vpmuludq	$H3,$R1,$M4
25098c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R0,$M1
25108c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R1,$M2
25118c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
25128c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
25138c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
25148c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
25158c2ecf20Sopenharmony_ci
25168c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S4,$M3
25178c2ecf20Sopenharmony_ci	vpmuludq	$H4,$R0,$M4
25188c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S2,$M0
25198c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S3,$M1
25208c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
25218c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S4,$M2
25228c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
25238c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
25248c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
25258c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
25268c2ecf20Sopenharmony_ci
25278c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S1,$M0
25288c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S2,$M1
25298c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S3,$M2
25308c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
25318c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
25328c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
25338c2ecf20Sopenharmony_ci
25348c2ecf20Sopenharmony_ci	################################################################
25358c2ecf20Sopenharmony_ci	# lazy reduction (interleaved with input splat)
25368c2ecf20Sopenharmony_ci
25378c2ecf20Sopenharmony_ci	 vpsrlq		\$52,$T0,$T2		# splat input
25388c2ecf20Sopenharmony_ci	 vpsllq		\$12,$T4,$T3
25398c2ecf20Sopenharmony_ci
25408c2ecf20Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
25418c2ecf20Sopenharmony_ci	vpandq		$MASK,$D3,$D3
25428c2ecf20Sopenharmony_ci	vpaddq		$H3,$D4,$H4		# h3 -> h4
25438c2ecf20Sopenharmony_ci
25448c2ecf20Sopenharmony_ci	 vporq		$T3,$T2,$T2
25458c2ecf20Sopenharmony_ci
25468c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
25478c2ecf20Sopenharmony_ci	vpandq		$MASK,$H0,$H0
25488c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
25498c2ecf20Sopenharmony_ci
25508c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T2,$T2		# 2
25518c2ecf20Sopenharmony_ci
25528c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
25538c2ecf20Sopenharmony_ci	vpandq		$MASK,$H4,$H4
25548c2ecf20Sopenharmony_ci
25558c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
25568c2ecf20Sopenharmony_ci	vpandq		$MASK,$H1,$H1
25578c2ecf20Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
25588c2ecf20Sopenharmony_ci
25598c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0
25608c2ecf20Sopenharmony_ci	vpsllq		\$2,$D4,$D4
25618c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
25628c2ecf20Sopenharmony_ci
25638c2ecf20Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
25648c2ecf20Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
25658c2ecf20Sopenharmony_ci
25668c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
25678c2ecf20Sopenharmony_ci	vpandq		$MASK,$H2,$H2
25688c2ecf20Sopenharmony_ci	vpaddq		$D2,$D3,$H3		# h2 -> h3
25698c2ecf20Sopenharmony_ci
25708c2ecf20Sopenharmony_ci	 vpsrlq		\$14,$T4,$T3
25718c2ecf20Sopenharmony_ci
25728c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
25738c2ecf20Sopenharmony_ci	vpandq		$MASK,$H0,$H0
25748c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
25758c2ecf20Sopenharmony_ci
25768c2ecf20Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
25778c2ecf20Sopenharmony_ci
25788c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
25798c2ecf20Sopenharmony_ci	vpandq		$MASK,$H3,$H3
25808c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
25818c2ecf20Sopenharmony_ci
25828c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T0,$T0		# 0
25838c2ecf20Sopenharmony_ci	 #vpandq	$MASK,$T1,$T1		# 1
25848c2ecf20Sopenharmony_ci	 #vpandq	$MASK,$T3,$T3		# 3
25858c2ecf20Sopenharmony_ci	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
25868c2ecf20Sopenharmony_ci
25878c2ecf20Sopenharmony_ci	sub		\$128,$len
25888c2ecf20Sopenharmony_ci	ja		.Loop_avx512
25898c2ecf20Sopenharmony_ci
25908c2ecf20Sopenharmony_ci.Ltail_avx512:
25918c2ecf20Sopenharmony_ci	################################################################
25928c2ecf20Sopenharmony_ci	# while above multiplications were by r^8 in all lanes, in last
25938c2ecf20Sopenharmony_ci	# iteration we multiply least significant lane by r^8 and most
25948c2ecf20Sopenharmony_ci	# significant one by r, that's why table gets shifted...
25958c2ecf20Sopenharmony_ci
25968c2ecf20Sopenharmony_ci	vpsrlq		\$32,$R0,$R0		# 0105020603070408
25978c2ecf20Sopenharmony_ci	vpsrlq		\$32,$R1,$R1
25988c2ecf20Sopenharmony_ci	vpsrlq		\$32,$R2,$R2
25998c2ecf20Sopenharmony_ci	vpsrlq		\$32,$S3,$S3
26008c2ecf20Sopenharmony_ci	vpsrlq		\$32,$S4,$S4
26018c2ecf20Sopenharmony_ci	vpsrlq		\$32,$R3,$R3
26028c2ecf20Sopenharmony_ci	vpsrlq		\$32,$R4,$R4
26038c2ecf20Sopenharmony_ci	vpsrlq		\$32,$S1,$S1
26048c2ecf20Sopenharmony_ci	vpsrlq		\$32,$S2,$S2
26058c2ecf20Sopenharmony_ci
26068c2ecf20Sopenharmony_ci	################################################################
26078c2ecf20Sopenharmony_ci	# load either next or last 64 bytes of input
26088c2ecf20Sopenharmony_ci	lea		($inp,$len),$inp
26098c2ecf20Sopenharmony_ci
26108c2ecf20Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
26118c2ecf20Sopenharmony_ci	vpaddq		$H0,$T0,$H0
26128c2ecf20Sopenharmony_ci
26138c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
26148c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
26158c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
26168c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T1,$T1		# 1
26178c2ecf20Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
26188c2ecf20Sopenharmony_ci	 vpandq		$MASK,$T3,$T3		# 3
26198c2ecf20Sopenharmony_ci	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
26208c2ecf20Sopenharmony_ci	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
26218c2ecf20Sopenharmony_ci	 vpaddq		$H1,$T1,$H1		# accumulate input
26228c2ecf20Sopenharmony_ci	 vpaddq		$H3,$T3,$H3
26238c2ecf20Sopenharmony_ci	 vpaddq		$H4,$T4,$H4
26248c2ecf20Sopenharmony_ci
26258c2ecf20Sopenharmony_ci	  vmovdqu	16*0($inp),%x#$T0
26268c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R3,$M3
26278c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R4,$M4
26288c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R0,$M0
26298c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R1,$M1
26308c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
26318c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
26328c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
26338c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
26348c2ecf20Sopenharmony_ci
26358c2ecf20Sopenharmony_ci	  vmovdqu	16*1($inp),%x#$T1
26368c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R2,$M3
26378c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R3,$M4
26388c2ecf20Sopenharmony_ci	vpmuludq	$H1,$S4,$M0
26398c2ecf20Sopenharmony_ci	vpmuludq	$H0,$R2,$M2
26408c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
26418c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
26428c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
26438c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
26448c2ecf20Sopenharmony_ci
26458c2ecf20Sopenharmony_ci	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
26468c2ecf20Sopenharmony_ci	vpmuludq	$H3,$R0,$M3
26478c2ecf20Sopenharmony_ci	vpmuludq	$H3,$R1,$M4
26488c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R0,$M1
26498c2ecf20Sopenharmony_ci	vpmuludq	$H1,$R1,$M2
26508c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
26518c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
26528c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
26538c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
26548c2ecf20Sopenharmony_ci
26558c2ecf20Sopenharmony_ci	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
26568c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S4,$M3
26578c2ecf20Sopenharmony_ci	vpmuludq	$H4,$R0,$M4
26588c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S2,$M0
26598c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S3,$M1
26608c2ecf20Sopenharmony_ci	vpmuludq	$H3,$S4,$M2
26618c2ecf20Sopenharmony_ci	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
26628c2ecf20Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
26638c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
26648c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
26658c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
26668c2ecf20Sopenharmony_ci
26678c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S1,$M0
26688c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S2,$M1
26698c2ecf20Sopenharmony_ci	vpmuludq	$H4,$S3,$M2
26708c2ecf20Sopenharmony_ci	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
26718c2ecf20Sopenharmony_ci	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
26728c2ecf20Sopenharmony_ci	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
26738c2ecf20Sopenharmony_ci
26748c2ecf20Sopenharmony_ci	################################################################
26758c2ecf20Sopenharmony_ci	# horizontal addition
26768c2ecf20Sopenharmony_ci
26778c2ecf20Sopenharmony_ci	mov		\$1,%eax
26788c2ecf20Sopenharmony_ci	vpermq		\$0xb1,$H3,$D3
26798c2ecf20Sopenharmony_ci	vpermq		\$0xb1,$D4,$H4
26808c2ecf20Sopenharmony_ci	vpermq		\$0xb1,$H0,$D0
26818c2ecf20Sopenharmony_ci	vpermq		\$0xb1,$H1,$D1
26828c2ecf20Sopenharmony_ci	vpermq		\$0xb1,$H2,$D2
26838c2ecf20Sopenharmony_ci	vpaddq		$D3,$H3,$H3
26848c2ecf20Sopenharmony_ci	vpaddq		$D4,$H4,$H4
26858c2ecf20Sopenharmony_ci	vpaddq		$D0,$H0,$H0
26868c2ecf20Sopenharmony_ci	vpaddq		$D1,$H1,$H1
26878c2ecf20Sopenharmony_ci	vpaddq		$D2,$H2,$H2
26888c2ecf20Sopenharmony_ci
26898c2ecf20Sopenharmony_ci	kmovw		%eax,%k3
26908c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H3,$D3
26918c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H4,$D4
26928c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H0,$D0
26938c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H1,$D1
26948c2ecf20Sopenharmony_ci	vpermq		\$0x2,$H2,$D2
26958c2ecf20Sopenharmony_ci	vpaddq		$D3,$H3,$H3
26968c2ecf20Sopenharmony_ci	vpaddq		$D4,$H4,$H4
26978c2ecf20Sopenharmony_ci	vpaddq		$D0,$H0,$H0
26988c2ecf20Sopenharmony_ci	vpaddq		$D1,$H1,$H1
26998c2ecf20Sopenharmony_ci	vpaddq		$D2,$H2,$H2
27008c2ecf20Sopenharmony_ci
27018c2ecf20Sopenharmony_ci	vextracti64x4	\$0x1,$H3,%y#$D3
27028c2ecf20Sopenharmony_ci	vextracti64x4	\$0x1,$H4,%y#$D4
27038c2ecf20Sopenharmony_ci	vextracti64x4	\$0x1,$H0,%y#$D0
27048c2ecf20Sopenharmony_ci	vextracti64x4	\$0x1,$H1,%y#$D1
27058c2ecf20Sopenharmony_ci	vextracti64x4	\$0x1,$H2,%y#$D2
27068c2ecf20Sopenharmony_ci	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
27078c2ecf20Sopenharmony_ci	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
27088c2ecf20Sopenharmony_ci	vpaddq		$D0,$H0,${H0}{%k3}{z}
27098c2ecf20Sopenharmony_ci	vpaddq		$D1,$H1,${H1}{%k3}{z}
27108c2ecf20Sopenharmony_ci	vpaddq		$D2,$H2,${H2}{%k3}{z}
27118c2ecf20Sopenharmony_ci___
27128c2ecf20Sopenharmony_cimap(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
27138c2ecf20Sopenharmony_cimap(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
27148c2ecf20Sopenharmony_ci$code.=<<___;
27158c2ecf20Sopenharmony_ci	################################################################
27168c2ecf20Sopenharmony_ci	# lazy reduction (interleaved with input splat)
27178c2ecf20Sopenharmony_ci
27188c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
27198c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
27208c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2		# splat input
27218c2ecf20Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3
27228c2ecf20Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
27238c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
27248c2ecf20Sopenharmony_ci
27258c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
27268c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
27278c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
27288c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
27298c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
27308c2ecf20Sopenharmony_ci
27318c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
27328c2ecf20Sopenharmony_ci	vpand		$MASK,$H4,$H4
27338c2ecf20Sopenharmony_ci
27348c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
27358c2ecf20Sopenharmony_ci	vpand		$MASK,$H1,$H1
27368c2ecf20Sopenharmony_ci	 vpsrlq		\$30,$T2,$T3
27378c2ecf20Sopenharmony_ci	 vpsrlq		\$4,$T2,$T2
27388c2ecf20Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
27398c2ecf20Sopenharmony_ci
27408c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0
27418c2ecf20Sopenharmony_ci	vpsllq		\$2,$D4,$D4
27428c2ecf20Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
27438c2ecf20Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
27448c2ecf20Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
27458c2ecf20Sopenharmony_ci
27468c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
27478c2ecf20Sopenharmony_ci	vpand		$MASK,$H2,$H2
27488c2ecf20Sopenharmony_ci	 vpand		$MASK,$T2,$T2		# 2
27498c2ecf20Sopenharmony_ci	 vpand		$MASK,$T0,$T0		# 0
27508c2ecf20Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
27518c2ecf20Sopenharmony_ci
27528c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
27538c2ecf20Sopenharmony_ci	vpand		$MASK,$H0,$H0
27548c2ecf20Sopenharmony_ci	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
27558c2ecf20Sopenharmony_ci	 vpand		$MASK,$T1,$T1		# 1
27568c2ecf20Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
27578c2ecf20Sopenharmony_ci
27588c2ecf20Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
27598c2ecf20Sopenharmony_ci	vpand		$MASK,$H3,$H3
27608c2ecf20Sopenharmony_ci	 vpand		$MASK,$T3,$T3		# 3
27618c2ecf20Sopenharmony_ci	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
27628c2ecf20Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
27638c2ecf20Sopenharmony_ci
27648c2ecf20Sopenharmony_ci	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
27658c2ecf20Sopenharmony_ci	add		\$64,$len
27668c2ecf20Sopenharmony_ci	jnz		.Ltail_avx2$suffix
27678c2ecf20Sopenharmony_ci
27688c2ecf20Sopenharmony_ci	vpsubq		$T2,$H2,$H2		# undo input accumulation
27698c2ecf20Sopenharmony_ci	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
27708c2ecf20Sopenharmony_ci	vmovd		%x#$H1,`4*1-48-64`($ctx)
27718c2ecf20Sopenharmony_ci	vmovd		%x#$H2,`4*2-48-64`($ctx)
27728c2ecf20Sopenharmony_ci	vmovd		%x#$H3,`4*3-48-64`($ctx)
27738c2ecf20Sopenharmony_ci	vmovd		%x#$H4,`4*4-48-64`($ctx)
27748c2ecf20Sopenharmony_ci	vzeroall
27758c2ecf20Sopenharmony_ci___
27768c2ecf20Sopenharmony_ci$code.=<<___	if ($win64);
27778c2ecf20Sopenharmony_ci	movdqa		-0xb0(%r10),%xmm6
27788c2ecf20Sopenharmony_ci	movdqa		-0xa0(%r10),%xmm7
27798c2ecf20Sopenharmony_ci	movdqa		-0x90(%r10),%xmm8
27808c2ecf20Sopenharmony_ci	movdqa		-0x80(%r10),%xmm9
27818c2ecf20Sopenharmony_ci	movdqa		-0x70(%r10),%xmm10
27828c2ecf20Sopenharmony_ci	movdqa		-0x60(%r10),%xmm11
27838c2ecf20Sopenharmony_ci	movdqa		-0x50(%r10),%xmm12
27848c2ecf20Sopenharmony_ci	movdqa		-0x40(%r10),%xmm13
27858c2ecf20Sopenharmony_ci	movdqa		-0x30(%r10),%xmm14
27868c2ecf20Sopenharmony_ci	movdqa		-0x20(%r10),%xmm15
27878c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp
27888c2ecf20Sopenharmony_ci.Ldo_avx512_epilogue:
27898c2ecf20Sopenharmony_ci___
27908c2ecf20Sopenharmony_ci$code.=<<___	if (!$win64);
27918c2ecf20Sopenharmony_ci	lea		-8(%r10),%rsp
27928c2ecf20Sopenharmony_ci.cfi_def_cfa_register	%rsp
27938c2ecf20Sopenharmony_ci___
27948c2ecf20Sopenharmony_ci$code.=<<___;
27958c2ecf20Sopenharmony_ci	RET
27968c2ecf20Sopenharmony_ci.cfi_endproc
27978c2ecf20Sopenharmony_ci___
27988c2ecf20Sopenharmony_ci
27998c2ecf20Sopenharmony_ci}
28008c2ecf20Sopenharmony_ci
28018c2ecf20Sopenharmony_ci}
28028c2ecf20Sopenharmony_ci
# Emit the AVX2 flavour of the block function: open the function, let
# poly1305_blocks_avxN(0) append its body to $code, then close it.
# NOTE(review): the 32 and 4 passed to declare_function are presumably the
# alignment and argument count, and the 0 presumably selects the non-AVX512
# body -- confirm against declare_function/poly1305_blocks_avxN, which are
# defined above this point.
{
my $avx2_name = "poly1305_blocks_avx2";

&declare_function($avx2_name, 32, 4);
poly1305_blocks_avxN(0);
&end_function($avx2_name);
}
28068c2ecf20Sopenharmony_ci
28078c2ecf20Sopenharmony_ci#######################################################################
28088c2ecf20Sopenharmony_ciif ($avx>2) {
28098c2ecf20Sopenharmony_ci# On entry the input length is divisible by 64. But since the inner loop
28108c2ecf20Sopenharmony_ci# processes 128 bytes per iteration, lengths that are not divisible by
28118c2ecf20Sopenharmony_ci# 128 are handled by passing the trailing 64 bytes to .Ltail_avx2. For this
28128c2ecf20Sopenharmony_ci# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
28138c2ecf20Sopenharmony_ci# for this tail, we wouldn't even have to allocate a stack frame...
28148c2ecf20Sopenharmony_ci
# Emit the AVX512 flavour of the block function.  When $kernel is set the
# emitted function is bracketed by "#ifdef CONFIG_AS_AVX512" / "#endif"
# lines appended to $code; otherwise it is emitted unconditionally.
# NOTE(review): the 32 and 4 passed to declare_function are presumably the
# alignment and argument count, and the 1 presumably selects the AVX512
# body -- confirm against declare_function/poly1305_blocks_avxN.
{
my $avx512_name = "poly1305_blocks_avx512";

$code .= "#ifdef CONFIG_AS_AVX512\n"	if ($kernel);

&declare_function($avx512_name, 32, 4);
poly1305_blocks_avxN(1);
&end_function($avx512_name);

$code .= "#endif\n"			if ($kernel);
}
28268c2ecf20Sopenharmony_ci
28278c2ecf20Sopenharmony_ciif (!$kernel && $avx>3) {
28288c2ecf20Sopenharmony_ci########################################################################
28298c2ecf20Sopenharmony_ci# VPMADD52 version using 2^44 radix.
28308c2ecf20Sopenharmony_ci#
28318c2ecf20Sopenharmony_ci# One can argue that base 2^52 would be more natural. Well, even though
28328c2ecf20Sopenharmony_ci# some operations would be more natural, one has to recognize a couple of
28338c2ecf20Sopenharmony_ci# things. First, base 2^52 provides no advantage over base 2^44 in terms
28348c2ecf20Sopenharmony_ci# of the number of multiply-and-accumulate operations. Secondly, it makes it
28358c2ecf20Sopenharmony_ci# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
28368c2ecf20Sopenharmony_ci# reference implementations], which means that more such operations
28378c2ecf20Sopenharmony_ci# would have to be performed in the inner loop, which in turn makes the
28388c2ecf20Sopenharmony_ci# critical path longer. In other words, even though base 2^44 reduction
28398c2ecf20Sopenharmony_ci# might look less elegant, the overall critical path is actually shorter...
28408c2ecf20Sopenharmony_ci
28418c2ecf20Sopenharmony_ci########################################################################
28428c2ecf20Sopenharmony_ci# The layout of the opaque area is as follows.
28438c2ecf20Sopenharmony_ci#
28448c2ecf20Sopenharmony_ci#	unsigned __int64 h[3];		# current hash value base 2^44
28458c2ecf20Sopenharmony_ci#	unsigned __int64 s[2];		# key value*20 base 2^44
28468c2ecf20Sopenharmony_ci#	unsigned __int64 r[3];		# key value base 2^44
28478c2ecf20Sopenharmony_ci#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
28488c2ecf20Sopenharmony_ci#					# r^n positions reflect
28498c2ecf20Sopenharmony_ci#					# placement in register, not
28508c2ecf20Sopenharmony_ci#					# memory, R[3] is R[1]*20
28518c2ecf20Sopenharmony_ci
# poly1305_init_base2_44: emitted initialiser for the base 2^44 (VPMADD52)
# code path.  The emitted code, in order:
#   - stores zero to the three qwords at 0/8/16($ctx), i.e. the hash value;
#   - at .Linit_base2_44, takes the addresses of poly1305_blocks_vpmadd52
#     and poly1305_emit_base2_44 into %r10/%r11;
#   - ANDs the key qwords at 0($inp) and 8($inp) with 0x0ffffffc0fffffff
#     and 0x0ffffffc0ffffffc, then splits the masked 128-bit value into
#     limbs: r0 = low 44 bits -> 40($ctx), r1 = next 44 bits (shrd by 44,
#     masked) -> 48($ctx), r2 = masked high qword >> 24 -> 56($ctx);
#   - stores s1 = (r1*5)<<2 at 24($ctx) and s2 = (r2*5)<<2 at 32($ctx),
#     i.e. r1*20 and r2*20;
#   - writes -1 to 64($ctx); poly1305_blocks_vpmadd52 later tests the sign
#     of that qword to see whether the key powers still have to be computed;
#   - stores %r10/%r11 through %rdx (see below) and returns 1 in %eax.
28528c2ecf20Sopenharmony_ci$code.=<<___;
28538c2ecf20Sopenharmony_ci.type	poly1305_init_base2_44,\@function,3
28548c2ecf20Sopenharmony_ci.align	32
28558c2ecf20Sopenharmony_cipoly1305_init_base2_44:
28568c2ecf20Sopenharmony_ci	xor	%eax,%eax
28578c2ecf20Sopenharmony_ci	mov	%rax,0($ctx)		# initialize hash value
28588c2ecf20Sopenharmony_ci	mov	%rax,8($ctx)
28598c2ecf20Sopenharmony_ci	mov	%rax,16($ctx)
28608c2ecf20Sopenharmony_ci
28618c2ecf20Sopenharmony_ci.Linit_base2_44:
28628c2ecf20Sopenharmony_ci	lea	poly1305_blocks_vpmadd52(%rip),%r10
28638c2ecf20Sopenharmony_ci	lea	poly1305_emit_base2_44(%rip),%r11
28648c2ecf20Sopenharmony_ci
28658c2ecf20Sopenharmony_ci	mov	\$0x0ffffffc0fffffff,%rax
28668c2ecf20Sopenharmony_ci	mov	\$0x0ffffffc0ffffffc,%rcx
28678c2ecf20Sopenharmony_ci	and	0($inp),%rax
28688c2ecf20Sopenharmony_ci	mov	\$0x00000fffffffffff,%r8
28698c2ecf20Sopenharmony_ci	and	8($inp),%rcx
28708c2ecf20Sopenharmony_ci	mov	\$0x00000fffffffffff,%r9
28718c2ecf20Sopenharmony_ci	and	%rax,%r8
28728c2ecf20Sopenharmony_ci	shrd	\$44,%rcx,%rax
28738c2ecf20Sopenharmony_ci	mov	%r8,40($ctx)		# r0
28748c2ecf20Sopenharmony_ci	and	%r9,%rax
28758c2ecf20Sopenharmony_ci	shr	\$24,%rcx
28768c2ecf20Sopenharmony_ci	mov	%rax,48($ctx)		# r1
28778c2ecf20Sopenharmony_ci	lea	(%rax,%rax,4),%rax	# *5
28788c2ecf20Sopenharmony_ci	mov	%rcx,56($ctx)		# r2
28798c2ecf20Sopenharmony_ci	shl	\$2,%rax		# magic <<2
28808c2ecf20Sopenharmony_ci	lea	(%rcx,%rcx,4),%rcx	# *5
28818c2ecf20Sopenharmony_ci	shl	\$2,%rcx		# magic <<2
28828c2ecf20Sopenharmony_ci	mov	%rax,24($ctx)		# s1
28838c2ecf20Sopenharmony_ci	mov	%rcx,32($ctx)		# s2
28848c2ecf20Sopenharmony_ci	movq	\$-1,64($ctx)		# write impossible value
28858c2ecf20Sopenharmony_ci___
# Hand the two routine addresses back through the pointer in %rdx.  Exactly
# one of the next two fragments is emitted: 64-bit stores at offsets 0 and 8
# by default, or 32-bit stores at offsets 0 and 4 when $flavour matches
# /elf32/.
28868c2ecf20Sopenharmony_ci$code.=<<___	if ($flavour !~ /elf32/);
28878c2ecf20Sopenharmony_ci	mov	%r10,0(%rdx)
28888c2ecf20Sopenharmony_ci	mov	%r11,8(%rdx)
28898c2ecf20Sopenharmony_ci___
28908c2ecf20Sopenharmony_ci$code.=<<___	if ($flavour =~ /elf32/);
28918c2ecf20Sopenharmony_ci	mov	%r10d,0(%rdx)
28928c2ecf20Sopenharmony_ci	mov	%r11d,4(%rdx)
28938c2ecf20Sopenharmony_ci___
# Common tail: return 1 in %eax.
28948c2ecf20Sopenharmony_ci$code.=<<___;
28958c2ecf20Sopenharmony_ci	mov	\$1,%eax
28968c2ecf20Sopenharmony_ci	RET
28978c2ecf20Sopenharmony_ci.size	poly1305_init_base2_44,.-poly1305_init_base2_44
28988c2ecf20Sopenharmony_ci___
# poly1305_blocks_vpmadd52: base 2^44 block routine that consumes one
# 16-byte block per loop iteration using VPMADD52LUQ/VPMADD52HUQ, and hands
# any remaining blocks over to the .Lblocks_vpmadd52_4x label.
28998c2ecf20Sopenharmony_ci{
# Register assignment for this routine (lexically scoped to this block).
# Note: (22..25) produces four register names for three variables, so
# %ymm25 is simply left unassigned here.
29008c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
29018c2ecf20Sopenharmony_cimy ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
29028c2ecf20Sopenharmony_cimy ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
29038c2ecf20Sopenharmony_ci
# Flow of the emitted code:
#  1. $len >>= 4 converts the byte count into a block count; with zero
#     blocks the routine returns at once.  $padbit is shifted left by 40 and
#     the qword at 64($ctx) is fetched into %r8.
#  2. %rax becomes the number of blocks for the loop below: it starts as 3,
#     is replaced by 1 when $len >= 4 or when %r8 is non-negative (64($ctx)
#     no longer holds the -1 "impossible" marker), and is then ANDed with
#     $len.  If the result is zero, control goes straight to
#     .Lblocks_vpmadd52_4x; otherwise $len is reduced by %rax.
#  3. %k7 = 7 selects the three low qword lanes, %k1 = 1 the lowest lane.
#     Constants are loaded from .L2_44_inp_permd at offsets 0/32/64/96/128
#     (table contents live outside this block).  $padbit is moved into the
#     low qword of $PAD and vpermq \$0xcf places it in qword lane 2, the
#     other lanes receiving source qword 3.
#  4. Under %k7 with zeroing, $Dlo is loaded from 0($ctx) (hash value) and
#     the three key operands from 40/32/24($ctx); per the opaque-area layout
#     these are {r0,r1,r2}, {s2,r0,r1} and {s1,s2,r0} in lanes 0..2.
#  5. .Loop_vpmadd52, once per block: load 16 input bytes, split them with
#     vpermd/vpsrlvq/vpandq, OR in $PAD and add to the hash; broadcast hash
#     limb 0/1/2 into $H0/$H1/$H2; accumulate the low and high product
#     halves into $Dlo/$Dhi; then reduce using the per-lane shift tables and
#     vpermq \$0b10010011, which rotates the qword lanes up by one (lane 3
#     wraps to lane 0).  The last step takes the qword that wrapped into
#     lane 0 (mask %k1) and adds it to lane 0 once directly and once
#     shifted left by 2, i.e. five times in total.
#  6. After the loop the three hash qwords are stored back to 0($ctx); if
#     $len is still non-zero control continues at .Lblocks_vpmadd52_4x.
29048c2ecf20Sopenharmony_ci$code.=<<___;
29058c2ecf20Sopenharmony_ci.type	poly1305_blocks_vpmadd52,\@function,4
29068c2ecf20Sopenharmony_ci.align	32
29078c2ecf20Sopenharmony_cipoly1305_blocks_vpmadd52:
29088c2ecf20Sopenharmony_ci	shr	\$4,$len
29098c2ecf20Sopenharmony_ci	jz	.Lno_data_vpmadd52		# too short
29108c2ecf20Sopenharmony_ci
29118c2ecf20Sopenharmony_ci	shl	\$40,$padbit
29128c2ecf20Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
29138c2ecf20Sopenharmony_ci
29148c2ecf20Sopenharmony_ci	# if powers of the key are not calculated yet, process up to 3
29158c2ecf20Sopenharmony_ci	# blocks with this single-block subroutine, otherwise ensure that
29168c2ecf20Sopenharmony_ci	# length is divisible by 2 blocks and pass the rest down to next
29178c2ecf20Sopenharmony_ci	# subroutine...
29188c2ecf20Sopenharmony_ci
29198c2ecf20Sopenharmony_ci	mov	\$3,%rax
29208c2ecf20Sopenharmony_ci	mov	\$1,%r10
29218c2ecf20Sopenharmony_ci	cmp	\$4,$len			# is input long
29228c2ecf20Sopenharmony_ci	cmovae	%r10,%rax
29238c2ecf20Sopenharmony_ci	test	%r8,%r8				# is power value impossible?
29248c2ecf20Sopenharmony_ci	cmovns	%r10,%rax
29258c2ecf20Sopenharmony_ci
29268c2ecf20Sopenharmony_ci	and	$len,%rax			# is input of favourable length?
29278c2ecf20Sopenharmony_ci	jz	.Lblocks_vpmadd52_4x
29288c2ecf20Sopenharmony_ci
29298c2ecf20Sopenharmony_ci	sub		%rax,$len
29308c2ecf20Sopenharmony_ci	mov		\$7,%r10d
29318c2ecf20Sopenharmony_ci	mov		\$1,%r11d
29328c2ecf20Sopenharmony_ci	kmovw		%r10d,%k7
29338c2ecf20Sopenharmony_ci	lea		.L2_44_inp_permd(%rip),%r10
29348c2ecf20Sopenharmony_ci	kmovw		%r11d,%k1
29358c2ecf20Sopenharmony_ci
29368c2ecf20Sopenharmony_ci	vmovq		$padbit,%x#$PAD
29378c2ecf20Sopenharmony_ci	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
29388c2ecf20Sopenharmony_ci	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
29398c2ecf20Sopenharmony_ci	vpermq		\$0xcf,$PAD,$PAD
29408c2ecf20Sopenharmony_ci	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
29418c2ecf20Sopenharmony_ci
29428c2ecf20Sopenharmony_ci	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
29438c2ecf20Sopenharmony_ci	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
29448c2ecf20Sopenharmony_ci	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
29458c2ecf20Sopenharmony_ci	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
29468c2ecf20Sopenharmony_ci
29478c2ecf20Sopenharmony_ci	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
29488c2ecf20Sopenharmony_ci	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
29498c2ecf20Sopenharmony_ci
29508c2ecf20Sopenharmony_ci	jmp		.Loop_vpmadd52
29518c2ecf20Sopenharmony_ci
29528c2ecf20Sopenharmony_ci.align	32
29538c2ecf20Sopenharmony_ci.Loop_vpmadd52:
29548c2ecf20Sopenharmony_ci	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
29558c2ecf20Sopenharmony_ci	lea		16($inp),$inp
29568c2ecf20Sopenharmony_ci
29578c2ecf20Sopenharmony_ci	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
29588c2ecf20Sopenharmony_ci	vpsrlvq		$inp_shift,$T0,$T0
29598c2ecf20Sopenharmony_ci	vpandq		$reduc_mask,$T0,$T0
29608c2ecf20Sopenharmony_ci	vporq		$PAD,$T0,$T0
29618c2ecf20Sopenharmony_ci
29628c2ecf20Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
29638c2ecf20Sopenharmony_ci
29648c2ecf20Sopenharmony_ci	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
29658c2ecf20Sopenharmony_ci	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
29668c2ecf20Sopenharmony_ci	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
29678c2ecf20Sopenharmony_ci
29688c2ecf20Sopenharmony_ci	vpxord		$Dlo,$Dlo,$Dlo
29698c2ecf20Sopenharmony_ci	vpxord		$Dhi,$Dhi,$Dhi
29708c2ecf20Sopenharmony_ci
29718c2ecf20Sopenharmony_ci	vpmadd52luq	$r2r1r0,$H0,$Dlo
29728c2ecf20Sopenharmony_ci	vpmadd52huq	$r2r1r0,$H0,$Dhi
29738c2ecf20Sopenharmony_ci
29748c2ecf20Sopenharmony_ci	vpmadd52luq	$r1r0s2,$H1,$Dlo
29758c2ecf20Sopenharmony_ci	vpmadd52huq	$r1r0s2,$H1,$Dhi
29768c2ecf20Sopenharmony_ci
29778c2ecf20Sopenharmony_ci	vpmadd52luq	$r0s2s1,$H2,$Dlo
29788c2ecf20Sopenharmony_ci	vpmadd52huq	$r0s2s1,$H2,$Dhi
29798c2ecf20Sopenharmony_ci
29808c2ecf20Sopenharmony_ci	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
29818c2ecf20Sopenharmony_ci	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
29828c2ecf20Sopenharmony_ci	vpandq		$reduc_mask,$Dlo,$Dlo
29838c2ecf20Sopenharmony_ci
29848c2ecf20Sopenharmony_ci	vpaddq		$T0,$Dhi,$Dhi
29858c2ecf20Sopenharmony_ci
29868c2ecf20Sopenharmony_ci	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
29878c2ecf20Sopenharmony_ci
29888c2ecf20Sopenharmony_ci	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
29898c2ecf20Sopenharmony_ci
29908c2ecf20Sopenharmony_ci	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
29918c2ecf20Sopenharmony_ci	vpandq		$reduc_mask,$Dlo,$Dlo
29928c2ecf20Sopenharmony_ci
29938c2ecf20Sopenharmony_ci	vpermq		\$0b10010011,$T0,$T0
29948c2ecf20Sopenharmony_ci
29958c2ecf20Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
29968c2ecf20Sopenharmony_ci
29978c2ecf20Sopenharmony_ci	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
29988c2ecf20Sopenharmony_ci
29998c2ecf20Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
30008c2ecf20Sopenharmony_ci	vpsllq		\$2,$T0,$T0
30018c2ecf20Sopenharmony_ci
30028c2ecf20Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
30038c2ecf20Sopenharmony_ci
30048c2ecf20Sopenharmony_ci	dec		%rax			# len-=16
30058c2ecf20Sopenharmony_ci	jnz		.Loop_vpmadd52
30068c2ecf20Sopenharmony_ci
30078c2ecf20Sopenharmony_ci	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
30088c2ecf20Sopenharmony_ci
30098c2ecf20Sopenharmony_ci	test		$len,$len
30108c2ecf20Sopenharmony_ci	jnz		.Lblocks_vpmadd52_4x
30118c2ecf20Sopenharmony_ci
30128c2ecf20Sopenharmony_ci.Lno_data_vpmadd52:
30138c2ecf20Sopenharmony_ci	RET
30148c2ecf20Sopenharmony_ci.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
30158c2ecf20Sopenharmony_ci___
30168c2ecf20Sopenharmony_ci}
30178c2ecf20Sopenharmony_ci{
30188c2ecf20Sopenharmony_ci########################################################################
30198c2ecf20Sopenharmony_ci# As implied by its name, the 4x subroutine processes 4 blocks in parallel
30208c2ecf20Sopenharmony_ci# (but also handles lengths of 4*n+2 blocks). It uses key powers up to the
30218c2ecf20Sopenharmony_ci# 4th and operates on 256-bit %ymm registers.
30228c2ecf20Sopenharmony_ci
30238c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
30248c2ecf20Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
30258c2ecf20Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
30268c2ecf20Sopenharmony_ci
30278c2ecf20Sopenharmony_ci$code.=<<___;
30288c2ecf20Sopenharmony_ci.type	poly1305_blocks_vpmadd52_4x,\@function,4
30298c2ecf20Sopenharmony_ci.align	32
30308c2ecf20Sopenharmony_cipoly1305_blocks_vpmadd52_4x:
30318c2ecf20Sopenharmony_ci	shr	\$4,$len
30328c2ecf20Sopenharmony_ci	jz	.Lno_data_vpmadd52_4x		# too short
30338c2ecf20Sopenharmony_ci
30348c2ecf20Sopenharmony_ci	shl	\$40,$padbit
30358c2ecf20Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
30368c2ecf20Sopenharmony_ci
30378c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x:
30388c2ecf20Sopenharmony_ci	vpbroadcastq	$padbit,$PAD
30398c2ecf20Sopenharmony_ci
30408c2ecf20Sopenharmony_ci	vmovdqa64	.Lx_mask44(%rip),$mask44
30418c2ecf20Sopenharmony_ci	mov		\$5,%eax
30428c2ecf20Sopenharmony_ci	vmovdqa64	.Lx_mask42(%rip),$mask42
30438c2ecf20Sopenharmony_ci	kmovw		%eax,%k1		# used in 2x path
30448c2ecf20Sopenharmony_ci
30458c2ecf20Sopenharmony_ci	test		%r8,%r8			# is power value impossible?
30468c2ecf20Sopenharmony_ci	js		.Linit_vpmadd52		# if it is, then init R[4]
30478c2ecf20Sopenharmony_ci
30488c2ecf20Sopenharmony_ci	vmovq		0($ctx),%x#$H0		# load current hash value
30498c2ecf20Sopenharmony_ci	vmovq		8($ctx),%x#$H1
30508c2ecf20Sopenharmony_ci	vmovq		16($ctx),%x#$H2
30518c2ecf20Sopenharmony_ci
30528c2ecf20Sopenharmony_ci	test		\$3,$len		# is length 4*n+2?
30538c2ecf20Sopenharmony_ci	jnz		.Lblocks_vpmadd52_2x_do
30548c2ecf20Sopenharmony_ci
30558c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x_do:
30568c2ecf20Sopenharmony_ci	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
30578c2ecf20Sopenharmony_ci	vpbroadcastq	96($ctx),$R1
30588c2ecf20Sopenharmony_ci	vpbroadcastq	128($ctx),$R2
30598c2ecf20Sopenharmony_ci	vpbroadcastq	160($ctx),$S1
30608c2ecf20Sopenharmony_ci
30618c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_4x_key_loaded:
30628c2ecf20Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
30638c2ecf20Sopenharmony_ci	vpaddq		$R2,$S2,$S2
30648c2ecf20Sopenharmony_ci	vpsllq		\$2,$S2,$S2
30658c2ecf20Sopenharmony_ci
30668c2ecf20Sopenharmony_ci	test		\$7,$len		# is len 8*n?
30678c2ecf20Sopenharmony_ci	jz		.Lblocks_vpmadd52_8x
30688c2ecf20Sopenharmony_ci
30698c2ecf20Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
30708c2ecf20Sopenharmony_ci	vmovdqu64	16*2($inp),$T3
30718c2ecf20Sopenharmony_ci	lea		16*4($inp),$inp
30728c2ecf20Sopenharmony_ci
30738c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
30748c2ecf20Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
30758c2ecf20Sopenharmony_ci
30768c2ecf20Sopenharmony_ci	# at this point 64-bit lanes are ordered as 3-1-2-0
30778c2ecf20Sopenharmony_ci
30788c2ecf20Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
30798c2ecf20Sopenharmony_ci	vporq		$PAD,$T2,$T2
30808c2ecf20Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
30818c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T0
30828c2ecf20Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
30838c2ecf20Sopenharmony_ci	vpsllq		\$20,$T3,$T3
30848c2ecf20Sopenharmony_ci	vporq		$T3,$T1,$T1
30858c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T1
30868c2ecf20Sopenharmony_ci
30878c2ecf20Sopenharmony_ci	sub		\$4,$len
30888c2ecf20Sopenharmony_ci	jz		.Ltail_vpmadd52_4x
30898c2ecf20Sopenharmony_ci	jmp		.Loop_vpmadd52_4x
30908c2ecf20Sopenharmony_ci	ud2
30918c2ecf20Sopenharmony_ci
30928c2ecf20Sopenharmony_ci.align	32
30938c2ecf20Sopenharmony_ci.Linit_vpmadd52:
30948c2ecf20Sopenharmony_ci	vmovq		24($ctx),%x#$S1		# load key
30958c2ecf20Sopenharmony_ci	vmovq		56($ctx),%x#$H2
30968c2ecf20Sopenharmony_ci	vmovq		32($ctx),%x#$S2
30978c2ecf20Sopenharmony_ci	vmovq		40($ctx),%x#$R0
30988c2ecf20Sopenharmony_ci	vmovq		48($ctx),%x#$R1
30998c2ecf20Sopenharmony_ci
31008c2ecf20Sopenharmony_ci	vmovdqa		$R0,$H0
31018c2ecf20Sopenharmony_ci	vmovdqa		$R1,$H1
31028c2ecf20Sopenharmony_ci	vmovdqa		$H2,$R2
31038c2ecf20Sopenharmony_ci
31048c2ecf20Sopenharmony_ci	mov		\$2,%eax
31058c2ecf20Sopenharmony_ci
31068c2ecf20Sopenharmony_ci.Lmul_init_vpmadd52:
31078c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
31088c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
31098c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
31108c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
31118c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
31128c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
31138c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
31148c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
31158c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
31168c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
31178c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
31188c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
31198c2ecf20Sopenharmony_ci
31208c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
31218c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
31228c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
31238c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
31248c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
31258c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
31268c2ecf20Sopenharmony_ci
31278c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
31288c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
31298c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
31308c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
31318c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
31328c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
31338c2ecf20Sopenharmony_ci
31348c2ecf20Sopenharmony_ci	################################################################
31358c2ecf20Sopenharmony_ci	# partial reduction
31368c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
31378c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
31388c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
31398c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
31408c2ecf20Sopenharmony_ci
31418c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
31428c2ecf20Sopenharmony_ci
31438c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
31448c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
31458c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
31468c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
31478c2ecf20Sopenharmony_ci
31488c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
31498c2ecf20Sopenharmony_ci
31508c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
31518c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
31528c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
31538c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
31548c2ecf20Sopenharmony_ci
31558c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
31568c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
31578c2ecf20Sopenharmony_ci
31588c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
31598c2ecf20Sopenharmony_ci
31608c2ecf20Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
31618c2ecf20Sopenharmony_ci	vpandq		$mask44,$H0,$H0
31628c2ecf20Sopenharmony_ci
31638c2ecf20Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
31648c2ecf20Sopenharmony_ci
31658c2ecf20Sopenharmony_ci	dec		%eax
31668c2ecf20Sopenharmony_ci	jz		.Ldone_init_vpmadd52
31678c2ecf20Sopenharmony_ci
31688c2ecf20Sopenharmony_ci	vpunpcklqdq	$R1,$H1,$R1		# 1,2
31698c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
31708c2ecf20Sopenharmony_ci	vpunpcklqdq	$R2,$H2,$R2
31718c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$H2,%x#$H2
31728c2ecf20Sopenharmony_ci	vpunpcklqdq	$R0,$H0,$R0
31738c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$H0,%x#$H0
31748c2ecf20Sopenharmony_ci
31758c2ecf20Sopenharmony_ci	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
31768c2ecf20Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
31778c2ecf20Sopenharmony_ci	vpaddq		$R1,$S1,$S1
31788c2ecf20Sopenharmony_ci	vpaddq		$R2,$S2,$S2
31798c2ecf20Sopenharmony_ci	vpsllq		\$2,$S1,$S1
31808c2ecf20Sopenharmony_ci	vpsllq		\$2,$S2,$S2
31818c2ecf20Sopenharmony_ci
31828c2ecf20Sopenharmony_ci	jmp		.Lmul_init_vpmadd52
31838c2ecf20Sopenharmony_ci	ud2
31848c2ecf20Sopenharmony_ci
31858c2ecf20Sopenharmony_ci.align	32
31868c2ecf20Sopenharmony_ci.Ldone_init_vpmadd52:
31878c2ecf20Sopenharmony_ci	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
31888c2ecf20Sopenharmony_ci	vinserti128	\$1,%x#$R2,$H2,$R2
31898c2ecf20Sopenharmony_ci	vinserti128	\$1,%x#$R0,$H0,$R0
31908c2ecf20Sopenharmony_ci
31918c2ecf20Sopenharmony_ci	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
31928c2ecf20Sopenharmony_ci	vpermq		\$0b11011000,$R2,$R2
31938c2ecf20Sopenharmony_ci	vpermq		\$0b11011000,$R0,$R0
31948c2ecf20Sopenharmony_ci
31958c2ecf20Sopenharmony_ci	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
31968c2ecf20Sopenharmony_ci	vpaddq		$R1,$S1,$S1
31978c2ecf20Sopenharmony_ci	vpsllq		\$2,$S1,$S1
31988c2ecf20Sopenharmony_ci
31998c2ecf20Sopenharmony_ci	vmovq		0($ctx),%x#$H0		# load current hash value
32008c2ecf20Sopenharmony_ci	vmovq		8($ctx),%x#$H1
32018c2ecf20Sopenharmony_ci	vmovq		16($ctx),%x#$H2
32028c2ecf20Sopenharmony_ci
32038c2ecf20Sopenharmony_ci	test		\$3,$len		# is length 4*n+2?
32048c2ecf20Sopenharmony_ci	jnz		.Ldone_init_vpmadd52_2x
32058c2ecf20Sopenharmony_ci
32068c2ecf20Sopenharmony_ci	vmovdqu64	$R0,64($ctx)		# save key powers
32078c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
32088c2ecf20Sopenharmony_ci	vmovdqu64	$R1,96($ctx)
32098c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R1,$R1
32108c2ecf20Sopenharmony_ci	vmovdqu64	$R2,128($ctx)
32118c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R2,$R2
32128c2ecf20Sopenharmony_ci	vmovdqu64	$S1,160($ctx)
32138c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$S1,$S1
32148c2ecf20Sopenharmony_ci
32158c2ecf20Sopenharmony_ci	jmp		.Lblocks_vpmadd52_4x_key_loaded
32168c2ecf20Sopenharmony_ci	ud2
32178c2ecf20Sopenharmony_ci
32188c2ecf20Sopenharmony_ci.align	32
32198c2ecf20Sopenharmony_ci.Ldone_init_vpmadd52_2x:
32208c2ecf20Sopenharmony_ci	vmovdqu64	$R0,64($ctx)		# save key powers
32218c2ecf20Sopenharmony_ci	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
32228c2ecf20Sopenharmony_ci	vmovdqu64	$R1,96($ctx)
32238c2ecf20Sopenharmony_ci	vpsrldq		\$8,$R1,$R1
32248c2ecf20Sopenharmony_ci	vmovdqu64	$R2,128($ctx)
32258c2ecf20Sopenharmony_ci	vpsrldq		\$8,$R2,$R2
32268c2ecf20Sopenharmony_ci	vmovdqu64	$S1,160($ctx)
32278c2ecf20Sopenharmony_ci	vpsrldq		\$8,$S1,$S1
32288c2ecf20Sopenharmony_ci	jmp		.Lblocks_vpmadd52_2x_key_loaded
32298c2ecf20Sopenharmony_ci	ud2
32308c2ecf20Sopenharmony_ci
32318c2ecf20Sopenharmony_ci.align	32
32328c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_2x_do:
32338c2ecf20Sopenharmony_ci	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
32348c2ecf20Sopenharmony_ci	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
32358c2ecf20Sopenharmony_ci	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
32368c2ecf20Sopenharmony_ci	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
32378c2ecf20Sopenharmony_ci
32388c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_2x_key_loaded:
32398c2ecf20Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
32408c2ecf20Sopenharmony_ci	vpxorq		$T3,$T3,$T3
32418c2ecf20Sopenharmony_ci	lea		16*2($inp),$inp
32428c2ecf20Sopenharmony_ci
32438c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
32448c2ecf20Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
32458c2ecf20Sopenharmony_ci
32468c2ecf20Sopenharmony_ci	# at this point 64-bit lanes are ordered as x-1-x-0
32478c2ecf20Sopenharmony_ci
32488c2ecf20Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
32498c2ecf20Sopenharmony_ci	vporq		$PAD,$T2,$T2
32508c2ecf20Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
32518c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T0
32528c2ecf20Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
32538c2ecf20Sopenharmony_ci	vpsllq		\$20,$T3,$T3
32548c2ecf20Sopenharmony_ci	vporq		$T3,$T1,$T1
32558c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T1
32568c2ecf20Sopenharmony_ci
32578c2ecf20Sopenharmony_ci	jmp		.Ltail_vpmadd52_2x
32588c2ecf20Sopenharmony_ci	ud2
32598c2ecf20Sopenharmony_ci
32608c2ecf20Sopenharmony_ci.align	32
32618c2ecf20Sopenharmony_ci.Loop_vpmadd52_4x:
32628c2ecf20Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
32638c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
32648c2ecf20Sopenharmony_ci	vpaddq		$T1,$H1,$H1
32658c2ecf20Sopenharmony_ci
32668c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
32678c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
32688c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
32698c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
32708c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
32718c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
32728c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
32738c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
32748c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
32758c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
32768c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
32778c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
32788c2ecf20Sopenharmony_ci
32798c2ecf20Sopenharmony_ci	 vmovdqu64	16*0($inp),$T2		# load data
32808c2ecf20Sopenharmony_ci	 vmovdqu64	16*2($inp),$T3
32818c2ecf20Sopenharmony_ci	 lea		16*4($inp),$inp
32828c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
32838c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
32848c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
32858c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
32868c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
32878c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
32888c2ecf20Sopenharmony_ci
32898c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
32908c2ecf20Sopenharmony_ci	 vpunpckhqdq	$T3,$T2,$T3
32918c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
32928c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
32938c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
32948c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
32958c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
32968c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
32978c2ecf20Sopenharmony_ci
32988c2ecf20Sopenharmony_ci	################################################################
32998c2ecf20Sopenharmony_ci	# partial reduction (interleaved with data splat)
33008c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
33018c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
33028c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
33038c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
33048c2ecf20Sopenharmony_ci
33058c2ecf20Sopenharmony_ci	 vpsrlq		\$24,$T3,$T2
33068c2ecf20Sopenharmony_ci	 vporq		$PAD,$T2,$T2
33078c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
33088c2ecf20Sopenharmony_ci
33098c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
33108c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
33118c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
33128c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
33138c2ecf20Sopenharmony_ci
33148c2ecf20Sopenharmony_ci	 vpandq		$mask44,$T1,$T0
33158c2ecf20Sopenharmony_ci	 vpsrlq		\$44,$T1,$T1
33168c2ecf20Sopenharmony_ci	 vpsllq		\$20,$T3,$T3
33178c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
33188c2ecf20Sopenharmony_ci
33198c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
33208c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
33218c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
33228c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
33238c2ecf20Sopenharmony_ci
33248c2ecf20Sopenharmony_ci	  vpaddq	$T2,$H2,$H2		# accumulate input
33258c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
33268c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
33278c2ecf20Sopenharmony_ci
33288c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
33298c2ecf20Sopenharmony_ci	 vporq		$T3,$T1,$T1
33308c2ecf20Sopenharmony_ci	 vpandq		$mask44,$T1,$T1
33318c2ecf20Sopenharmony_ci
33328c2ecf20Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
33338c2ecf20Sopenharmony_ci	vpandq		$mask44,$H0,$H0
33348c2ecf20Sopenharmony_ci
33358c2ecf20Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
33368c2ecf20Sopenharmony_ci
33378c2ecf20Sopenharmony_ci	sub		\$4,$len		# len-=64
33388c2ecf20Sopenharmony_ci	jnz		.Loop_vpmadd52_4x
33398c2ecf20Sopenharmony_ci
33408c2ecf20Sopenharmony_ci.Ltail_vpmadd52_4x:
33418c2ecf20Sopenharmony_ci	vmovdqu64	128($ctx),$R2		# load all key powers
33428c2ecf20Sopenharmony_ci	vmovdqu64	160($ctx),$S1
33438c2ecf20Sopenharmony_ci	vmovdqu64	64($ctx),$R0
33448c2ecf20Sopenharmony_ci	vmovdqu64	96($ctx),$R1
33458c2ecf20Sopenharmony_ci
33468c2ecf20Sopenharmony_ci.Ltail_vpmadd52_2x:
33478c2ecf20Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
33488c2ecf20Sopenharmony_ci	vpaddq		$R2,$S2,$S2
33498c2ecf20Sopenharmony_ci	vpsllq		\$2,$S2,$S2
33508c2ecf20Sopenharmony_ci
33518c2ecf20Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
33528c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
33538c2ecf20Sopenharmony_ci	vpaddq		$T1,$H1,$H1
33548c2ecf20Sopenharmony_ci
33558c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
33568c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
33578c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
33588c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
33598c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
33608c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
33618c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
33628c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
33638c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
33648c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
33658c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
33668c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
33678c2ecf20Sopenharmony_ci
33688c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
33698c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
33708c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
33718c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
33728c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
33738c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
33748c2ecf20Sopenharmony_ci
33758c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
33768c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
33778c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
33788c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
33798c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
33808c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
33818c2ecf20Sopenharmony_ci
33828c2ecf20Sopenharmony_ci	################################################################
33838c2ecf20Sopenharmony_ci	# horizontal addition
33848c2ecf20Sopenharmony_ci
33858c2ecf20Sopenharmony_ci	mov		\$1,%eax
33868c2ecf20Sopenharmony_ci	kmovw		%eax,%k1
33878c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D0lo,$T0
33888c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D0hi,$H0
33898c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1lo,$T1
33908c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1hi,$H1
33918c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
33928c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
33938c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D2lo,$T2
33948c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D2hi,$H2
33958c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
33968c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
33978c2ecf20Sopenharmony_ci	 vpermq		\$0x2,$D0lo,$T0
33988c2ecf20Sopenharmony_ci	 vpermq		\$0x2,$D0hi,$H0
33998c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
34008c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
34018c2ecf20Sopenharmony_ci
34028c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D1lo,$T1
34038c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D1hi,$H1
34048c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
34058c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
34068c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D2lo,$T2
34078c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D2hi,$H2
34088c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
34098c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
34108c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
34118c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
34128c2ecf20Sopenharmony_ci
34138c2ecf20Sopenharmony_ci	################################################################
34148c2ecf20Sopenharmony_ci	# partial reduction
34158c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
34168c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
34178c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
34188c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
34198c2ecf20Sopenharmony_ci
34208c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
34218c2ecf20Sopenharmony_ci
34228c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
34238c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
34248c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
34258c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
34268c2ecf20Sopenharmony_ci
34278c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
34288c2ecf20Sopenharmony_ci
34298c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
34308c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
34318c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
34328c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
34338c2ecf20Sopenharmony_ci
34348c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
34358c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
34368c2ecf20Sopenharmony_ci
34378c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
34388c2ecf20Sopenharmony_ci
34398c2ecf20Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
34408c2ecf20Sopenharmony_ci	vpandq		$mask44,$H0,$H0
34418c2ecf20Sopenharmony_ci
34428c2ecf20Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
34438c2ecf20Sopenharmony_ci						# at this point $len is
34448c2ecf20Sopenharmony_ci						# either 4*n+2 or 0...
34458c2ecf20Sopenharmony_ci	sub		\$2,$len		# len-=32
34468c2ecf20Sopenharmony_ci	ja		.Lblocks_vpmadd52_4x_do
34478c2ecf20Sopenharmony_ci
34488c2ecf20Sopenharmony_ci	vmovq		%x#$H0,0($ctx)
34498c2ecf20Sopenharmony_ci	vmovq		%x#$H1,8($ctx)
34508c2ecf20Sopenharmony_ci	vmovq		%x#$H2,16($ctx)
34518c2ecf20Sopenharmony_ci	vzeroall
34528c2ecf20Sopenharmony_ci
34538c2ecf20Sopenharmony_ci.Lno_data_vpmadd52_4x:
34548c2ecf20Sopenharmony_ci	RET
34558c2ecf20Sopenharmony_ci.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
34568c2ecf20Sopenharmony_ci___
34578c2ecf20Sopenharmony_ci}
34588c2ecf20Sopenharmony_ci{
34598c2ecf20Sopenharmony_ci########################################################################
34608c2ecf20Sopenharmony_ci# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
34618c2ecf20Sopenharmony_ci# This is an intermediate version, as it's used only in cases when the input
34628c2ecf20Sopenharmony_ci# length is either 8*n, 8*n+1 or 8*n+2...
34638c2ecf20Sopenharmony_ci
34648c2ecf20Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
34658c2ecf20Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
34668c2ecf20Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
34678c2ecf20Sopenharmony_cimy ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
34688c2ecf20Sopenharmony_ci
34698c2ecf20Sopenharmony_ci$code.=<<___;
34708c2ecf20Sopenharmony_ci.type	poly1305_blocks_vpmadd52_8x,\@function,4
34718c2ecf20Sopenharmony_ci.align	32
34728c2ecf20Sopenharmony_cipoly1305_blocks_vpmadd52_8x:
34738c2ecf20Sopenharmony_ci	shr	\$4,$len
34748c2ecf20Sopenharmony_ci	jz	.Lno_data_vpmadd52_8x		# too short
34758c2ecf20Sopenharmony_ci
34768c2ecf20Sopenharmony_ci	shl	\$40,$padbit
34778c2ecf20Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
34788c2ecf20Sopenharmony_ci
34798c2ecf20Sopenharmony_ci	vmovdqa64	.Lx_mask44(%rip),$mask44
34808c2ecf20Sopenharmony_ci	vmovdqa64	.Lx_mask42(%rip),$mask42
34818c2ecf20Sopenharmony_ci
34828c2ecf20Sopenharmony_ci	test	%r8,%r8				# is power value impossible?
34838c2ecf20Sopenharmony_ci	js	.Linit_vpmadd52			# if it is, then init R[4]
34848c2ecf20Sopenharmony_ci
34858c2ecf20Sopenharmony_ci	vmovq	0($ctx),%x#$H0			# load current hash value
34868c2ecf20Sopenharmony_ci	vmovq	8($ctx),%x#$H1
34878c2ecf20Sopenharmony_ci	vmovq	16($ctx),%x#$H2
34888c2ecf20Sopenharmony_ci
34898c2ecf20Sopenharmony_ci.Lblocks_vpmadd52_8x:
34908c2ecf20Sopenharmony_ci	################################################################
34918c2ecf20Sopenharmony_ci	# first we calculate more key powers
34928c2ecf20Sopenharmony_ci
34938c2ecf20Sopenharmony_ci	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
34948c2ecf20Sopenharmony_ci	vmovdqu64	160($ctx),$S1
34958c2ecf20Sopenharmony_ci	vmovdqu64	64($ctx),$R0
34968c2ecf20Sopenharmony_ci	vmovdqu64	96($ctx),$R1
34978c2ecf20Sopenharmony_ci
34988c2ecf20Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
34998c2ecf20Sopenharmony_ci	vpaddq		$R2,$S2,$S2
35008c2ecf20Sopenharmony_ci	vpsllq		\$2,$S2,$S2
35018c2ecf20Sopenharmony_ci
35028c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
35038c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R0,$RR0
35048c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$R1,$RR1
35058c2ecf20Sopenharmony_ci
35068c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
35078c2ecf20Sopenharmony_ci	vpmadd52luq	$RR2,$S1,$D0lo
35088c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
35098c2ecf20Sopenharmony_ci	vpmadd52huq	$RR2,$S1,$D0hi
35108c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
35118c2ecf20Sopenharmony_ci	vpmadd52luq	$RR2,$S2,$D1lo
35128c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
35138c2ecf20Sopenharmony_ci	vpmadd52huq	$RR2,$S2,$D1hi
35148c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
35158c2ecf20Sopenharmony_ci	vpmadd52luq	$RR2,$R0,$D2lo
35168c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
35178c2ecf20Sopenharmony_ci	vpmadd52huq	$RR2,$R0,$D2hi
35188c2ecf20Sopenharmony_ci
35198c2ecf20Sopenharmony_ci	vpmadd52luq	$RR0,$R0,$D0lo
35208c2ecf20Sopenharmony_ci	vpmadd52huq	$RR0,$R0,$D0hi
35218c2ecf20Sopenharmony_ci	vpmadd52luq	$RR0,$R1,$D1lo
35228c2ecf20Sopenharmony_ci	vpmadd52huq	$RR0,$R1,$D1hi
35238c2ecf20Sopenharmony_ci	vpmadd52luq	$RR0,$R2,$D2lo
35248c2ecf20Sopenharmony_ci	vpmadd52huq	$RR0,$R2,$D2hi
35258c2ecf20Sopenharmony_ci
35268c2ecf20Sopenharmony_ci	vpmadd52luq	$RR1,$S2,$D0lo
35278c2ecf20Sopenharmony_ci	vpmadd52huq	$RR1,$S2,$D0hi
35288c2ecf20Sopenharmony_ci	vpmadd52luq	$RR1,$R0,$D1lo
35298c2ecf20Sopenharmony_ci	vpmadd52huq	$RR1,$R0,$D1hi
35308c2ecf20Sopenharmony_ci	vpmadd52luq	$RR1,$R1,$D2lo
35318c2ecf20Sopenharmony_ci	vpmadd52huq	$RR1,$R1,$D2hi
35328c2ecf20Sopenharmony_ci
35338c2ecf20Sopenharmony_ci	################################################################
35348c2ecf20Sopenharmony_ci	# partial reduction
35358c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
35368c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
35378c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$RR0
35388c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
35398c2ecf20Sopenharmony_ci
35408c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
35418c2ecf20Sopenharmony_ci
35428c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
35438c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
35448c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$RR1
35458c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
35468c2ecf20Sopenharmony_ci
35478c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
35488c2ecf20Sopenharmony_ci
35498c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
35508c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
35518c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$RR2
35528c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
35538c2ecf20Sopenharmony_ci
35548c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$RR0,$RR0
35558c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
35568c2ecf20Sopenharmony_ci
35578c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$RR0,$RR0
35588c2ecf20Sopenharmony_ci
35598c2ecf20Sopenharmony_ci	vpsrlq		\$44,$RR0,$tmp		# additional step
35608c2ecf20Sopenharmony_ci	vpandq		$mask44,$RR0,$RR0
35618c2ecf20Sopenharmony_ci
35628c2ecf20Sopenharmony_ci	vpaddq		$tmp,$RR1,$RR1
35638c2ecf20Sopenharmony_ci
35648c2ecf20Sopenharmony_ci	################################################################
35658c2ecf20Sopenharmony_ci	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
35668c2ecf20Sopenharmony_ci	# is 15263748, which reflects how data is loaded...
35678c2ecf20Sopenharmony_ci
35688c2ecf20Sopenharmony_ci	vpunpcklqdq	$R2,$RR2,$T2		# 3748
35698c2ecf20Sopenharmony_ci	vpunpckhqdq	$R2,$RR2,$R2		# 1526
35708c2ecf20Sopenharmony_ci	vpunpcklqdq	$R0,$RR0,$T0
35718c2ecf20Sopenharmony_ci	vpunpckhqdq	$R0,$RR0,$R0
35728c2ecf20Sopenharmony_ci	vpunpcklqdq	$R1,$RR1,$T1
35738c2ecf20Sopenharmony_ci	vpunpckhqdq	$R1,$RR1,$R1
35748c2ecf20Sopenharmony_ci___
35758c2ecf20Sopenharmony_ci######## switch to %zmm
35768c2ecf20Sopenharmony_cimap(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
35778c2ecf20Sopenharmony_cimap(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
35788c2ecf20Sopenharmony_cimap(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
35798c2ecf20Sopenharmony_cimap(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
35808c2ecf20Sopenharmony_ci
35818c2ecf20Sopenharmony_ci$code.=<<___;
35828c2ecf20Sopenharmony_ci	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
35838c2ecf20Sopenharmony_ci	vshufi64x2	\$0x44,$R0,$T0,$RR0
35848c2ecf20Sopenharmony_ci	vshufi64x2	\$0x44,$R1,$T1,$RR1
35858c2ecf20Sopenharmony_ci
35868c2ecf20Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
35878c2ecf20Sopenharmony_ci	vmovdqu64	16*4($inp),$T3
35888c2ecf20Sopenharmony_ci	lea		16*8($inp),$inp
35898c2ecf20Sopenharmony_ci
35908c2ecf20Sopenharmony_ci	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
35918c2ecf20Sopenharmony_ci	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
35928c2ecf20Sopenharmony_ci	vpaddq		$RR2,$SS2,$SS2
35938c2ecf20Sopenharmony_ci	vpaddq		$RR1,$SS1,$SS1
35948c2ecf20Sopenharmony_ci	vpsllq		\$2,$SS2,$SS2
35958c2ecf20Sopenharmony_ci	vpsllq		\$2,$SS1,$SS1
35968c2ecf20Sopenharmony_ci
35978c2ecf20Sopenharmony_ci	vpbroadcastq	$padbit,$PAD
35988c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$mask44,$mask44
35998c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$mask42,$mask42
36008c2ecf20Sopenharmony_ci
36018c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
36028c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$SS2,$S2
36038c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$RR0,$R0
36048c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$RR1,$R1
36058c2ecf20Sopenharmony_ci	vpbroadcastq	%x#$RR2,$R2
36068c2ecf20Sopenharmony_ci
36078c2ecf20Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
36088c2ecf20Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
36098c2ecf20Sopenharmony_ci
36108c2ecf20Sopenharmony_ci	# at this point 64-bit lanes are ordered as 73625140
36118c2ecf20Sopenharmony_ci
36128c2ecf20Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
36138c2ecf20Sopenharmony_ci	vporq		$PAD,$T2,$T2
36148c2ecf20Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
36158c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T0
36168c2ecf20Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
36178c2ecf20Sopenharmony_ci	vpsllq		\$20,$T3,$T3
36188c2ecf20Sopenharmony_ci	vporq		$T3,$T1,$T1
36198c2ecf20Sopenharmony_ci	vpandq		$mask44,$T1,$T1
36208c2ecf20Sopenharmony_ci
36218c2ecf20Sopenharmony_ci	sub		\$8,$len
36228c2ecf20Sopenharmony_ci	jz		.Ltail_vpmadd52_8x
36238c2ecf20Sopenharmony_ci	jmp		.Loop_vpmadd52_8x
36248c2ecf20Sopenharmony_ci
36258c2ecf20Sopenharmony_ci.align	32
36268c2ecf20Sopenharmony_ci.Loop_vpmadd52_8x:
36278c2ecf20Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
36288c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
36298c2ecf20Sopenharmony_ci	vpaddq		$T1,$H1,$H1
36308c2ecf20Sopenharmony_ci
36318c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
36328c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
36338c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
36348c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
36358c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
36368c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
36378c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
36388c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
36398c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
36408c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
36418c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
36428c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
36438c2ecf20Sopenharmony_ci
36448c2ecf20Sopenharmony_ci	 vmovdqu64	16*0($inp),$T2		# load data
36458c2ecf20Sopenharmony_ci	 vmovdqu64	16*4($inp),$T3
36468c2ecf20Sopenharmony_ci	 lea		16*8($inp),$inp
36478c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
36488c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
36498c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
36508c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
36518c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
36528c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
36538c2ecf20Sopenharmony_ci
36548c2ecf20Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
36558c2ecf20Sopenharmony_ci	 vpunpckhqdq	$T3,$T2,$T3
36568c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
36578c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
36588c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
36598c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
36608c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
36618c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
36628c2ecf20Sopenharmony_ci
36638c2ecf20Sopenharmony_ci	################################################################
36648c2ecf20Sopenharmony_ci	# partial reduction (interleaved with data splat)
36658c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
36668c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
36678c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
36688c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
36698c2ecf20Sopenharmony_ci
36708c2ecf20Sopenharmony_ci	 vpsrlq		\$24,$T3,$T2
36718c2ecf20Sopenharmony_ci	 vporq		$PAD,$T2,$T2
36728c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
36738c2ecf20Sopenharmony_ci
36748c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
36758c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
36768c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
36778c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
36788c2ecf20Sopenharmony_ci
36798c2ecf20Sopenharmony_ci	 vpandq		$mask44,$T1,$T0
36808c2ecf20Sopenharmony_ci	 vpsrlq		\$44,$T1,$T1
36818c2ecf20Sopenharmony_ci	 vpsllq		\$20,$T3,$T3
36828c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
36838c2ecf20Sopenharmony_ci
36848c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
36858c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
36868c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
36878c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
36888c2ecf20Sopenharmony_ci
36898c2ecf20Sopenharmony_ci	  vpaddq	$T2,$H2,$H2		# accumulate input
36908c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
36918c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
36928c2ecf20Sopenharmony_ci
36938c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
36948c2ecf20Sopenharmony_ci	 vporq		$T3,$T1,$T1
36958c2ecf20Sopenharmony_ci	 vpandq		$mask44,$T1,$T1
36968c2ecf20Sopenharmony_ci
36978c2ecf20Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
36988c2ecf20Sopenharmony_ci	vpandq		$mask44,$H0,$H0
36998c2ecf20Sopenharmony_ci
37008c2ecf20Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
37018c2ecf20Sopenharmony_ci
37028c2ecf20Sopenharmony_ci	sub		\$8,$len		# len-=128
37038c2ecf20Sopenharmony_ci	jnz		.Loop_vpmadd52_8x
37048c2ecf20Sopenharmony_ci
37058c2ecf20Sopenharmony_ci.Ltail_vpmadd52_8x:
37068c2ecf20Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
37078c2ecf20Sopenharmony_ci	vpaddq		$T0,$H0,$H0
37088c2ecf20Sopenharmony_ci	vpaddq		$T1,$H1,$H1
37098c2ecf20Sopenharmony_ci
37108c2ecf20Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
37118c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$SS1,$D0lo
37128c2ecf20Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
37138c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$SS1,$D0hi
37148c2ecf20Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
37158c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$SS2,$D1lo
37168c2ecf20Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
37178c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$SS2,$D1hi
37188c2ecf20Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
37198c2ecf20Sopenharmony_ci	vpmadd52luq	$H2,$RR0,$D2lo
37208c2ecf20Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
37218c2ecf20Sopenharmony_ci	vpmadd52huq	$H2,$RR0,$D2hi
37228c2ecf20Sopenharmony_ci
37238c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$RR0,$D0lo
37248c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$RR0,$D0hi
37258c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$RR1,$D1lo
37268c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$RR1,$D1hi
37278c2ecf20Sopenharmony_ci	vpmadd52luq	$H0,$RR2,$D2lo
37288c2ecf20Sopenharmony_ci	vpmadd52huq	$H0,$RR2,$D2hi
37298c2ecf20Sopenharmony_ci
37308c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$SS2,$D0lo
37318c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$SS2,$D0hi
37328c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$RR0,$D1lo
37338c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$RR0,$D1hi
37348c2ecf20Sopenharmony_ci	vpmadd52luq	$H1,$RR1,$D2lo
37358c2ecf20Sopenharmony_ci	vpmadd52huq	$H1,$RR1,$D2hi
37368c2ecf20Sopenharmony_ci
37378c2ecf20Sopenharmony_ci	################################################################
37388c2ecf20Sopenharmony_ci	# horizontal addition
37398c2ecf20Sopenharmony_ci
37408c2ecf20Sopenharmony_ci	mov		\$1,%eax
37418c2ecf20Sopenharmony_ci	kmovw		%eax,%k1
37428c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D0lo,$T0
37438c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D0hi,$H0
37448c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1lo,$T1
37458c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D1hi,$H1
37468c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
37478c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
37488c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D2lo,$T2
37498c2ecf20Sopenharmony_ci	vpsrldq		\$8,$D2hi,$H2
37508c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
37518c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
37528c2ecf20Sopenharmony_ci	 vpermq		\$0x2,$D0lo,$T0
37538c2ecf20Sopenharmony_ci	 vpermq		\$0x2,$D0hi,$H0
37548c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
37558c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
37568c2ecf20Sopenharmony_ci
37578c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D1lo,$T1
37588c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D1hi,$H1
37598c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
37608c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
37618c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D2lo,$T2
37628c2ecf20Sopenharmony_ci	vpermq		\$0x2,$D2hi,$H2
37638c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
37648c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
37658c2ecf20Sopenharmony_ci	 vextracti64x4	\$1,$D0lo,%y#$T0
37668c2ecf20Sopenharmony_ci	 vextracti64x4	\$1,$D0hi,%y#$H0
37678c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
37688c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
37698c2ecf20Sopenharmony_ci
37708c2ecf20Sopenharmony_ci	vextracti64x4	\$1,$D1lo,%y#$T1
37718c2ecf20Sopenharmony_ci	vextracti64x4	\$1,$D1hi,%y#$H1
37728c2ecf20Sopenharmony_ci	vextracti64x4	\$1,$D2lo,%y#$T2
37738c2ecf20Sopenharmony_ci	vextracti64x4	\$1,$D2hi,%y#$H2
37748c2ecf20Sopenharmony_ci___
37758c2ecf20Sopenharmony_ci######## switch back to %ymm
37768c2ecf20Sopenharmony_cimap(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
37778c2ecf20Sopenharmony_cimap(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
37788c2ecf20Sopenharmony_cimap(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
37798c2ecf20Sopenharmony_ci
37808c2ecf20Sopenharmony_ci$code.=<<___;
37818c2ecf20Sopenharmony_ci	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
37828c2ecf20Sopenharmony_ci	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
37838c2ecf20Sopenharmony_ci	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
37848c2ecf20Sopenharmony_ci	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
37858c2ecf20Sopenharmony_ci	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
37868c2ecf20Sopenharmony_ci	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
37878c2ecf20Sopenharmony_ci
37888c2ecf20Sopenharmony_ci	################################################################
37898c2ecf20Sopenharmony_ci	# partial reduction
37908c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
37918c2ecf20Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
37928c2ecf20Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
37938c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
37948c2ecf20Sopenharmony_ci
37958c2ecf20Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
37968c2ecf20Sopenharmony_ci
37978c2ecf20Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
37988c2ecf20Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
37998c2ecf20Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
38008c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
38018c2ecf20Sopenharmony_ci
38028c2ecf20Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
38038c2ecf20Sopenharmony_ci
38048c2ecf20Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
38058c2ecf20Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
38068c2ecf20Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
38078c2ecf20Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
38088c2ecf20Sopenharmony_ci
38098c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
38108c2ecf20Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
38118c2ecf20Sopenharmony_ci
38128c2ecf20Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
38138c2ecf20Sopenharmony_ci
38148c2ecf20Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
38158c2ecf20Sopenharmony_ci	vpandq		$mask44,$H0,$H0
38168c2ecf20Sopenharmony_ci
38178c2ecf20Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
38188c2ecf20Sopenharmony_ci
38198c2ecf20Sopenharmony_ci	################################################################
38208c2ecf20Sopenharmony_ci
38218c2ecf20Sopenharmony_ci	vmovq		%x#$H0,0($ctx)
38228c2ecf20Sopenharmony_ci	vmovq		%x#$H1,8($ctx)
38238c2ecf20Sopenharmony_ci	vmovq		%x#$H2,16($ctx)
38248c2ecf20Sopenharmony_ci	vzeroall
38258c2ecf20Sopenharmony_ci
38268c2ecf20Sopenharmony_ci.Lno_data_vpmadd52_8x:
38278c2ecf20Sopenharmony_ci	RET
38288c2ecf20Sopenharmony_ci.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
38298c2ecf20Sopenharmony_ci___
38308c2ecf20Sopenharmony_ci}
# poly1305_emit_base2_44: final tag computation for the base 2^44 code path.
# The generated routine is declared with three arguments and addresses them
# through $ctx, $mac and $nonce (register assignment happens earlier in this
# file).  It
#   1. loads the three hash limbs stored at 0/8/16($ctx) and repacks them
#      into a single value held in %r8:%r9:%r10 -- limb 1 is merged in at
#      bit 44 and limb 2 at bit 88, hence the 44/20 and 24/40 shift pairs;
#   2. adds 5 to that value and, if the sum reaches bit 130 (%r10 >> 2 is
#      non-zero), keeps the incremented low 128 bits instead of the
#      original ones;
#   3. adds the 128-bit value read from $nonce and stores the 16-byte
#      result at $mac.
38318c2ecf20Sopenharmony_ci$code.=<<___;
38328c2ecf20Sopenharmony_ci.type	poly1305_emit_base2_44,\@function,3
38338c2ecf20Sopenharmony_ci.align	32
38348c2ecf20Sopenharmony_cipoly1305_emit_base2_44:
38358c2ecf20Sopenharmony_ci	mov	0($ctx),%r8	# load hash value
38368c2ecf20Sopenharmony_ci	mov	8($ctx),%r9
38378c2ecf20Sopenharmony_ci	mov	16($ctx),%r10
38388c2ecf20Sopenharmony_ci
38398c2ecf20Sopenharmony_ci	mov	%r9,%rax
38408c2ecf20Sopenharmony_ci	shr	\$20,%r9
38418c2ecf20Sopenharmony_ci	shl	\$44,%rax
38428c2ecf20Sopenharmony_ci	mov	%r10,%rcx
38438c2ecf20Sopenharmony_ci	shr	\$40,%r10
38448c2ecf20Sopenharmony_ci	shl	\$24,%rcx
38458c2ecf20Sopenharmony_ci
38468c2ecf20Sopenharmony_ci	add	%rax,%r8
38478c2ecf20Sopenharmony_ci	adc	%rcx,%r9
38488c2ecf20Sopenharmony_ci	adc	\$0,%r10
38498c2ecf20Sopenharmony_ci
38508c2ecf20Sopenharmony_ci	mov	%r8,%rax
38518c2ecf20Sopenharmony_ci	add	\$5,%r8		# compare to modulus
38528c2ecf20Sopenharmony_ci	mov	%r9,%rcx
38538c2ecf20Sopenharmony_ci	adc	\$0,%r9
38548c2ecf20Sopenharmony_ci	adc	\$0,%r10
38558c2ecf20Sopenharmony_ci	shr	\$2,%r10	# did 130-bit value overflow?
38568c2ecf20Sopenharmony_ci	cmovnz	%r8,%rax
38578c2ecf20Sopenharmony_ci	cmovnz	%r9,%rcx
38588c2ecf20Sopenharmony_ci
38598c2ecf20Sopenharmony_ci	add	0($nonce),%rax	# accumulate nonce
38608c2ecf20Sopenharmony_ci	adc	8($nonce),%rcx
38618c2ecf20Sopenharmony_ci	mov	%rax,0($mac)	# write result
38628c2ecf20Sopenharmony_ci	mov	%rcx,8($mac)
38638c2ecf20Sopenharmony_ci
38648c2ecf20Sopenharmony_ci	RET
38658c2ecf20Sopenharmony_ci.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
38668c2ecf20Sopenharmony_ci___
38678c2ecf20Sopenharmony_ci}	}	}
38688c2ecf20Sopenharmony_ci}
38698c2ecf20Sopenharmony_ci
# ChaCha20-Poly1305 helper routines; generated only when $kernel is false.
#
# Both routines take (out, inp, otp, len) in the registers selected below
# for the Win64 or the Unix calling convention, and return the advanced otp
# pointer in %rax.
#
#   xor128_encrypt_n_pad: out[i] = inp[i] ^ otp[i], and the same result
#       byte is written back over otp[i].
#   xor128_decrypt_n_pad: out[i] = inp[i] ^ otp[i], and the *input* byte
#       inp[i] is written back over otp[i].
#
# Whole 16-byte blocks go through the %xmm loop; the remaining len % 16
# bytes are handled one at a time, after which the otp buffer is
# zero-filled up to the next 16-byte boundary.  otp is accessed with movdqa,
# so it has to be 16-byte aligned; inp and out are accessed with movdqu.
#
# NOTE(review): with len == 0 the initial "jz .Ltail_*" is taken with %r10
# equal to 0, so the byte loop's dec/jnz pair would wrap around instead of
# terminating -- presumably callers never pass a zero length; confirm
# against the callers.
38708c2ecf20Sopenharmony_ciif (!$kernel)
38718c2ecf20Sopenharmony_ci{	# chacha20-poly1305 helpers
38728c2ecf20Sopenharmony_cimy ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
38738c2ecf20Sopenharmony_ci                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
38748c2ecf20Sopenharmony_ci$code.=<<___;
38758c2ecf20Sopenharmony_ci.globl	xor128_encrypt_n_pad
38768c2ecf20Sopenharmony_ci.type	xor128_encrypt_n_pad,\@abi-omnipotent
38778c2ecf20Sopenharmony_ci.align	16
38788c2ecf20Sopenharmony_cixor128_encrypt_n_pad:
38798c2ecf20Sopenharmony_ci	sub	$otp,$inp
38808c2ecf20Sopenharmony_ci	sub	$otp,$out
38818c2ecf20Sopenharmony_ci	mov	$len,%r10		# put len aside
38828c2ecf20Sopenharmony_ci	shr	\$4,$len		# len / 16
38838c2ecf20Sopenharmony_ci	jz	.Ltail_enc
38848c2ecf20Sopenharmony_ci	nop
38858c2ecf20Sopenharmony_ci.Loop_enc_xmm:
38868c2ecf20Sopenharmony_ci	movdqu	($inp,$otp),%xmm0
38878c2ecf20Sopenharmony_ci	pxor	($otp),%xmm0
38888c2ecf20Sopenharmony_ci	movdqu	%xmm0,($out,$otp)
38898c2ecf20Sopenharmony_ci	movdqa	%xmm0,($otp)
38908c2ecf20Sopenharmony_ci	lea	16($otp),$otp
38918c2ecf20Sopenharmony_ci	dec	$len
38928c2ecf20Sopenharmony_ci	jnz	.Loop_enc_xmm
38938c2ecf20Sopenharmony_ci
38948c2ecf20Sopenharmony_ci	and	\$15,%r10		# len % 16
38958c2ecf20Sopenharmony_ci	jz	.Ldone_enc
38968c2ecf20Sopenharmony_ci
38978c2ecf20Sopenharmony_ci.Ltail_enc:
38988c2ecf20Sopenharmony_ci	mov	\$16,$len
38998c2ecf20Sopenharmony_ci	sub	%r10,$len
39008c2ecf20Sopenharmony_ci	xor	%eax,%eax
39018c2ecf20Sopenharmony_ci.Loop_enc_byte:
39028c2ecf20Sopenharmony_ci	mov	($inp,$otp),%al
39038c2ecf20Sopenharmony_ci	xor	($otp),%al
39048c2ecf20Sopenharmony_ci	mov	%al,($out,$otp)
39058c2ecf20Sopenharmony_ci	mov	%al,($otp)
39068c2ecf20Sopenharmony_ci	lea	1($otp),$otp
39078c2ecf20Sopenharmony_ci	dec	%r10
39088c2ecf20Sopenharmony_ci	jnz	.Loop_enc_byte
39098c2ecf20Sopenharmony_ci
39108c2ecf20Sopenharmony_ci	xor	%eax,%eax
39118c2ecf20Sopenharmony_ci.Loop_enc_pad:
39128c2ecf20Sopenharmony_ci	mov	%al,($otp)
39138c2ecf20Sopenharmony_ci	lea	1($otp),$otp
39148c2ecf20Sopenharmony_ci	dec	$len
39158c2ecf20Sopenharmony_ci	jnz	.Loop_enc_pad
39168c2ecf20Sopenharmony_ci
39178c2ecf20Sopenharmony_ci.Ldone_enc:
39188c2ecf20Sopenharmony_ci	mov	$otp,%rax
39198c2ecf20Sopenharmony_ci	RET
39208c2ecf20Sopenharmony_ci.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
39218c2ecf20Sopenharmony_ci
39228c2ecf20Sopenharmony_ci.globl	xor128_decrypt_n_pad
39238c2ecf20Sopenharmony_ci.type	xor128_decrypt_n_pad,\@abi-omnipotent
39248c2ecf20Sopenharmony_ci.align	16
39258c2ecf20Sopenharmony_cixor128_decrypt_n_pad:
39268c2ecf20Sopenharmony_ci	sub	$otp,$inp
39278c2ecf20Sopenharmony_ci	sub	$otp,$out
39288c2ecf20Sopenharmony_ci	mov	$len,%r10		# put len aside
39298c2ecf20Sopenharmony_ci	shr	\$4,$len		# len / 16
39308c2ecf20Sopenharmony_ci	jz	.Ltail_dec
39318c2ecf20Sopenharmony_ci	nop
39328c2ecf20Sopenharmony_ci.Loop_dec_xmm:
39338c2ecf20Sopenharmony_ci	movdqu	($inp,$otp),%xmm0
39348c2ecf20Sopenharmony_ci	movdqa	($otp),%xmm1
39358c2ecf20Sopenharmony_ci	pxor	%xmm0,%xmm1
39368c2ecf20Sopenharmony_ci	movdqu	%xmm1,($out,$otp)
39378c2ecf20Sopenharmony_ci	movdqa	%xmm0,($otp)
39388c2ecf20Sopenharmony_ci	lea	16($otp),$otp
39398c2ecf20Sopenharmony_ci	dec	$len
39408c2ecf20Sopenharmony_ci	jnz	.Loop_dec_xmm
39418c2ecf20Sopenharmony_ci
39428c2ecf20Sopenharmony_ci	pxor	%xmm1,%xmm1
39438c2ecf20Sopenharmony_ci	and	\$15,%r10		# len % 16
39448c2ecf20Sopenharmony_ci	jz	.Ldone_dec
39458c2ecf20Sopenharmony_ci
39468c2ecf20Sopenharmony_ci.Ltail_dec:
39478c2ecf20Sopenharmony_ci	mov	\$16,$len
39488c2ecf20Sopenharmony_ci	sub	%r10,$len
39498c2ecf20Sopenharmony_ci	xor	%eax,%eax
39508c2ecf20Sopenharmony_ci	xor	%r11d,%r11d
39518c2ecf20Sopenharmony_ci.Loop_dec_byte:
39528c2ecf20Sopenharmony_ci	mov	($inp,$otp),%r11b
39538c2ecf20Sopenharmony_ci	mov	($otp),%al
39548c2ecf20Sopenharmony_ci	xor	%r11b,%al
39558c2ecf20Sopenharmony_ci	mov	%al,($out,$otp)
39568c2ecf20Sopenharmony_ci	mov	%r11b,($otp)
39578c2ecf20Sopenharmony_ci	lea	1($otp),$otp
39588c2ecf20Sopenharmony_ci	dec	%r10
39598c2ecf20Sopenharmony_ci	jnz	.Loop_dec_byte
39608c2ecf20Sopenharmony_ci
39618c2ecf20Sopenharmony_ci	xor	%eax,%eax
39628c2ecf20Sopenharmony_ci.Loop_dec_pad:
39638c2ecf20Sopenharmony_ci	mov	%al,($otp)
39648c2ecf20Sopenharmony_ci	lea	1($otp),$otp
39658c2ecf20Sopenharmony_ci	dec	$len
39668c2ecf20Sopenharmony_ci	jnz	.Loop_dec_pad
39678c2ecf20Sopenharmony_ci
39688c2ecf20Sopenharmony_ci.Ldone_dec:
39698c2ecf20Sopenharmony_ci	mov	$otp,%rax
39708c2ecf20Sopenharmony_ci	RET
39718c2ecf20Sopenharmony_ci.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
39728c2ecf20Sopenharmony_ci___
39738c2ecf20Sopenharmony_ci}
39748c2ecf20Sopenharmony_ci
39758c2ecf20Sopenharmony_ci# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
39768c2ecf20Sopenharmony_ci#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support, generated only when $win64 is
# set: two language-specific handlers followed by the .pdata/.xdata tables
# that attach them to the poly1305 entry points.
#
# Both handlers receive (rec, frame, context, disp) in %rcx/%rdx/%r8/%r9 and
# read two image-relative labels from disp->HandlerData:
#   HandlerData[0]  prologue label
#   HandlerData[1]  epilogue label
# If context->Rip lies below [0], context->Rax is used as the frame base; at
# or above [1], context->Rsp is.  In between:
#   se_handler   reloads %rbx, %rbp and %r12-%r15 from the six quadwords
#                below context->Rsp+48 into CONTEXT and uses
#                context->Rsp+48 as the frame base;
#   avx_handler  copies 20 quadwords from 0x50(context->R11) to
#                &context.Xmm6 and uses context->R11+0xf8 as the frame base.
# The shared tail (.Lcommon_seh_tail, physically inside avx_handler) reloads
# Rdi/Rsi from 8/16(frame base), stores the frame base as context->Rsp,
# copies CONTEXT (154 quadwords) to disp->ContextRecord, calls
# RtlVirtualUnwind() and returns ExceptionContinueSearch.
39778c2ecf20Sopenharmony_ciif ($win64) {
39788c2ecf20Sopenharmony_ci$rec="%rcx";
39798c2ecf20Sopenharmony_ci$frame="%rdx";
39808c2ecf20Sopenharmony_ci$context="%r8";
39818c2ecf20Sopenharmony_ci$disp="%r9";
39828c2ecf20Sopenharmony_ci
39838c2ecf20Sopenharmony_ci$code.=<<___;
39848c2ecf20Sopenharmony_ci.extern	__imp_RtlVirtualUnwind
39858c2ecf20Sopenharmony_ci.type	se_handler,\@abi-omnipotent
39868c2ecf20Sopenharmony_ci.align	16
39878c2ecf20Sopenharmony_cise_handler:
39888c2ecf20Sopenharmony_ci	push	%rsi
39898c2ecf20Sopenharmony_ci	push	%rdi
39908c2ecf20Sopenharmony_ci	push	%rbx
39918c2ecf20Sopenharmony_ci	push	%rbp
39928c2ecf20Sopenharmony_ci	push	%r12
39938c2ecf20Sopenharmony_ci	push	%r13
39948c2ecf20Sopenharmony_ci	push	%r14
39958c2ecf20Sopenharmony_ci	push	%r15
39968c2ecf20Sopenharmony_ci	pushfq
39978c2ecf20Sopenharmony_ci	sub	\$64,%rsp
39988c2ecf20Sopenharmony_ci
39998c2ecf20Sopenharmony_ci	mov	120($context),%rax	# pull context->Rax
40008c2ecf20Sopenharmony_ci	mov	248($context),%rbx	# pull context->Rip
40018c2ecf20Sopenharmony_ci
40028c2ecf20Sopenharmony_ci	mov	8($disp),%rsi		# disp->ImageBase
40038c2ecf20Sopenharmony_ci	mov	56($disp),%r11		# disp->HandlerData
40048c2ecf20Sopenharmony_ci
40058c2ecf20Sopenharmony_ci	mov	0(%r11),%r10d		# HandlerData[0]
40068c2ecf20Sopenharmony_ci	lea	(%rsi,%r10),%r10	# prologue label
40078c2ecf20Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip<.Lprologue
40088c2ecf20Sopenharmony_ci	jb	.Lcommon_seh_tail
40098c2ecf20Sopenharmony_ci
40108c2ecf20Sopenharmony_ci	mov	152($context),%rax	# pull context->Rsp
40118c2ecf20Sopenharmony_ci
40128c2ecf20Sopenharmony_ci	mov	4(%r11),%r10d		# HandlerData[1]
40138c2ecf20Sopenharmony_ci	lea	(%rsi,%r10),%r10	# epilogue label
40148c2ecf20Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
40158c2ecf20Sopenharmony_ci	jae	.Lcommon_seh_tail
40168c2ecf20Sopenharmony_ci
40178c2ecf20Sopenharmony_ci	lea	48(%rax),%rax
40188c2ecf20Sopenharmony_ci
40198c2ecf20Sopenharmony_ci	mov	-8(%rax),%rbx
40208c2ecf20Sopenharmony_ci	mov	-16(%rax),%rbp
40218c2ecf20Sopenharmony_ci	mov	-24(%rax),%r12
40228c2ecf20Sopenharmony_ci	mov	-32(%rax),%r13
40238c2ecf20Sopenharmony_ci	mov	-40(%rax),%r14
40248c2ecf20Sopenharmony_ci	mov	-48(%rax),%r15
40258c2ecf20Sopenharmony_ci	mov	%rbx,144($context)	# restore context->Rbx
40268c2ecf20Sopenharmony_ci	mov	%rbp,160($context)	# restore context->Rbp
40278c2ecf20Sopenharmony_ci	mov	%r12,216($context)	# restore context->R12
40288c2ecf20Sopenharmony_ci	mov	%r13,224($context)	# restore context->R13
40298c2ecf20Sopenharmony_ci	mov	%r14,232($context)	# restore context->R14
40308c2ecf20Sopenharmony_ci	mov	%r15,240($context)	# restore context->R15
40318c2ecf20Sopenharmony_ci
40328c2ecf20Sopenharmony_ci	jmp	.Lcommon_seh_tail
40338c2ecf20Sopenharmony_ci.size	se_handler,.-se_handler
40348c2ecf20Sopenharmony_ci
40358c2ecf20Sopenharmony_ci.type	avx_handler,\@abi-omnipotent
40368c2ecf20Sopenharmony_ci.align	16
40378c2ecf20Sopenharmony_ciavx_handler:
40388c2ecf20Sopenharmony_ci	push	%rsi
40398c2ecf20Sopenharmony_ci	push	%rdi
40408c2ecf20Sopenharmony_ci	push	%rbx
40418c2ecf20Sopenharmony_ci	push	%rbp
40428c2ecf20Sopenharmony_ci	push	%r12
40438c2ecf20Sopenharmony_ci	push	%r13
40448c2ecf20Sopenharmony_ci	push	%r14
40458c2ecf20Sopenharmony_ci	push	%r15
40468c2ecf20Sopenharmony_ci	pushfq
40478c2ecf20Sopenharmony_ci	sub	\$64,%rsp
40488c2ecf20Sopenharmony_ci
40498c2ecf20Sopenharmony_ci	mov	120($context),%rax	# pull context->Rax
40508c2ecf20Sopenharmony_ci	mov	248($context),%rbx	# pull context->Rip
40518c2ecf20Sopenharmony_ci
40528c2ecf20Sopenharmony_ci	mov	8($disp),%rsi		# disp->ImageBase
40538c2ecf20Sopenharmony_ci	mov	56($disp),%r11		# disp->HandlerData
40548c2ecf20Sopenharmony_ci
40558c2ecf20Sopenharmony_ci	mov	0(%r11),%r10d		# HandlerData[0]
40568c2ecf20Sopenharmony_ci	lea	(%rsi,%r10),%r10	# prologue label
40578c2ecf20Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip<prologue label
40588c2ecf20Sopenharmony_ci	jb	.Lcommon_seh_tail
40598c2ecf20Sopenharmony_ci
40608c2ecf20Sopenharmony_ci	mov	152($context),%rax	# pull context->Rsp
40618c2ecf20Sopenharmony_ci
40628c2ecf20Sopenharmony_ci	mov	4(%r11),%r10d		# HandlerData[1]
40638c2ecf20Sopenharmony_ci	lea	(%rsi,%r10),%r10	# epilogue label
40648c2ecf20Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip>=epilogue label
40658c2ecf20Sopenharmony_ci	jae	.Lcommon_seh_tail
40668c2ecf20Sopenharmony_ci
40678c2ecf20Sopenharmony_ci	mov	208($context),%rax	# pull context->R11
40688c2ecf20Sopenharmony_ci
40698c2ecf20Sopenharmony_ci	lea	0x50(%rax),%rsi
40708c2ecf20Sopenharmony_ci	lea	0xf8(%rax),%rax
40718c2ecf20Sopenharmony_ci	lea	512($context),%rdi	# &context.Xmm6
40728c2ecf20Sopenharmony_ci	mov	\$20,%ecx
40738c2ecf20Sopenharmony_ci	.long	0xa548f3fc		# cld; rep movsq
40748c2ecf20Sopenharmony_ci
40758c2ecf20Sopenharmony_ci.Lcommon_seh_tail:
40768c2ecf20Sopenharmony_ci	mov	8(%rax),%rdi
40778c2ecf20Sopenharmony_ci	mov	16(%rax),%rsi
40788c2ecf20Sopenharmony_ci	mov	%rax,152($context)	# restore context->Rsp
40798c2ecf20Sopenharmony_ci	mov	%rsi,168($context)	# restore context->Rsi
40808c2ecf20Sopenharmony_ci	mov	%rdi,176($context)	# restore context->Rdi
40818c2ecf20Sopenharmony_ci
40828c2ecf20Sopenharmony_ci	mov	40($disp),%rdi		# disp->ContextRecord
40838c2ecf20Sopenharmony_ci	mov	$context,%rsi		# context
40848c2ecf20Sopenharmony_ci	mov	\$154,%ecx		# sizeof(CONTEXT)
40858c2ecf20Sopenharmony_ci	.long	0xa548f3fc		# cld; rep movsq
40868c2ecf20Sopenharmony_ci
40878c2ecf20Sopenharmony_ci	mov	$disp,%rsi
40888c2ecf20Sopenharmony_ci	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
40898c2ecf20Sopenharmony_ci	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
40908c2ecf20Sopenharmony_ci	mov	0(%rsi),%r8		# arg3, disp->ControlPc
40918c2ecf20Sopenharmony_ci	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
40928c2ecf20Sopenharmony_ci	mov	40(%rsi),%r10		# disp->ContextRecord
40938c2ecf20Sopenharmony_ci	lea	56(%rsi),%r11		# &disp->HandlerData
40948c2ecf20Sopenharmony_ci	lea	24(%rsi),%r12		# &disp->EstablisherFrame
40958c2ecf20Sopenharmony_ci	mov	%r10,32(%rsp)		# arg5
40968c2ecf20Sopenharmony_ci	mov	%r11,40(%rsp)		# arg6
40978c2ecf20Sopenharmony_ci	mov	%r12,48(%rsp)		# arg7
40988c2ecf20Sopenharmony_ci	mov	%rcx,56(%rsp)		# arg8, (NULL)
40998c2ecf20Sopenharmony_ci	call	*__imp_RtlVirtualUnwind(%rip)
41008c2ecf20Sopenharmony_ci
41018c2ecf20Sopenharmony_ci	mov	\$1,%eax		# ExceptionContinueSearch
41028c2ecf20Sopenharmony_ci	add	\$64,%rsp
41038c2ecf20Sopenharmony_ci	popfq
41048c2ecf20Sopenharmony_ci	pop	%r15
41058c2ecf20Sopenharmony_ci	pop	%r14
41068c2ecf20Sopenharmony_ci	pop	%r13
41078c2ecf20Sopenharmony_ci	pop	%r12
41088c2ecf20Sopenharmony_ci	pop	%rbp
41098c2ecf20Sopenharmony_ci	pop	%rbx
41108c2ecf20Sopenharmony_ci	pop	%rdi
41118c2ecf20Sopenharmony_ci	pop	%rsi
41128c2ecf20Sopenharmony_ci	RET
41138c2ecf20Sopenharmony_ci.size	avx_handler,.-avx_handler
41148c2ecf20Sopenharmony_ci
41158c2ecf20Sopenharmony_ci.section	.pdata
41168c2ecf20Sopenharmony_ci.align	4
41178c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_init_x86_64
41188c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_init_x86_64
41198c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_init_x86_64
41208c2ecf20Sopenharmony_ci
41218c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_blocks_x86_64
41228c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_blocks_x86_64
41238c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_x86_64
41248c2ecf20Sopenharmony_ci
41258c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_emit_x86_64
41268c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_emit_x86_64
41278c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_emit_x86_64
41288c2ecf20Sopenharmony_ci___
# .pdata entries for the AVX code path.  poly1305_blocks_avx is covered by
# three consecutive ranges, split at .Lbase2_64_avx and .Leven_avx, each
# with its own unwind-info record.
41298c2ecf20Sopenharmony_ci$code.=<<___ if ($avx);
41308c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_blocks_avx
41318c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx
41328c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx_1
41338c2ecf20Sopenharmony_ci
41348c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx
41358c2ecf20Sopenharmony_ci	.rva	.Leven_avx
41368c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx_2
41378c2ecf20Sopenharmony_ci
41388c2ecf20Sopenharmony_ci	.rva	.Leven_avx
41398c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_blocks_avx
41408c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx_3
41418c2ecf20Sopenharmony_ci
41428c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_emit_avx
41438c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_emit_avx
41448c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_emit_avx
41458c2ecf20Sopenharmony_ci___
# .pdata entries for the AVX2 code path, split the same way at
# .Lbase2_64_avx2 and .Leven_avx2.
41468c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>1);
41478c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_blocks_avx2
41488c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx2
41498c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx2_1
41508c2ecf20Sopenharmony_ci
41518c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx2
41528c2ecf20Sopenharmony_ci	.rva	.Leven_avx2
41538c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx2_2
41548c2ecf20Sopenharmony_ci
41558c2ecf20Sopenharmony_ci	.rva	.Leven_avx2
41568c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_blocks_avx2
41578c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx2_3
41588c2ecf20Sopenharmony_ci___
# .pdata entry for the AVX-512 code path (a single range).
41598c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>2);
41608c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_blocks_avx512
41618c2ecf20Sopenharmony_ci	.rva	.LSEH_end_poly1305_blocks_avx512
41628c2ecf20Sopenharmony_ci	.rva	.LSEH_info_poly1305_blocks_avx512
41638c2ecf20Sopenharmony_ci___
# .xdata: one record per .pdata entry -- the header bytes 9,0,0,0, the RVA
# of the handler to run, then the two RVAs the handler reads back as
# HandlerData[0] and HandlerData[1].  The init and emit records pass the
# function's begin label for both, so their handler always takes the
# "at or above the epilogue label" path.
# NOTE(review): 9,0,0,0 presumably encodes UNWIND_INFO version 1 with
# UNW_FLAG_EHANDLER and no unwind codes -- confirm against the Windows x64
# unwind-data documentation.
41648c2ecf20Sopenharmony_ci$code.=<<___;
41658c2ecf20Sopenharmony_ci.section	.xdata
41668c2ecf20Sopenharmony_ci.align	8
41678c2ecf20Sopenharmony_ci.LSEH_info_poly1305_init_x86_64:
41688c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41698c2ecf20Sopenharmony_ci	.rva	se_handler
41708c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
41718c2ecf20Sopenharmony_ci
41728c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_x86_64:
41738c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41748c2ecf20Sopenharmony_ci	.rva	se_handler
41758c2ecf20Sopenharmony_ci	.rva	.Lblocks_body,.Lblocks_epilogue
41768c2ecf20Sopenharmony_ci
41778c2ecf20Sopenharmony_ci.LSEH_info_poly1305_emit_x86_64:
41788c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41798c2ecf20Sopenharmony_ci	.rva	se_handler
41808c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
41818c2ecf20Sopenharmony_ci___
41828c2ecf20Sopenharmony_ci$code.=<<___ if ($avx);
41838c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_1:
41848c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41858c2ecf20Sopenharmony_ci	.rva	se_handler
41868c2ecf20Sopenharmony_ci	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
41878c2ecf20Sopenharmony_ci
41888c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_2:
41898c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41908c2ecf20Sopenharmony_ci	.rva	se_handler
41918c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
41928c2ecf20Sopenharmony_ci
41938c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx_3:
41948c2ecf20Sopenharmony_ci	.byte	9,0,0,0
41958c2ecf20Sopenharmony_ci	.rva	avx_handler
41968c2ecf20Sopenharmony_ci	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
41978c2ecf20Sopenharmony_ci
41988c2ecf20Sopenharmony_ci.LSEH_info_poly1305_emit_avx:
41998c2ecf20Sopenharmony_ci	.byte	9,0,0,0
42008c2ecf20Sopenharmony_ci	.rva	se_handler
42018c2ecf20Sopenharmony_ci	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
42028c2ecf20Sopenharmony_ci___
42038c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>1);
42048c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_1:
42058c2ecf20Sopenharmony_ci	.byte	9,0,0,0
42068c2ecf20Sopenharmony_ci	.rva	se_handler
42078c2ecf20Sopenharmony_ci	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
42088c2ecf20Sopenharmony_ci
42098c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_2:
42108c2ecf20Sopenharmony_ci	.byte	9,0,0,0
42118c2ecf20Sopenharmony_ci	.rva	se_handler
42128c2ecf20Sopenharmony_ci	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
42138c2ecf20Sopenharmony_ci
42148c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx2_3:
42158c2ecf20Sopenharmony_ci	.byte	9,0,0,0
42168c2ecf20Sopenharmony_ci	.rva	avx_handler
42178c2ecf20Sopenharmony_ci	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
42188c2ecf20Sopenharmony_ci___
42198c2ecf20Sopenharmony_ci$code.=<<___ if ($avx>2);
42208c2ecf20Sopenharmony_ci.LSEH_info_poly1305_blocks_avx512:
42218c2ecf20Sopenharmony_ci	.byte	9,0,0,0
42228c2ecf20Sopenharmony_ci	.rva	avx_handler
42238c2ecf20Sopenharmony_ci	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
42248c2ecf20Sopenharmony_ci___
42258c2ecf20Sopenharmony_ci}
42268c2ecf20Sopenharmony_ci
# Copy this script's own leading comment block (licence and history notes)
# to the top of the generated file, turning each leading '#' into '//'.
# The shebang line is dropped, blank lines are passed through, and copying
# stops at the first line that is neither a comment nor blank.
#
# The script is opened with three-argument open() on a lexical handle so
# that characters in the script path are never interpreted as an open mode.
# A failure to open is reported but not fatal: the header is then simply
# omitted and assembly generation carries on.
42278c2ecf20Sopenharmony_ciopen(my $self,'<',$0) or warn "can't open $0: $!";
42288c2ecf20Sopenharmony_ciwhile(<$self>) {
42298c2ecf20Sopenharmony_ci	next if (/^#!/);
42308c2ecf20Sopenharmony_ci	last if (!s/^#/\/\// and !/^$/);
42318c2ecf20Sopenharmony_ci	print;
42328c2ecf20Sopenharmony_ci}
42338c2ecf20Sopenharmony_ciclose $self;
42348c2ecf20Sopenharmony_ci
# Post-process the accumulated assembly template line by line and print it:
#   - `...` spans are replaced by the result of eval()ing their contents;
#   - register-size suffixes are resolved: %rax#d -> %eax, %r10#d -> %r10d,
#     and a %x#/%y#/%z# prefix in front of a %y../%z.. vector register
#     selects the xmm/ymm/zmm view of that register;
#   - when $kernel is set, the trailing argument count is stripped from
#     .type directives, \@abi-omnipotent becomes \@function, and .cfi
#     directives are dropped.
# Captures are referenced as $1 in the replacement text (the sed-style \1
# form is only tolerated there), and STDOUT is closed explicitly with the
# result checked, because buffered write errors surface only at close time
# and would otherwise leave a silently truncated output file.
42358c2ecf20Sopenharmony_ciforeach (split(/\n/,$code)) {
42368c2ecf20Sopenharmony_ci	s/\`([^\`]*)\`/eval($1)/ge;
42378c2ecf20Sopenharmony_ci	s/%r([a-z]+)#d/%e$1/g;
42388c2ecf20Sopenharmony_ci	s/%r([0-9]+)#d/%r$1d/g;
42398c2ecf20Sopenharmony_ci	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
42408c2ecf20Sopenharmony_ci
42418c2ecf20Sopenharmony_ci	if ($kernel) {
42428c2ecf20Sopenharmony_ci		s/(^\.type.*),[0-9]+$/$1/;
42438c2ecf20Sopenharmony_ci		s/(^\.type.*),\@abi-omnipotent$/$1,\@function/;
42448c2ecf20Sopenharmony_ci		next if /^\.cfi.*/;
42458c2ecf20Sopenharmony_ci	}
42468c2ecf20Sopenharmony_ci
42478c2ecf20Sopenharmony_ci	print $_,"\n";
42488c2ecf20Sopenharmony_ci}
42498c2ecf20Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
4250