162306a36Sopenharmony_ci#!/usr/bin/env perl
262306a36Sopenharmony_ci# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
362306a36Sopenharmony_ci#
462306a36Sopenharmony_ci# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
562306a36Sopenharmony_ci# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
662306a36Sopenharmony_ci# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
762306a36Sopenharmony_ci#
862306a36Sopenharmony_ci# This code is taken from the OpenSSL project but the author, Andy Polyakov,
962306a36Sopenharmony_ci# has relicensed it under the licenses specified in the SPDX header above.
1062306a36Sopenharmony_ci# The original headers, including the original license headers, are
1162306a36Sopenharmony_ci# included below for completeness.
1262306a36Sopenharmony_ci#
1362306a36Sopenharmony_ci# ====================================================================
1462306a36Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1562306a36Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
1662306a36Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
1762306a36Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
1862306a36Sopenharmony_ci# ====================================================================
1962306a36Sopenharmony_ci#
2062306a36Sopenharmony_ci# This module implements Poly1305 hash for x86_64.
2162306a36Sopenharmony_ci#
2262306a36Sopenharmony_ci# March 2015
2362306a36Sopenharmony_ci#
2462306a36Sopenharmony_ci# Initial release.
2562306a36Sopenharmony_ci#
2662306a36Sopenharmony_ci# December 2016
2762306a36Sopenharmony_ci#
2862306a36Sopenharmony_ci# Add AVX512F+VL+BW code path.
2962306a36Sopenharmony_ci#
3062306a36Sopenharmony_ci# November 2017
3162306a36Sopenharmony_ci#
3262306a36Sopenharmony_ci# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
3362306a36Sopenharmony_ci# executed even on Knights Landing. Trigger for modification was
3462306a36Sopenharmony_ci# observation that AVX512 code paths can negatively affect overall
3562306a36Sopenharmony_ci# Skylake-X system performance. Since we are likely to suppress
3662306a36Sopenharmony_ci# AVX512F capability flag [at least on Skylake-X], conversion serves
3762306a36Sopenharmony_ci# as kind of "investment protection". Note that next *lake processor,
3862306a36Sopenharmony_ci# Cannonlake, has AVX512IFMA code path to execute...
3962306a36Sopenharmony_ci#
4062306a36Sopenharmony_ci# Numbers are cycles per processed byte with poly1305_blocks alone,
4162306a36Sopenharmony_ci# measured with rdtsc at fixed clock frequency.
4262306a36Sopenharmony_ci#
4362306a36Sopenharmony_ci#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
4462306a36Sopenharmony_ci# P4		4.46/+120%	-
4562306a36Sopenharmony_ci# Core 2	2.41/+90%	-
4662306a36Sopenharmony_ci# Westmere	1.88/+120%	-
4762306a36Sopenharmony_ci# Sandy Bridge	1.39/+140%	1.10
4862306a36Sopenharmony_ci# Haswell	1.14/+175%	1.11		0.65
4962306a36Sopenharmony_ci# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
5062306a36Sopenharmony_ci# Silvermont	2.83/+95%	-
5162306a36Sopenharmony_ci# Knights L	3.60/?		1.65		1.10	0.41(***)
5262306a36Sopenharmony_ci# Goldmont	1.70/+180%	-
5362306a36Sopenharmony_ci# VIA Nano	1.82/+150%	-
5462306a36Sopenharmony_ci# Sledgehammer	1.38/+160%	-
5562306a36Sopenharmony_ci# Bulldozer	2.30/+130%	0.97
5662306a36Sopenharmony_ci# Ryzen		1.15/+200%	1.08		1.18
5762306a36Sopenharmony_ci#
5862306a36Sopenharmony_ci# (*)	improvement coefficients relative to clang are more modest and
5962306a36Sopenharmony_ci#	are ~50% on most processors, in both cases we are comparing to
6062306a36Sopenharmony_ci#	__int128 code;
# (**)	An SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors (by 50-30%, the gain shrinking the newer the
#	processor is) and slower on contemporary ones, for example almost
#	2x slower on Atom; as the former are naturally disappearing, SSE2
#	is deemed unnecessary;
6662306a36Sopenharmony_ci# (***)	strangely enough performance seems to vary from core to core,
6762306a36Sopenharmony_ci#	listed result is best case;
6862306a36Sopenharmony_ci
# Command-line handling and assembler capability probing.
#
# Arguments: [flavour] [output]. A first argument that contains a dot is
# treated as the output file name and the flavour is left undefined.
# When neither argument is given the script runs in "kernel" mode: no
# perlasm translator is spawned and $avx is fixed at 4.
#
# $avx is a feature level consulted by the generator further down:
# any non-zero value enables the constant table and the AVX code,
# $avx>1 enables the AVX2 dispatch and $avx>3 the base 2^44 dispatch.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Look for the translator next to this script, then in ../../perlasm.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything printed to STDOUT from here on is piped through the
	# translator. Fail loudly if the pipe cannot be created instead of
	# silently printing into an unopened handle.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# GNU assembler: one level each for 2.19, 2.22 and 2.25.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	# NASM (Windows flavours only): 2.09, 2.10 and 2.12 add one level
	# each, and 2.11.8 or later within the 2.11 series adds one more.
	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	# MASM (ml64): major versions 10 and 11 add one level each.
	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	# clang/LLVM: 3.0 gives level 1, anything newer level 2.
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
10762306a36Sopenharmony_ci
# Append the opening of an assembly function to $code.
#
#   $name  - symbol name of the function
#   $align - alignment for the .align directive (ignored in kernel mode)
#   $nargs - argument count recorded in the .type directive (ignored in
#            kernel mode)
#
# In kernel mode the function is opened with SYM_FUNC_START() and an
# additional local label ".L<name>" is emitted; otherwise a global symbol
# with .type/.align directives is produced.
#
# The sub deliberately has no prototype: it takes three arguments, and an
# empty "()" prototype would only work when called with the "&" sigil.
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}
12062306a36Sopenharmony_ci
# Append the closing of an assembly function named $name to $code:
# SYM_FUNC_END() in kernel mode, a .size directive otherwise.
#
# The sub deliberately has no prototype: it takes one argument, and an
# empty "()" prototype would only work when called with the "&" sigil.
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size   $name,.-$name\n";
	}
}
12962306a36Sopenharmony_ci
# File preamble and read-only data.
#
# Kernel builds pull in <linux/linkage.h> and place the constants in
# .rodata. The constant table itself is emitted only when some AVX level
# is enabled. Non-kernel builds additionally get the identification
# string. Backquoted expressions are left verbatim in $code here.
if ($kernel) {
	$code .= <<___;
#include <linux/linkage.h>
___
}

if ($avx) {
	if ($kernel) {
		$code .= <<___;
.section .rodata
___
	}
	$code .= <<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
if (!$kernel) {
	$code .= <<___;
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
}
17662306a36Sopenharmony_ci
# Register names used by the scalar (base 2^64) code.
# Function arguments as they arrive:
my ($ctx, $inp, $len, $padbit) = qw(%rdi %rsi %rdx %rcx);
my ($mac, $nonce)              = ($inp, $len);	# *_emit arguments
# Multiplication scratch; note that $d3 shares %rdi with $ctx.
my ($d1, $d2, $d3)             = qw(%r8 %r9 %rdi);
# Key limbs r0, r1 and the precomputed s1.
my ($r0, $r1, $s1)             = qw(%r11 %r12 %r13);
# Hash accumulator limbs.
my ($h0, $h1, $h2)             = qw(%r14 %rbx %r10);
18162306a36Sopenharmony_ci
sub poly1305_iteration {
# Appends one multiply-and-reduce step to $code.
# expects:	%rax holding a copy of $r1, the hash in $h0-$h2 and the
#		key in $r0-$r1 (with $s1 prepared by the caller)
# produces:	$h0-$h2 multiplied by $r0-$r1, partially reduced
# $d1-$d3 and %rax/%rdx are used as scratch; $d3 is %rdi, so callers
# that still need $ctx have to preserve it themselves.
	my $mul_reduce = <<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	 mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	 mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	 mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	 mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	 mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
	$code .= $mul_reduce;
}
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_ci########################################################################
# The layout of the opaque area is as follows.
22962306a36Sopenharmony_ci#
23062306a36Sopenharmony_ci#	unsigned __int64 h[3];		# current hash value base 2^64
23162306a36Sopenharmony_ci#	unsigned __int64 r[2];		# key value base 2^64
23262306a36Sopenharmony_ci
# poly1305_init_x86_64: zero the hash, and if a key pointer was passed,
# store the clamped key at 24($ctx)/32($ctx) and return 1 in %eax.
# With a NULL key pointer it jumps to .Lno_key with %eax still 0.
# Non-kernel builds also pick the blocks/emit implementations according
# to OPENSSL_ia32cap_P and store the two pointers through %rdx.
$code .= <<___;
.text
___
unless ($kernel) {
	$code .= <<___;
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
}
&declare_function("poly1305_init_x86_64", 32, 3);
$code .= <<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	test	$inp,$inp
	je	.Lno_key
___
unless ($kernel) {
	# Start from the scalar routines, then upgrade per capability bit.
	$code .= <<___;
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
	if ($avx) {
		$code .= <<___;
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
	}
	if ($avx > 1) {
		$code .= <<___;
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
	}
	if ($avx > 3) {
		$code .= <<___;
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
	}
}
$code .= <<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
unless ($kernel) {
	# Function pointers are 8 bytes wide, or 4 bytes for elf32 flavours.
	if ($flavour !~ /elf32/) {
		$code .= <<___;
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
	} else {
		$code .= <<___;
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
	}
}
$code .= <<___;
	mov	\$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_init_x86_64");
30262306a36Sopenharmony_ci
# poly1305_blocks_x86_64: scalar block function. $len is turned into a
# count of 16-byte blocks (returning at once when that count is zero);
# each loop pass adds one block plus $padbit to the hash and runs one
# poly1305_iteration step. $ctx is pushed last and reloaded from
# 0(%rsp) before the hash is stored, because the iteration overwrites it.
&declare_function("poly1305_blocks_x86_64", 32, 4);

# Entry check and callee-saved register spill.
$code .= <<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

___

# Load key and hash, derive s1.
$code .= <<___;
	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

___

# Loop head: absorb one input block.
$code .= <<___;
.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

&poly1305_iteration();

# Loop tail.
$code .= <<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

___

# Store the hash, restore registers and return.
$code .= <<___;
	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");
37962306a36Sopenharmony_ci
# poly1305_emit_x86_64: final reduction and tag output. Adds 5 to the
# hash and, if that carried past bit 130, keeps the sum instead of the
# original value; then adds the 128-bit nonce and writes 16 bytes to $mac.
&declare_function("poly1305_emit_x86_64", 32, 3);

# Select the reduced value.
$code .= <<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

___

# Add the nonce and store the tag.
$code .= <<___;
	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
40462306a36Sopenharmony_ciif ($avx) {
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci########################################################################
# The layout of the opaque area is as follows.
40862306a36Sopenharmony_ci#
40962306a36Sopenharmony_ci#	unsigned __int32 h[5];		# current hash value base 2^26
41062306a36Sopenharmony_ci#	unsigned __int32 is_base2_26;
41162306a36Sopenharmony_ci#	unsigned __int64 r[2];		# key value base 2^64
41262306a36Sopenharmony_ci#	unsigned __int64 pad;
41362306a36Sopenharmony_ci#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
41462306a36Sopenharmony_ci#
# where r^n are base 2^26 digits of powers of the multiplier key. There are
# 5 digits, but the last four are interleaved with their multiples of 5,
# for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
41862306a36Sopenharmony_ci
# Names for %xmm0 through %xmm15, assigned in order: five hash
# registers, five T registers, five D registers and $MASK.
my ($H0, $H1, $H2, $H3, $H4,
    $T0, $T1, $T2, $T3, $T4,
    $D0, $D1, $D2, $D3, $D4,
    $MASK) = map { "%xmm$_" } 0 .. 15;
42162306a36Sopenharmony_ci
# __poly1305_block: one multiply-and-reduce step as a callable routine.
# $ctx is saved around the step because the iteration code uses %rdi
# ($d3) as scratch.
$code .= <<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push $ctx
___
&poly1305_iteration();
$code .= <<___;
	pop $ctx
	RET
.size	__poly1305_block,.-__poly1305_block

___

# Returns the instructions that split the value held in $h0:$h1:$h2 into
# five base 2^26 digits and store them, together with the 5x multiples of
# digits 1..4, into one 32-bit column of the power table.
#   $power  - exponent n of r^n, used only in the emitted comment
#   $column - byte offset of the column inside each 16-byte table row
my $save_power = sub {
	my ($power, $column) = @_;
	return <<___;
	mov	\$0x3ffffff,%eax	# save r^$power base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+${column}-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+${column}-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+${column}-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+${column}-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+${column}-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+${column}-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+${column}-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+${column}-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+${column}-64`($ctx)
___
};

# __poly1305_init_avx: fill the power table. The key is squared first;
# r^2 and r are then written side by side (columns 0 and 4), followed by
# r^3 (column 12) and r^4 (column 8). $ctx is biased by 48+64 for the
# duration and restored before returning.
$code .= <<___;
.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
	push %rbp
	mov %rsp,%rbp
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

___
$code .= $save_power->(3, 12);
$code .= <<___;

	mov	$r1,%rax
	call	__poly1305_block	# r^4

___
$code .= $save_power->(4, 8);
$code .= <<___;

	lea	-48-64($ctx),$ctx	# size [de-]optimization
	pop %rbp
	RET
.size	__poly1305_init_avx,.-__poly1305_init_avx
___
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci&declare_function("poly1305_blocks_avx", 32, 4);
60162306a36Sopenharmony_ci$code.=<<___;
60262306a36Sopenharmony_ci.cfi_startproc
60362306a36Sopenharmony_ci	mov	20($ctx),%r8d		# is_base2_26
60462306a36Sopenharmony_ci	cmp	\$128,$len
60562306a36Sopenharmony_ci	jae	.Lblocks_avx
60662306a36Sopenharmony_ci	test	%r8d,%r8d
60762306a36Sopenharmony_ci	jz	.Lblocks
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ci.Lblocks_avx:
61062306a36Sopenharmony_ci	and	\$-16,$len
61162306a36Sopenharmony_ci	jz	.Lno_data_avx
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_ci	vzeroupper
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci	test	%r8d,%r8d
61662306a36Sopenharmony_ci	jz	.Lbase2_64_avx
61762306a36Sopenharmony_ci
61862306a36Sopenharmony_ci	test	\$31,$len
61962306a36Sopenharmony_ci	jz	.Leven_avx
62062306a36Sopenharmony_ci
62162306a36Sopenharmony_ci	push	%rbp
62262306a36Sopenharmony_ci.cfi_push	%rbp
62362306a36Sopenharmony_ci	mov 	%rsp,%rbp
62462306a36Sopenharmony_ci	push	%rbx
62562306a36Sopenharmony_ci.cfi_push	%rbx
62662306a36Sopenharmony_ci	push	%r12
62762306a36Sopenharmony_ci.cfi_push	%r12
62862306a36Sopenharmony_ci	push	%r13
62962306a36Sopenharmony_ci.cfi_push	%r13
63062306a36Sopenharmony_ci	push	%r14
63162306a36Sopenharmony_ci.cfi_push	%r14
63262306a36Sopenharmony_ci	push	%r15
63362306a36Sopenharmony_ci.cfi_push	%r15
63462306a36Sopenharmony_ci.Lblocks_avx_body:
63562306a36Sopenharmony_ci
63662306a36Sopenharmony_ci	mov	$len,%r15		# reassign $len
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	mov	0($ctx),$d1		# load hash value
63962306a36Sopenharmony_ci	mov	8($ctx),$d2
64062306a36Sopenharmony_ci	mov	16($ctx),$h2#d
64162306a36Sopenharmony_ci
64262306a36Sopenharmony_ci	mov	24($ctx),$r0		# load r
64362306a36Sopenharmony_ci	mov	32($ctx),$s1
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci	################################# base 2^26 -> base 2^64
64662306a36Sopenharmony_ci	mov	$d1#d,$h0#d
64762306a36Sopenharmony_ci	and	\$`-1*(1<<31)`,$d1
64862306a36Sopenharmony_ci	mov	$d2,$r1			# borrow $r1
64962306a36Sopenharmony_ci	mov	$d2#d,$h1#d
65062306a36Sopenharmony_ci	and	\$`-1*(1<<31)`,$d2
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	shr	\$6,$d1
65362306a36Sopenharmony_ci	shl	\$52,$r1
65462306a36Sopenharmony_ci	add	$d1,$h0
65562306a36Sopenharmony_ci	shr	\$12,$h1
65662306a36Sopenharmony_ci	shr	\$18,$d2
65762306a36Sopenharmony_ci	add	$r1,$h0
65862306a36Sopenharmony_ci	adc	$d2,$h1
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_ci	mov	$h2,$d1
66162306a36Sopenharmony_ci	shl	\$40,$d1
66262306a36Sopenharmony_ci	shr	\$24,$h2
66362306a36Sopenharmony_ci	add	$d1,$h1
66462306a36Sopenharmony_ci	adc	\$0,$h2			# can be partially reduced...
66562306a36Sopenharmony_ci
66662306a36Sopenharmony_ci	mov	\$-4,$d2		# ... so reduce
66762306a36Sopenharmony_ci	mov	$h2,$d1
66862306a36Sopenharmony_ci	and	$h2,$d2
66962306a36Sopenharmony_ci	shr	\$2,$d1
67062306a36Sopenharmony_ci	and	\$3,$h2
67162306a36Sopenharmony_ci	add	$d2,$d1			# =*5
67262306a36Sopenharmony_ci	add	$d1,$h0
67362306a36Sopenharmony_ci	adc	\$0,$h1
67462306a36Sopenharmony_ci	adc	\$0,$h2
67562306a36Sopenharmony_ci
67662306a36Sopenharmony_ci	mov	$s1,$r1
67762306a36Sopenharmony_ci	mov	$s1,%rax
67862306a36Sopenharmony_ci	shr	\$2,$s1
67962306a36Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci	add	0($inp),$h0		# accumulate input
68262306a36Sopenharmony_ci	adc	8($inp),$h1
68362306a36Sopenharmony_ci	lea	16($inp),$inp
68462306a36Sopenharmony_ci	adc	$padbit,$h2
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_ci	call	__poly1305_block
68762306a36Sopenharmony_ci
68862306a36Sopenharmony_ci	test	$padbit,$padbit		# if $padbit is zero,
68962306a36Sopenharmony_ci	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
69062306a36Sopenharmony_ci
69162306a36Sopenharmony_ci	################################# base 2^64 -> base 2^26
69262306a36Sopenharmony_ci	mov	$h0,%rax
69362306a36Sopenharmony_ci	mov	$h0,%rdx
69462306a36Sopenharmony_ci	shr	\$52,$h0
69562306a36Sopenharmony_ci	mov	$h1,$r0
69662306a36Sopenharmony_ci	mov	$h1,$r1
69762306a36Sopenharmony_ci	shr	\$26,%rdx
69862306a36Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
69962306a36Sopenharmony_ci	shl	\$12,$r0
70062306a36Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
70162306a36Sopenharmony_ci	shr	\$14,$h1
70262306a36Sopenharmony_ci	or	$r0,$h0
70362306a36Sopenharmony_ci	shl	\$24,$h2
70462306a36Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
70562306a36Sopenharmony_ci	shr	\$40,$r1
70662306a36Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
70762306a36Sopenharmony_ci	or	$r1,$h2			# h[4]
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci	sub	\$16,%r15
71062306a36Sopenharmony_ci	jz	.Lstore_base2_26_avx
71162306a36Sopenharmony_ci
71262306a36Sopenharmony_ci	vmovd	%rax#d,$H0
71362306a36Sopenharmony_ci	vmovd	%rdx#d,$H1
71462306a36Sopenharmony_ci	vmovd	$h0#d,$H2
71562306a36Sopenharmony_ci	vmovd	$h1#d,$H3
71662306a36Sopenharmony_ci	vmovd	$h2#d,$H4
71762306a36Sopenharmony_ci	jmp	.Lproceed_avx
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci.align	32
72062306a36Sopenharmony_ci.Lstore_base2_64_avx:
72162306a36Sopenharmony_ci	mov	$h0,0($ctx)
72262306a36Sopenharmony_ci	mov	$h1,8($ctx)
72362306a36Sopenharmony_ci	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
72462306a36Sopenharmony_ci	jmp	.Ldone_avx
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci.align	16
72762306a36Sopenharmony_ci.Lstore_base2_26_avx:
72862306a36Sopenharmony_ci	mov	%rax#d,0($ctx)		# store hash value base 2^26
72962306a36Sopenharmony_ci	mov	%rdx#d,4($ctx)
73062306a36Sopenharmony_ci	mov	$h0#d,8($ctx)
73162306a36Sopenharmony_ci	mov	$h1#d,12($ctx)
73262306a36Sopenharmony_ci	mov	$h2#d,16($ctx)
73362306a36Sopenharmony_ci.align	16
73462306a36Sopenharmony_ci.Ldone_avx:
73562306a36Sopenharmony_ci	pop 		%r15
73662306a36Sopenharmony_ci.cfi_restore	%r15
73762306a36Sopenharmony_ci	pop 		%r14
73862306a36Sopenharmony_ci.cfi_restore	%r14
73962306a36Sopenharmony_ci	pop 		%r13
74062306a36Sopenharmony_ci.cfi_restore	%r13
74162306a36Sopenharmony_ci	pop 		%r12
74262306a36Sopenharmony_ci.cfi_restore	%r12
74362306a36Sopenharmony_ci	pop 		%rbx
74462306a36Sopenharmony_ci.cfi_restore	%rbx
74562306a36Sopenharmony_ci	pop 		%rbp
74662306a36Sopenharmony_ci.cfi_restore	%rbp
74762306a36Sopenharmony_ci.Lno_data_avx:
74862306a36Sopenharmony_ci.Lblocks_avx_epilogue:
74962306a36Sopenharmony_ci	RET
75062306a36Sopenharmony_ci.cfi_endproc
75162306a36Sopenharmony_ci
75262306a36Sopenharmony_ci.align	32
75362306a36Sopenharmony_ci.Lbase2_64_avx:
75462306a36Sopenharmony_ci.cfi_startproc
75562306a36Sopenharmony_ci	push	%rbp
75662306a36Sopenharmony_ci.cfi_push	%rbp
75762306a36Sopenharmony_ci	mov 	%rsp,%rbp
75862306a36Sopenharmony_ci	push	%rbx
75962306a36Sopenharmony_ci.cfi_push	%rbx
76062306a36Sopenharmony_ci	push	%r12
76162306a36Sopenharmony_ci.cfi_push	%r12
76262306a36Sopenharmony_ci	push	%r13
76362306a36Sopenharmony_ci.cfi_push	%r13
76462306a36Sopenharmony_ci	push	%r14
76562306a36Sopenharmony_ci.cfi_push	%r14
76662306a36Sopenharmony_ci	push	%r15
76762306a36Sopenharmony_ci.cfi_push	%r15
76862306a36Sopenharmony_ci.Lbase2_64_avx_body:
76962306a36Sopenharmony_ci	# hash is held in base 2^64 on this path; it leaves in base 2^26
77062306a36Sopenharmony_ci	mov	$len,%r15		# reassign $len
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	mov	24($ctx),$r0		# load r
77362306a36Sopenharmony_ci	mov	32($ctx),$s1		# second word of r, copied to r1 below
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci	mov	0($ctx),$h0		# load hash value
77662306a36Sopenharmony_ci	mov	8($ctx),$h1
77762306a36Sopenharmony_ci	mov	16($ctx),$h2#d
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	mov	$s1,$r1			# r1
78062306a36Sopenharmony_ci	mov	$s1,%rax		# r1 also in rax, presumably for __poly1305_block
78162306a36Sopenharmony_ci	shr	\$2,$s1
78262306a36Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
78362306a36Sopenharmony_ci
78462306a36Sopenharmony_ci	test	\$31,$len		# is len a multiple of 32?
78562306a36Sopenharmony_ci	jz	.Linit_avx		# yes: no scalar block needed first
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	add	0($inp),$h0		# accumulate input
78862306a36Sopenharmony_ci	adc	8($inp),$h1
78962306a36Sopenharmony_ci	lea	16($inp),$inp
79062306a36Sopenharmony_ci	adc	$padbit,$h2
79162306a36Sopenharmony_ci	sub	\$16,%r15		# one 16-byte block consumed
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci	call	__poly1305_block	# defined elsewhere; presumably h = h*r mod 2^130-5
79462306a36Sopenharmony_ci
79562306a36Sopenharmony_ci.Linit_avx:
79662306a36Sopenharmony_ci	################################# base 2^64 -> base 2^26
79762306a36Sopenharmony_ci	mov	$h0,%rax
79862306a36Sopenharmony_ci	mov	$h0,%rdx
79962306a36Sopenharmony_ci	shr	\$52,$h0
80062306a36Sopenharmony_ci	mov	$h1,$d1
80162306a36Sopenharmony_ci	mov	$h1,$d2
80262306a36Sopenharmony_ci	shr	\$26,%rdx
80362306a36Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
80462306a36Sopenharmony_ci	shl	\$12,$d1
80562306a36Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
80662306a36Sopenharmony_ci	shr	\$14,$h1
80762306a36Sopenharmony_ci	or	$d1,$h0
80862306a36Sopenharmony_ci	shl	\$24,$h2
80962306a36Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
81062306a36Sopenharmony_ci	shr	\$40,$d2
81162306a36Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
81262306a36Sopenharmony_ci	or	$d2,$h2			# h[4]
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_ci	vmovd	%rax#d,$H0		# move the five limbs into vector registers
81562306a36Sopenharmony_ci	vmovd	%rdx#d,$H1
81662306a36Sopenharmony_ci	vmovd	$h0#d,$H2
81762306a36Sopenharmony_ci	vmovd	$h1#d,$H3
81862306a36Sopenharmony_ci	vmovd	$h2#d,$H4
81962306a36Sopenharmony_ci	movl	\$1,20($ctx)		# set is_base2_26
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci	call	__poly1305_init_avx	# defined elsewhere; presumably fills the table of powers of r
82262306a36Sopenharmony_ci
82362306a36Sopenharmony_ci.Lproceed_avx:
82462306a36Sopenharmony_ci	mov	%r15,$len		# restore the remaining length
82562306a36Sopenharmony_ci	pop 		%r15
82662306a36Sopenharmony_ci.cfi_restore	%r15
82762306a36Sopenharmony_ci	pop 		%r14
82862306a36Sopenharmony_ci.cfi_restore	%r14
82962306a36Sopenharmony_ci	pop 		%r13
83062306a36Sopenharmony_ci.cfi_restore	%r13
83162306a36Sopenharmony_ci	pop 		%r12
83262306a36Sopenharmony_ci.cfi_restore	%r12
83362306a36Sopenharmony_ci	pop 		%rbx
83462306a36Sopenharmony_ci.cfi_restore	%rbx
83562306a36Sopenharmony_ci	pop 		%rbp
83662306a36Sopenharmony_ci.cfi_restore	%rbp
83762306a36Sopenharmony_ci.Lbase2_64_avx_epilogue:
83862306a36Sopenharmony_ci	jmp	.Ldo_avx		# continue with the vector code
83962306a36Sopenharmony_ci.cfi_endproc
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci.align	32
84262306a36Sopenharmony_ci.Leven_avx:
84362306a36Sopenharmony_ci.cfi_startproc
84462306a36Sopenharmony_ci	vmovd		4*0($ctx),$H0		# load hash value
84562306a36Sopenharmony_ci	vmovd		4*1($ctx),$H1
84662306a36Sopenharmony_ci	vmovd		4*2($ctx),$H2
84762306a36Sopenharmony_ci	vmovd		4*3($ctx),$H3
84862306a36Sopenharmony_ci	vmovd		4*4($ctx),$H4
84962306a36Sopenharmony_ci	# (five 32-bit limbs: the hash is already in base 2^26 on this path)
85062306a36Sopenharmony_ci.Ldo_avx:
85162306a36Sopenharmony_ci___
85262306a36Sopenharmony_ci$code.=<<___	if (!$win64);
85362306a36Sopenharmony_ci	lea		8(%rsp),%r10		# r10 = rsp+8, the CFA register from here on
85462306a36Sopenharmony_ci.cfi_def_cfa_register	%r10
85562306a36Sopenharmony_ci	and		\$-32,%rsp		# align stack to 32 bytes
85662306a36Sopenharmony_ci	sub		\$-8,%rsp		# i.e. add 8, so the slots addressed below are 16-byte aligned
85762306a36Sopenharmony_ci	lea		-0x58(%rsp),%r11	# r11 = base for the hash spill and upper table slots
85862306a36Sopenharmony_ci	sub		\$0x178,%rsp		# allocate frame
85962306a36Sopenharmony_ci___
86062306a36Sopenharmony_ci$code.=<<___	if ($win64);
86162306a36Sopenharmony_ci	lea		-0xf8(%rsp),%r11	# r11 = base for the slots addressed below
86262306a36Sopenharmony_ci	sub		\$0x218,%rsp		# allocate frame
86362306a36Sopenharmony_ci	vmovdqa		%xmm6,0x50(%r11)	# save xmm6-xmm15 (presumably callee-saved on Win64)
86462306a36Sopenharmony_ci	vmovdqa		%xmm7,0x60(%r11)
86562306a36Sopenharmony_ci	vmovdqa		%xmm8,0x70(%r11)
86662306a36Sopenharmony_ci	vmovdqa		%xmm9,0x80(%r11)
86762306a36Sopenharmony_ci	vmovdqa		%xmm10,0x90(%r11)
86862306a36Sopenharmony_ci	vmovdqa		%xmm11,0xa0(%r11)
86962306a36Sopenharmony_ci	vmovdqa		%xmm12,0xb0(%r11)
87062306a36Sopenharmony_ci	vmovdqa		%xmm13,0xc0(%r11)
87162306a36Sopenharmony_ci	vmovdqa		%xmm14,0xd0(%r11)
87262306a36Sopenharmony_ci	vmovdqa		%xmm15,0xe0(%r11)
87362306a36Sopenharmony_ci.Ldo_avx_body:
87462306a36Sopenharmony_ci___
87562306a36Sopenharmony_ci$code.=<<___;
87662306a36Sopenharmony_ci	sub		\$64,$len		# flags feed cmovc below and jbe further down
87762306a36Sopenharmony_ci	lea		-32($inp),%rax
87862306a36Sopenharmony_ci	cmovc		%rax,$inp		# fewer than 64 bytes: step back so 16*2(inp) is the first block
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
88162306a36Sopenharmony_ci	lea		`16*3+64`($ctx),$ctx	# size optimization
88262306a36Sopenharmony_ci	lea		.Lconst(%rip),%rcx	# base of the constant table (masks, padbit)
88362306a36Sopenharmony_ci
88462306a36Sopenharmony_ci	################################################################
88562306a36Sopenharmony_ci	# load input
88662306a36Sopenharmony_ci	vmovdqu		16*2($inp),$T0
88762306a36Sopenharmony_ci	vmovdqu		16*3($inp),$T1
88862306a36Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
88962306a36Sopenharmony_ci
89062306a36Sopenharmony_ci	vpsrldq		\$6,$T0,$T2		# splat input
89162306a36Sopenharmony_ci	vpsrldq		\$6,$T1,$T3
89262306a36Sopenharmony_ci	vpunpckhqdq	$T1,$T0,$T4		# 4
89362306a36Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
89462306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T3		# 2:3
89562306a36Sopenharmony_ci
89662306a36Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
89762306a36Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
89862306a36Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
89962306a36Sopenharmony_ci	vpsrlq		\$4,$T3,$T2
90062306a36Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
90162306a36Sopenharmony_ci	vpsrlq		\$30,$T3,$T3
90262306a36Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
90362306a36Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
90462306a36Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci	jbe		.Lskip_loop_avx		# len <= 64; flags are still those of the sub above (vector ops leave them intact)
90762306a36Sopenharmony_ci
90862306a36Sopenharmony_ci	# expand and copy pre-calculated table to stack
90962306a36Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),$D1
91062306a36Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),$D2
91162306a36Sopenharmony_ci	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
91262306a36Sopenharmony_ci	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
91362306a36Sopenharmony_ci	vmovdqa		$D3,-0x90(%r11)		# r0: 3434
91462306a36Sopenharmony_ci	vmovdqa		$D0,0x00(%rsp)		# r0: 1212
91562306a36Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D4
91662306a36Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),$D0
91762306a36Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
91862306a36Sopenharmony_ci	vmovdqa		$D4,-0x80(%r11)		# r1: 3434
91962306a36Sopenharmony_ci	vmovdqa		$D1,0x10(%rsp)		# r1: 1212
92062306a36Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D3
92162306a36Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),$D1
92262306a36Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
92362306a36Sopenharmony_ci	vmovdqa		$D3,-0x70(%r11)		# s1: 3434
92462306a36Sopenharmony_ci	vmovdqa		$D2,0x20(%rsp)		# s1: 1212
92562306a36Sopenharmony_ci	vpshufd		\$0xEE,$D0,$D4
92662306a36Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),$D2
92762306a36Sopenharmony_ci	vpshufd		\$0x44,$D0,$D0
92862306a36Sopenharmony_ci	vmovdqa		$D4,-0x60(%r11)		# r2: 3434
92962306a36Sopenharmony_ci	vmovdqa		$D0,0x30(%rsp)		# r2: 1212
93062306a36Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D3
93162306a36Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),$D0
93262306a36Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
93362306a36Sopenharmony_ci	vmovdqa		$D3,-0x50(%r11)		# s2: 3434
93462306a36Sopenharmony_ci	vmovdqa		$D1,0x40(%rsp)		# s2: 1212
93562306a36Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D4
93662306a36Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),$D1
93762306a36Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
93862306a36Sopenharmony_ci	vmovdqa		$D4,-0x40(%r11)		# r3: 3434
93962306a36Sopenharmony_ci	vmovdqa		$D2,0x50(%rsp)		# r3: 1212
94062306a36Sopenharmony_ci	vpshufd		\$0xEE,$D0,$D3
94162306a36Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),$D2
94262306a36Sopenharmony_ci	vpshufd		\$0x44,$D0,$D0
94362306a36Sopenharmony_ci	vmovdqa		$D3,-0x30(%r11)		# s3: 3434
94462306a36Sopenharmony_ci	vmovdqa		$D0,0x60(%rsp)		# s3: 1212
94562306a36Sopenharmony_ci	vpshufd		\$0xEE,$D1,$D4
94662306a36Sopenharmony_ci	vpshufd		\$0x44,$D1,$D1
94762306a36Sopenharmony_ci	vmovdqa		$D4,-0x20(%r11)		# r4: 3434
94862306a36Sopenharmony_ci	vmovdqa		$D1,0x70(%rsp)		# r4: 1212
94962306a36Sopenharmony_ci	vpshufd		\$0xEE,$D2,$D3
95062306a36Sopenharmony_ci	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
95162306a36Sopenharmony_ci	vpshufd		\$0x44,$D2,$D2
95262306a36Sopenharmony_ci	vmovdqa		$D3,-0x10(%r11)		# s4: 3434
95362306a36Sopenharmony_ci	vmovdqa		$D2,0x80(%rsp)		# s4: 1212
95462306a36Sopenharmony_ci
95562306a36Sopenharmony_ci	jmp		.Loop_avx
95662306a36Sopenharmony_ci
95762306a36Sopenharmony_ci.align	32
95862306a36Sopenharmony_ci.Loop_avx:
95962306a36Sopenharmony_ci	################################################################
96062306a36Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
96162306a36Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
96262306a36Sopenharmony_ci	#   \___________________/
96362306a36Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
96462306a36Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
96562306a36Sopenharmony_ci	#   \___________________/ \____________________/
96662306a36Sopenharmony_ci	#
96762306a36Sopenharmony_ci	# Note that we start with inp[2:3]*r^2. This is because it
96862306a36Sopenharmony_ci	# doesn't depend on reduction in previous iteration.
96962306a36Sopenharmony_ci	################################################################
97062306a36Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
97162306a36Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
97262306a36Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
97362306a36Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
97462306a36Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
97562306a36Sopenharmony_ci	#
97662306a36Sopenharmony_ci	# though note that $Tx and $Hx are "reversed" in this section,
97762306a36Sopenharmony_ci	# and $D4 is preloaded with r0^2...
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
98062306a36Sopenharmony_ci	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
98162306a36Sopenharmony_ci	  vmovdqa	$H2,0x20(%r11)				# offload hash
98262306a36Sopenharmony_ci	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
98362306a36Sopenharmony_ci	 vmovdqa	0x10(%rsp),$H2		# r1^2
98462306a36Sopenharmony_ci	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
98562306a36Sopenharmony_ci	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci	  vmovdqa	$H0,0x00(%r11)				#
98862306a36Sopenharmony_ci	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
98962306a36Sopenharmony_ci	  vmovdqa	$H1,0x10(%r11)				#
99062306a36Sopenharmony_ci	vpmuludq	$T3,$H2,$H1		# h3*r1
99162306a36Sopenharmony_ci	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
99262306a36Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
99362306a36Sopenharmony_ci	  vmovdqa	$H3,0x30(%r11)				#
99462306a36Sopenharmony_ci	vpmuludq	$T2,$H2,$H0		# h2*r1
99562306a36Sopenharmony_ci	vpmuludq	$T1,$H2,$H1		# h1*r1
99662306a36Sopenharmony_ci	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
99762306a36Sopenharmony_ci	 vmovdqa	0x30(%rsp),$H3		# r2^2
99862306a36Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
99962306a36Sopenharmony_ci	  vmovdqa	$H4,0x40(%r11)				#
100062306a36Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r1
100162306a36Sopenharmony_ci	 vpmuludq	$T2,$H3,$H0		# h2*r2
100262306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci	 vmovdqa	0x40(%rsp),$H4		# s2^2
100562306a36Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
100662306a36Sopenharmony_ci	vpmuludq	$T1,$H3,$H1		# h1*r2
100762306a36Sopenharmony_ci	vpmuludq	$T0,$H3,$H3		# h0*r2
100862306a36Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
100962306a36Sopenharmony_ci	 vmovdqa	0x50(%rsp),$H2		# r3^2
101062306a36Sopenharmony_ci	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
101162306a36Sopenharmony_ci	vpmuludq	$T4,$H4,$H0		# h4*s2
101262306a36Sopenharmony_ci	vpmuludq	$T3,$H4,$H4		# h3*s2
101362306a36Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
101462306a36Sopenharmony_ci	 vmovdqa	0x60(%rsp),$H3		# s3^2
101562306a36Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
101662306a36Sopenharmony_ci
101762306a36Sopenharmony_ci	 vmovdqa	0x80(%rsp),$H4		# s4^2
101862306a36Sopenharmony_ci	vpmuludq	$T1,$H2,$H1		# h1*r3
101962306a36Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r3
102062306a36Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
102162306a36Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
102262306a36Sopenharmony_ci	vpmuludq	$T4,$H3,$H0		# h4*s3
102362306a36Sopenharmony_ci	vpmuludq	$T3,$H3,$H1		# h3*s3
102462306a36Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
102562306a36Sopenharmony_ci	 vmovdqu	16*0($inp),$H0				# load input
102662306a36Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
102762306a36Sopenharmony_ci	vpmuludq	$T2,$H3,$H3		# h2*s3
102862306a36Sopenharmony_ci	 vpmuludq	$T2,$H4,$T2		# h2*s4
102962306a36Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
103062306a36Sopenharmony_ci
103162306a36Sopenharmony_ci	 vmovdqu	16*1($inp),$H1				#
103262306a36Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
103362306a36Sopenharmony_ci	vpmuludq	$T3,$H4,$T3		# h3*s4
103462306a36Sopenharmony_ci	vpmuludq	$T4,$H4,$T4		# h4*s4
103562306a36Sopenharmony_ci	 vpsrldq	\$6,$H0,$H2				# splat input
103662306a36Sopenharmony_ci	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
103762306a36Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
103862306a36Sopenharmony_ci	 vpsrldq	\$6,$H1,$H3				#
103962306a36Sopenharmony_ci	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
104062306a36Sopenharmony_ci	vpmuludq	$T1,$H4,$T0		# h1*s4
104162306a36Sopenharmony_ci	 vpunpckhqdq	$H1,$H0,$H4		# 4
104262306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
104362306a36Sopenharmony_ci	 vmovdqa	-0x90(%r11),$T4		# r0^4
104462306a36Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	vpunpcklqdq	$H1,$H0,$H0		# 0:1
104762306a36Sopenharmony_ci	vpunpcklqdq	$H3,$H2,$H3		# 2:3
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_ci	#vpsrlq		\$40,$H4,$H4		# 4
105062306a36Sopenharmony_ci	vpsrldq		\$`40/8`,$H4,$H4	# 4
105162306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$H1
105262306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0		# 0
105362306a36Sopenharmony_ci	vpsrlq		\$4,$H3,$H2
105462306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1		# 1
105562306a36Sopenharmony_ci	vpand		0(%rcx),$H4,$H4		# .Lmask24
105662306a36Sopenharmony_ci	vpsrlq		\$30,$H3,$H3
105762306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2		# 2
105862306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3		# 3
105962306a36Sopenharmony_ci	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
106062306a36Sopenharmony_ci
106162306a36Sopenharmony_ci	vpaddq		0x00(%r11),$H0,$H0	# add hash value
106262306a36Sopenharmony_ci	vpaddq		0x10(%r11),$H1,$H1
106362306a36Sopenharmony_ci	vpaddq		0x20(%r11),$H2,$H2
106462306a36Sopenharmony_ci	vpaddq		0x30(%r11),$H3,$H3
106562306a36Sopenharmony_ci	vpaddq		0x40(%r11),$H4,$H4
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci	lea		16*2($inp),%rax		# inp+32, chosen below if fewer than 64 bytes remain
106862306a36Sopenharmony_ci	lea		16*4($inp),$inp		# inp+64 otherwise
106962306a36Sopenharmony_ci	sub		\$64,$len		# flags feed cmovc here and ja at the loop bottom
107062306a36Sopenharmony_ci	cmovc		%rax,$inp
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci	################################################################
107362306a36Sopenharmony_ci	# Now we accumulate (inp[0:1]+hash)*r^4
107462306a36Sopenharmony_ci	################################################################
107562306a36Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
107662306a36Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
107762306a36Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
107862306a36Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
107962306a36Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
108062306a36Sopenharmony_ci
108162306a36Sopenharmony_ci	vpmuludq	$H0,$T4,$T0		# h0*r0
108262306a36Sopenharmony_ci	vpmuludq	$H1,$T4,$T1		# h1*r0
108362306a36Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
108462306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
108562306a36Sopenharmony_ci	 vmovdqa	-0x80(%r11),$T2		# r1^4
108662306a36Sopenharmony_ci	vpmuludq	$H2,$T4,$T0		# h2*r0
108762306a36Sopenharmony_ci	vpmuludq	$H3,$T4,$T1		# h3*r0
108862306a36Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
108962306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
109062306a36Sopenharmony_ci	vpmuludq	$H4,$T4,$T4		# h4*r0
109162306a36Sopenharmony_ci	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
109262306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
109362306a36Sopenharmony_ci
109462306a36Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
109562306a36Sopenharmony_ci	vpmuludq	$H2,$T2,$T1		# h2*r1
109662306a36Sopenharmony_ci	vpmuludq	$H3,$T2,$T0		# h3*r1
109762306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
109862306a36Sopenharmony_ci	 vmovdqa	-0x60(%r11),$T3		# r2^4
109962306a36Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
110062306a36Sopenharmony_ci	vpmuludq	$H1,$T2,$T1		# h1*r1
110162306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r1
110262306a36Sopenharmony_ci	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
110362306a36Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci	 vmovdqa	-0x50(%r11),$T4		# s2^4
110662306a36Sopenharmony_ci	vpmuludq	$H2,$T3,$T0		# h2*r2
110762306a36Sopenharmony_ci	vpmuludq	$H1,$T3,$T1		# h1*r2
110862306a36Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
110962306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
111062306a36Sopenharmony_ci	 vmovdqa	-0x40(%r11),$T2		# r3^4
111162306a36Sopenharmony_ci	vpmuludq	$H0,$T3,$T3		# h0*r2
111262306a36Sopenharmony_ci	vpmuludq	$H4,$T4,$T0		# h4*s2
111362306a36Sopenharmony_ci	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
111462306a36Sopenharmony_ci	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
111562306a36Sopenharmony_ci	 vmovdqa	-0x30(%r11),$T3		# s3^4
111662306a36Sopenharmony_ci	vpmuludq	$H3,$T4,$T4		# h3*s2
111762306a36Sopenharmony_ci	 vpmuludq	$H1,$T2,$T1		# h1*r3
111862306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
111962306a36Sopenharmony_ci
112062306a36Sopenharmony_ci	 vmovdqa	-0x10(%r11),$T4		# s4^4
112162306a36Sopenharmony_ci	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
112262306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r3
112362306a36Sopenharmony_ci	vpmuludq	$H4,$T3,$T0		# h4*s3
112462306a36Sopenharmony_ci	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
112562306a36Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
112662306a36Sopenharmony_ci	 vmovdqu	16*2($inp),$T0				# load input
112762306a36Sopenharmony_ci	vpmuludq	$H3,$T3,$T2		# h3*s3
112862306a36Sopenharmony_ci	vpmuludq	$H2,$T3,$T3		# h2*s3
112962306a36Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
113062306a36Sopenharmony_ci	 vmovdqu	16*3($inp),$T1				#
113162306a36Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
113262306a36Sopenharmony_ci
113362306a36Sopenharmony_ci	vpmuludq	$H2,$T4,$H2		# h2*s4
113462306a36Sopenharmony_ci	vpmuludq	$H3,$T4,$H3		# h3*s4
113562306a36Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2				# splat input
113662306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
113762306a36Sopenharmony_ci	vpmuludq	$H4,$T4,$H4		# h4*s4
113862306a36Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3				#
113962306a36Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
114062306a36Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
114162306a36Sopenharmony_ci	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
114262306a36Sopenharmony_ci	vpmuludq	$H1,$T4,$H0		# h1*s4
114362306a36Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
114462306a36Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
114562306a36Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
114662306a36Sopenharmony_ci
114762306a36Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
114862306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T3		# 2:3
114962306a36Sopenharmony_ci
115062306a36Sopenharmony_ci	#vpsrlq		\$40,$T4,$T4		# 4
115162306a36Sopenharmony_ci	vpsrldq		\$`40/8`,$T4,$T4	# 4
115262306a36Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
115362306a36Sopenharmony_ci	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
115462306a36Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
115562306a36Sopenharmony_ci	vpsrlq		\$4,$T3,$T2
115662306a36Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
115762306a36Sopenharmony_ci	vpand		0(%rcx),$T4,$T4		# .Lmask24
115862306a36Sopenharmony_ci	vpsrlq		\$30,$T3,$T3
115962306a36Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
116062306a36Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
116162306a36Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
116262306a36Sopenharmony_ci
116362306a36Sopenharmony_ci	################################################################
116462306a36Sopenharmony_ci	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
116562306a36Sopenharmony_ci	# and P. Schwabe
116662306a36Sopenharmony_ci
116762306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
116862306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
116962306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
117062306a36Sopenharmony_ci
117162306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
117262306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
117362306a36Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
117462306a36Sopenharmony_ci
117562306a36Sopenharmony_ci	vpsrlq		\$26,$H4,$D0		# carry out of h4
117662306a36Sopenharmony_ci	vpand		$MASK,$H4,$H4
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
117962306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1
118062306a36Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
118162306a36Sopenharmony_ci
118262306a36Sopenharmony_ci	vpaddq		$D0,$H0,$H0		# h0 += carry
118362306a36Sopenharmony_ci	vpsllq		\$2,$D0,$D0		# carry*4, so h0 gains carry*5 in total
118462306a36Sopenharmony_ci	vpaddq		$D0,$H0,$H0		# h4 -> h0
118562306a36Sopenharmony_ci
118662306a36Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
118762306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2
118862306a36Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
118962306a36Sopenharmony_ci
119062306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
119162306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
119262306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
119362306a36Sopenharmony_ci
119462306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
119562306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
119662306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
119762306a36Sopenharmony_ci
119862306a36Sopenharmony_ci	ja		.Loop_avx		# more than 64 bytes were left; flags are still those of the sub above (vector ops leave them intact)
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci.Lskip_loop_avx:
120162306a36Sopenharmony_ci	################################################################
120262306a36Sopenharmony_ci	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
120362306a36Sopenharmony_ci
120462306a36Sopenharmony_ci	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
120562306a36Sopenharmony_ci	add		\$32,$len
120662306a36Sopenharmony_ci	jnz		.Long_tail_avx
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci	vpaddq		$H2,$T2,$T2
120962306a36Sopenharmony_ci	vpaddq		$H0,$T0,$T0
121062306a36Sopenharmony_ci	vpaddq		$H1,$T1,$T1
121162306a36Sopenharmony_ci	vpaddq		$H3,$T3,$T3
121262306a36Sopenharmony_ci	vpaddq		$H4,$T4,$T4
121362306a36Sopenharmony_ci
121462306a36Sopenharmony_ci.Long_tail_avx:
121562306a36Sopenharmony_ci	vmovdqa		$H2,0x20(%r11)
121662306a36Sopenharmony_ci	vmovdqa		$H0,0x00(%r11)
121762306a36Sopenharmony_ci	vmovdqa		$H1,0x10(%r11)
121862306a36Sopenharmony_ci	vmovdqa		$H3,0x30(%r11)
121962306a36Sopenharmony_ci	vmovdqa		$H4,0x40(%r11)
122062306a36Sopenharmony_ci
122162306a36Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
122262306a36Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
122362306a36Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
122462306a36Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
122562306a36Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
122662306a36Sopenharmony_ci
122762306a36Sopenharmony_ci	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
122862306a36Sopenharmony_ci	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
122962306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
123062306a36Sopenharmony_ci	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
123162306a36Sopenharmony_ci	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
123262306a36Sopenharmony_ci	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
123362306a36Sopenharmony_ci
123462306a36Sopenharmony_ci	vpmuludq	$T3,$H2,$H0		# h3*r1
123562306a36Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
123662306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
123762306a36Sopenharmony_ci	vpmuludq	$T2,$H2,$H1		# h2*r1
123862306a36Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
123962306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
124062306a36Sopenharmony_ci	vpmuludq	$T1,$H2,$H0		# h1*r1
124162306a36Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
124262306a36Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r1
124362306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
124462306a36Sopenharmony_ci	vpmuludq	$T4,$H3,$H3		# h4*s1
124562306a36Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
124662306a36Sopenharmony_ci
124762306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
124862306a36Sopenharmony_ci	vpmuludq	$T2,$H4,$H1		# h2*r2
124962306a36Sopenharmony_ci	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
125062306a36Sopenharmony_ci	vpmuludq	$T1,$H4,$H0		# h1*r2
125162306a36Sopenharmony_ci	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
125262306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
125362306a36Sopenharmony_ci	vpmuludq	$T0,$H4,$H4		# h0*r2
125462306a36Sopenharmony_ci	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
125562306a36Sopenharmony_ci	vpmuludq	$T4,$H2,$H1		# h4*s2
125662306a36Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
125762306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
125862306a36Sopenharmony_ci	vpmuludq	$T3,$H2,$H2		# h3*s2
125962306a36Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ci	vpmuludq	$T1,$H3,$H0		# h1*r3
126262306a36Sopenharmony_ci	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
126362306a36Sopenharmony_ci	vpmuludq	$T0,$H3,$H3		# h0*r3
126462306a36Sopenharmony_ci	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
126562306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
126662306a36Sopenharmony_ci	vpmuludq	$T4,$H4,$H1		# h4*s3
126762306a36Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
126862306a36Sopenharmony_ci	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
126962306a36Sopenharmony_ci	vpmuludq	$T3,$H4,$H0		# h3*s3
127062306a36Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
127162306a36Sopenharmony_ci	vpmuludq	$T2,$H4,$H4		# h2*s3
127262306a36Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
127362306a36Sopenharmony_ci
127462306a36Sopenharmony_ci	vpmuludq	$T0,$H2,$H2		# h0*r4
127562306a36Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
127662306a36Sopenharmony_ci	vpmuludq	$T4,$H3,$H1		# h4*s4
127762306a36Sopenharmony_ci	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
127862306a36Sopenharmony_ci	vpmuludq	$T3,$H3,$H0		# h3*s4
127962306a36Sopenharmony_ci	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
128062306a36Sopenharmony_ci	vpmuludq	$T2,$H3,$H1		# h2*s4
128162306a36Sopenharmony_ci	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
128262306a36Sopenharmony_ci	vpmuludq	$T1,$H3,$H3		# h1*s4
128362306a36Sopenharmony_ci	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ci	jz		.Lshort_tail_avx
128662306a36Sopenharmony_ci
128762306a36Sopenharmony_ci	vmovdqu		16*0($inp),$H0		# load input
128862306a36Sopenharmony_ci	vmovdqu		16*1($inp),$H1
128962306a36Sopenharmony_ci
129062306a36Sopenharmony_ci	vpsrldq		\$6,$H0,$H2		# splat input
129162306a36Sopenharmony_ci	vpsrldq		\$6,$H1,$H3
129262306a36Sopenharmony_ci	vpunpckhqdq	$H1,$H0,$H4		# 4
129362306a36Sopenharmony_ci	vpunpcklqdq	$H1,$H0,$H0		# 0:1
129462306a36Sopenharmony_ci	vpunpcklqdq	$H3,$H2,$H3		# 2:3
129562306a36Sopenharmony_ci
129662306a36Sopenharmony_ci	vpsrlq		\$40,$H4,$H4		# 4
129762306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$H1
129862306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0		# 0
129962306a36Sopenharmony_ci	vpsrlq		\$4,$H3,$H2
130062306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1		# 1
130162306a36Sopenharmony_ci	vpsrlq		\$30,$H3,$H3
130262306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2		# 2
130362306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3		# 3
130462306a36Sopenharmony_ci	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
130562306a36Sopenharmony_ci
130662306a36Sopenharmony_ci	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
130762306a36Sopenharmony_ci	vpaddq		0x00(%r11),$H0,$H0
130862306a36Sopenharmony_ci	vpaddq		0x10(%r11),$H1,$H1
130962306a36Sopenharmony_ci	vpaddq		0x20(%r11),$H2,$H2
131062306a36Sopenharmony_ci	vpaddq		0x30(%r11),$H3,$H3
131162306a36Sopenharmony_ci	vpaddq		0x40(%r11),$H4,$H4
131262306a36Sopenharmony_ci
131362306a36Sopenharmony_ci	################################################################
131462306a36Sopenharmony_ci	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
131562306a36Sopenharmony_ci
131662306a36Sopenharmony_ci	vpmuludq	$H0,$T4,$T0		# h0*r0
131762306a36Sopenharmony_ci	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
131862306a36Sopenharmony_ci	vpmuludq	$H1,$T4,$T1		# h1*r0
131962306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
132062306a36Sopenharmony_ci	vpmuludq	$H2,$T4,$T0		# h2*r0
132162306a36Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
132262306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
132362306a36Sopenharmony_ci	vpmuludq	$H3,$T4,$T1		# h3*r0
132462306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
132562306a36Sopenharmony_ci	vpmuludq	$H4,$T4,$T4		# h4*r0
132662306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	vpmuludq	$H3,$T2,$T0		# h3*r1
132962306a36Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
133062306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
133162306a36Sopenharmony_ci	vpmuludq	$H2,$T2,$T1		# h2*r1
133262306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
133362306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
133462306a36Sopenharmony_ci	vpmuludq	$H1,$T2,$T0		# h1*r1
133562306a36Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
133662306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r1
133762306a36Sopenharmony_ci	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
133862306a36Sopenharmony_ci	vpmuludq	$H4,$T3,$T3		# h4*s1
133962306a36Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
134062306a36Sopenharmony_ci
134162306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
134262306a36Sopenharmony_ci	vpmuludq	$H2,$T4,$T1		# h2*r2
134362306a36Sopenharmony_ci	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
134462306a36Sopenharmony_ci	vpmuludq	$H1,$T4,$T0		# h1*r2
134562306a36Sopenharmony_ci	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
134662306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
134762306a36Sopenharmony_ci	vpmuludq	$H0,$T4,$T4		# h0*r2
134862306a36Sopenharmony_ci	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
134962306a36Sopenharmony_ci	vpmuludq	$H4,$T2,$T1		# h4*s2
135062306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
135162306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
135262306a36Sopenharmony_ci	vpmuludq	$H3,$T2,$T2		# h3*s2
135362306a36Sopenharmony_ci	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
135462306a36Sopenharmony_ci
135562306a36Sopenharmony_ci	vpmuludq	$H1,$T3,$T0		# h1*r3
135662306a36Sopenharmony_ci	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
135762306a36Sopenharmony_ci	vpmuludq	$H0,$T3,$T3		# h0*r3
135862306a36Sopenharmony_ci	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
135962306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
136062306a36Sopenharmony_ci	vpmuludq	$H4,$T4,$T1		# h4*s3
136162306a36Sopenharmony_ci	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
136262306a36Sopenharmony_ci	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
136362306a36Sopenharmony_ci	vpmuludq	$H3,$T4,$T0		# h3*s3
136462306a36Sopenharmony_ci	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
136562306a36Sopenharmony_ci	vpmuludq	$H2,$T4,$T4		# h2*s3
136662306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
136762306a36Sopenharmony_ci
136862306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r4
136962306a36Sopenharmony_ci	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
137062306a36Sopenharmony_ci	vpmuludq	$H4,$T3,$T1		# h4*s4
137162306a36Sopenharmony_ci	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
137262306a36Sopenharmony_ci	vpmuludq	$H3,$T3,$T0		# h3*s4
137362306a36Sopenharmony_ci	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
137462306a36Sopenharmony_ci	vpmuludq	$H2,$T3,$T1		# h2*s4
137562306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
137662306a36Sopenharmony_ci	vpmuludq	$H1,$T3,$T3		# h1*s4
137762306a36Sopenharmony_ci	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci.Lshort_tail_avx:
138062306a36Sopenharmony_ci	################################################################
138162306a36Sopenharmony_ci	# horizontal addition
138262306a36Sopenharmony_ci
138362306a36Sopenharmony_ci	vpsrldq		\$8,$D4,$T4
138462306a36Sopenharmony_ci	vpsrldq		\$8,$D3,$T3
138562306a36Sopenharmony_ci	vpsrldq		\$8,$D1,$T1
138662306a36Sopenharmony_ci	vpsrldq		\$8,$D0,$T0
138762306a36Sopenharmony_ci	vpsrldq		\$8,$D2,$T2
138862306a36Sopenharmony_ci	vpaddq		$T3,$D3,$D3
138962306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4
139062306a36Sopenharmony_ci	vpaddq		$T0,$D0,$D0
139162306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1
139262306a36Sopenharmony_ci	vpaddq		$T2,$D2,$D2
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_ci	################################################################
139562306a36Sopenharmony_ci	# lazy reduction
139662306a36Sopenharmony_ci
139762306a36Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
139862306a36Sopenharmony_ci	vpand		$MASK,$D3,$D3
139962306a36Sopenharmony_ci	vpaddq		$H3,$D4,$D4		# h3 -> h4
140062306a36Sopenharmony_ci
140162306a36Sopenharmony_ci	vpsrlq		\$26,$D0,$H0
140262306a36Sopenharmony_ci	vpand		$MASK,$D0,$D0
140362306a36Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# h0 -> h1
140462306a36Sopenharmony_ci
140562306a36Sopenharmony_ci	vpsrlq		\$26,$D4,$H4
140662306a36Sopenharmony_ci	vpand		$MASK,$D4,$D4
140762306a36Sopenharmony_ci
140862306a36Sopenharmony_ci	vpsrlq		\$26,$D1,$H1
140962306a36Sopenharmony_ci	vpand		$MASK,$D1,$D1
141062306a36Sopenharmony_ci	vpaddq		$H1,$D2,$D2		# h1 -> h2
141162306a36Sopenharmony_ci
141262306a36Sopenharmony_ci	vpaddq		$H4,$D0,$D0
141362306a36Sopenharmony_ci	vpsllq		\$2,$H4,$H4
141462306a36Sopenharmony_ci	vpaddq		$H4,$D0,$D0		# h4 -> h0
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	vpsrlq		\$26,$D2,$H2
141762306a36Sopenharmony_ci	vpand		$MASK,$D2,$D2
141862306a36Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# h2 -> h3
141962306a36Sopenharmony_ci
142062306a36Sopenharmony_ci	vpsrlq		\$26,$D0,$H0
142162306a36Sopenharmony_ci	vpand		$MASK,$D0,$D0
142262306a36Sopenharmony_ci	vpaddq		$H0,$D1,$D1		# h0 -> h1
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
142562306a36Sopenharmony_ci	vpand		$MASK,$D3,$D3
142662306a36Sopenharmony_ci	vpaddq		$H3,$D4,$D4		# h3 -> h4
142762306a36Sopenharmony_ci
142862306a36Sopenharmony_ci	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
142962306a36Sopenharmony_ci	vmovd		$D1,`4*1-48-64`($ctx)
143062306a36Sopenharmony_ci	vmovd		$D2,`4*2-48-64`($ctx)
143162306a36Sopenharmony_ci	vmovd		$D3,`4*3-48-64`($ctx)
143262306a36Sopenharmony_ci	vmovd		$D4,`4*4-48-64`($ctx)
143362306a36Sopenharmony_ci___
143462306a36Sopenharmony_ci$code.=<<___	if ($win64);
143562306a36Sopenharmony_ci	vmovdqa		0x50(%r11),%xmm6
143662306a36Sopenharmony_ci	vmovdqa		0x60(%r11),%xmm7
143762306a36Sopenharmony_ci	vmovdqa		0x70(%r11),%xmm8
143862306a36Sopenharmony_ci	vmovdqa		0x80(%r11),%xmm9
143962306a36Sopenharmony_ci	vmovdqa		0x90(%r11),%xmm10
144062306a36Sopenharmony_ci	vmovdqa		0xa0(%r11),%xmm11
144162306a36Sopenharmony_ci	vmovdqa		0xb0(%r11),%xmm12
144262306a36Sopenharmony_ci	vmovdqa		0xc0(%r11),%xmm13
144362306a36Sopenharmony_ci	vmovdqa		0xd0(%r11),%xmm14
144462306a36Sopenharmony_ci	vmovdqa		0xe0(%r11),%xmm15
144562306a36Sopenharmony_ci	lea		0xf8(%r11),%rsp
144662306a36Sopenharmony_ci.Ldo_avx_epilogue:
144762306a36Sopenharmony_ci___
144862306a36Sopenharmony_ci$code.=<<___	if (!$win64);
144962306a36Sopenharmony_ci	lea		-8(%r10),%rsp
145062306a36Sopenharmony_ci.cfi_def_cfa_register	%rsp
145162306a36Sopenharmony_ci___
145262306a36Sopenharmony_ci$code.=<<___;
145362306a36Sopenharmony_ci	vzeroupper
145462306a36Sopenharmony_ci	RET
145562306a36Sopenharmony_ci.cfi_endproc
145662306a36Sopenharmony_ci___
145762306a36Sopenharmony_ci&end_function("poly1305_blocks_avx");
145862306a36Sopenharmony_ci
# poly1305_emit_avx: finalize the accumulated hash and write the 16-byte
# result to $mac.
#
# Operands are the generator variables $ctx, $nonce and $mac; their register
# assignments are made outside this section.  The (32, 3) arguments are
# interpreted by declare_function, which is also defined elsewhere.
# NOTE(review): presumably alignment and argument count -- confirm there.
#
# Flow of the emitted code:
#   1. If the 32-bit is_base2_26 flag at 20($ctx) is zero, jump to .Lemit
#      (label defined outside this section; presumably the base 2^64 emit
#      path -- confirm).
#   2. Otherwise load the five 32-bit limbs h[0..4] from 0..16($ctx) and
#      repack them into three 64-bit words:
#        h0 (%r8)  = h[0] + (h[1] << 26) + (h[2] << 52)   (low 64 bits)
#        h1 (%r9)  = (h[2] >> 12) + (h[3] << 14) + (h[4] << 40) + carry
#        h2 (%r10) = (h[4] >> 24) + carry
#   3. The stored limbs may be only partially reduced, so everything in h2
#      above its low two bits is folded back into h0:
#      (h2 >> 2) + (h2 & ~3) equals 5 * (h2 >> 2).  h2 is masked to two
#      bits and the carries ripple up through h1 and h2.
#   4. Final reduction without a branch: compute h + 5 while keeping a copy
#      of h0/h1 in %rax/%rcx.  The interleaved mov instructions leave the
#      flags alone, so the add/adc carry chain stays intact.  `shr $2` on
#      h2 leaves ZF clear only when h + 5 reached bit 130, and cmovnz then
#      selects the low 128 bits of h + 5 instead of h.
#   5. Add the 128-bit value at $nonce (carry out of the top word is
#      dropped) and store the two 64-bit words at 0($mac) and 8($mac).
#
# Scratch registers written: %rax, %rcx, %r8, %r9, %r10, %r11.
# Instruction order matters throughout (carry and zero flags are consumed
# several instructions after they are produced); do not reorder casually.
145962306a36Sopenharmony_ci&declare_function("poly1305_emit_avx", 32, 3);
146062306a36Sopenharmony_ci$code.=<<___;
146162306a36Sopenharmony_ci	cmpl	\$0,20($ctx)	# is_base2_26?
146262306a36Sopenharmony_ci	je	.Lemit
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_ci	mov	0($ctx),%eax	# load hash value base 2^26
146562306a36Sopenharmony_ci	mov	4($ctx),%ecx
146662306a36Sopenharmony_ci	mov	8($ctx),%r8d
146762306a36Sopenharmony_ci	mov	12($ctx),%r11d
146862306a36Sopenharmony_ci	mov	16($ctx),%r10d
146962306a36Sopenharmony_ci
147062306a36Sopenharmony_ci	shl	\$26,%rcx	# base 2^26 -> base 2^64
147162306a36Sopenharmony_ci	mov	%r8,%r9
147262306a36Sopenharmony_ci	shl	\$52,%r8
147362306a36Sopenharmony_ci	add	%rcx,%rax
147462306a36Sopenharmony_ci	shr	\$12,%r9
147562306a36Sopenharmony_ci	add	%rax,%r8	# h0
147662306a36Sopenharmony_ci	adc	\$0,%r9
147762306a36Sopenharmony_ci
147862306a36Sopenharmony_ci	shl	\$14,%r11
147962306a36Sopenharmony_ci	mov	%r10,%rax
148062306a36Sopenharmony_ci	shr	\$24,%r10
148162306a36Sopenharmony_ci	add	%r11,%r9
148262306a36Sopenharmony_ci	shl	\$40,%rax
148362306a36Sopenharmony_ci	add	%rax,%r9	# h1
148462306a36Sopenharmony_ci	adc	\$0,%r10	# h2
148562306a36Sopenharmony_ci
148662306a36Sopenharmony_ci	mov	%r10,%rax	# could be partially reduced, so reduce
148762306a36Sopenharmony_ci	mov	%r10,%rcx
148862306a36Sopenharmony_ci	and	\$3,%r10
148962306a36Sopenharmony_ci	shr	\$2,%rax
149062306a36Sopenharmony_ci	and	\$-4,%rcx
149162306a36Sopenharmony_ci	add	%rcx,%rax
149262306a36Sopenharmony_ci	add	%rax,%r8
149362306a36Sopenharmony_ci	adc	\$0,%r9
149462306a36Sopenharmony_ci	adc	\$0,%r10
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_ci	mov	%r8,%rax
149762306a36Sopenharmony_ci	add	\$5,%r8		# compare to modulus
149862306a36Sopenharmony_ci	mov	%r9,%rcx
149962306a36Sopenharmony_ci	adc	\$0,%r9
150062306a36Sopenharmony_ci	adc	\$0,%r10
150162306a36Sopenharmony_ci	shr	\$2,%r10	# did 130-bit value overflow?
150262306a36Sopenharmony_ci	cmovnz	%r8,%rax
150362306a36Sopenharmony_ci	cmovnz	%r9,%rcx
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_ci	add	0($nonce),%rax	# accumulate nonce
150662306a36Sopenharmony_ci	adc	8($nonce),%rcx
150762306a36Sopenharmony_ci	mov	%rax,0($mac)	# write result
150862306a36Sopenharmony_ci	mov	%rcx,8($mac)
150962306a36Sopenharmony_ci
151062306a36Sopenharmony_ci	RET
151162306a36Sopenharmony_ci___
151262306a36Sopenharmony_ci&end_function("poly1305_emit_avx");
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_ciif ($avx>1) {
151562306a36Sopenharmony_ci
# Register names for the AVX2 code that follows.  The sixteen names are
# bound, in list order, to %ymm0..%ymm15:
#   $H0-$H4 = %ymm0-%ymm4
#   $MASK   = %ymm5
#   $T4     = %ymm6          (note: $T4 comes before $T0 in the list)
#   $T0-$T3 = %ymm7-%ymm10
#   $D0-$D4 = %ymm11-%ymm15
# $S4 is a second name for the same register as $MASK (%ymm5), so any
# instruction that loads a value into $S4 overwrites the mask held there.
151662306a36Sopenharmony_cimy ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
151762306a36Sopenharmony_ci    map("%ymm$_",(0..15));
151862306a36Sopenharmony_cimy $S4=$MASK;
151962306a36Sopenharmony_ci
152062306a36Sopenharmony_cisub poly1305_blocks_avxN {
152162306a36Sopenharmony_ci	my ($avx512) = @_;
152262306a36Sopenharmony_ci	my $suffix = $avx512 ? "_avx512" : "";
152362306a36Sopenharmony_ci$code.=<<___;
152462306a36Sopenharmony_ci.cfi_startproc
152562306a36Sopenharmony_ci	mov	20($ctx),%r8d		# is_base2_26
152662306a36Sopenharmony_ci	cmp	\$128,$len
152762306a36Sopenharmony_ci	jae	.Lblocks_avx2$suffix
152862306a36Sopenharmony_ci	test	%r8d,%r8d
152962306a36Sopenharmony_ci	jz	.Lblocks
153062306a36Sopenharmony_ci
153162306a36Sopenharmony_ci.Lblocks_avx2$suffix:
153262306a36Sopenharmony_ci	and	\$-16,$len
153362306a36Sopenharmony_ci	jz	.Lno_data_avx2$suffix
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ci	vzeroupper
153662306a36Sopenharmony_ci
153762306a36Sopenharmony_ci	test	%r8d,%r8d
153862306a36Sopenharmony_ci	jz	.Lbase2_64_avx2$suffix
153962306a36Sopenharmony_ci
154062306a36Sopenharmony_ci	test	\$63,$len
154162306a36Sopenharmony_ci	jz	.Leven_avx2$suffix
154262306a36Sopenharmony_ci
154362306a36Sopenharmony_ci	push	%rbp
154462306a36Sopenharmony_ci.cfi_push	%rbp
154562306a36Sopenharmony_ci	mov 	%rsp,%rbp
154662306a36Sopenharmony_ci	push	%rbx
154762306a36Sopenharmony_ci.cfi_push	%rbx
154862306a36Sopenharmony_ci	push	%r12
154962306a36Sopenharmony_ci.cfi_push	%r12
155062306a36Sopenharmony_ci	push	%r13
155162306a36Sopenharmony_ci.cfi_push	%r13
155262306a36Sopenharmony_ci	push	%r14
155362306a36Sopenharmony_ci.cfi_push	%r14
155462306a36Sopenharmony_ci	push	%r15
155562306a36Sopenharmony_ci.cfi_push	%r15
155662306a36Sopenharmony_ci.Lblocks_avx2_body$suffix:
155762306a36Sopenharmony_ci
155862306a36Sopenharmony_ci	mov	$len,%r15		# reassign $len
155962306a36Sopenharmony_ci
156062306a36Sopenharmony_ci	mov	0($ctx),$d1		# load hash value
156162306a36Sopenharmony_ci	mov	8($ctx),$d2
156262306a36Sopenharmony_ci	mov	16($ctx),$h2#d
156362306a36Sopenharmony_ci
156462306a36Sopenharmony_ci	mov	24($ctx),$r0		# load r
156562306a36Sopenharmony_ci	mov	32($ctx),$s1
156662306a36Sopenharmony_ci
156762306a36Sopenharmony_ci	################################# base 2^26 -> base 2^64
156862306a36Sopenharmony_ci	mov	$d1#d,$h0#d
156962306a36Sopenharmony_ci	and	\$`-1*(1<<31)`,$d1
157062306a36Sopenharmony_ci	mov	$d2,$r1			# borrow $r1
157162306a36Sopenharmony_ci	mov	$d2#d,$h1#d
157262306a36Sopenharmony_ci	and	\$`-1*(1<<31)`,$d2
157362306a36Sopenharmony_ci
157462306a36Sopenharmony_ci	shr	\$6,$d1
157562306a36Sopenharmony_ci	shl	\$52,$r1
157662306a36Sopenharmony_ci	add	$d1,$h0
157762306a36Sopenharmony_ci	shr	\$12,$h1
157862306a36Sopenharmony_ci	shr	\$18,$d2
157962306a36Sopenharmony_ci	add	$r1,$h0
158062306a36Sopenharmony_ci	adc	$d2,$h1
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci	mov	$h2,$d1
158362306a36Sopenharmony_ci	shl	\$40,$d1
158462306a36Sopenharmony_ci	shr	\$24,$h2
158562306a36Sopenharmony_ci	add	$d1,$h1
158662306a36Sopenharmony_ci	adc	\$0,$h2			# can be partially reduced...
158762306a36Sopenharmony_ci
158862306a36Sopenharmony_ci	mov	\$-4,$d2		# ... so reduce
158962306a36Sopenharmony_ci	mov	$h2,$d1
159062306a36Sopenharmony_ci	and	$h2,$d2
159162306a36Sopenharmony_ci	shr	\$2,$d1
159262306a36Sopenharmony_ci	and	\$3,$h2
159362306a36Sopenharmony_ci	add	$d2,$d1			# =*5
159462306a36Sopenharmony_ci	add	$d1,$h0
159562306a36Sopenharmony_ci	adc	\$0,$h1
159662306a36Sopenharmony_ci	adc	\$0,$h2
159762306a36Sopenharmony_ci
159862306a36Sopenharmony_ci	mov	$s1,$r1
159962306a36Sopenharmony_ci	mov	$s1,%rax
160062306a36Sopenharmony_ci	shr	\$2,$s1
160162306a36Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
160262306a36Sopenharmony_ci
160362306a36Sopenharmony_ci.Lbase2_26_pre_avx2$suffix:
160462306a36Sopenharmony_ci	add	0($inp),$h0		# accumulate input
160562306a36Sopenharmony_ci	adc	8($inp),$h1
160662306a36Sopenharmony_ci	lea	16($inp),$inp
160762306a36Sopenharmony_ci	adc	$padbit,$h2
160862306a36Sopenharmony_ci	sub	\$16,%r15
160962306a36Sopenharmony_ci
161062306a36Sopenharmony_ci	call	__poly1305_block
161162306a36Sopenharmony_ci	mov	$r1,%rax
161262306a36Sopenharmony_ci
161362306a36Sopenharmony_ci	test	\$63,%r15
161462306a36Sopenharmony_ci	jnz	.Lbase2_26_pre_avx2$suffix
161562306a36Sopenharmony_ci
161662306a36Sopenharmony_ci	test	$padbit,$padbit		# if $padbit is zero,
161762306a36Sopenharmony_ci	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
161862306a36Sopenharmony_ci
161962306a36Sopenharmony_ci	################################# base 2^64 -> base 2^26
162062306a36Sopenharmony_ci	mov	$h0,%rax
162162306a36Sopenharmony_ci	mov	$h0,%rdx
162262306a36Sopenharmony_ci	shr	\$52,$h0
162362306a36Sopenharmony_ci	mov	$h1,$r0
162462306a36Sopenharmony_ci	mov	$h1,$r1
162562306a36Sopenharmony_ci	shr	\$26,%rdx
162662306a36Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
162762306a36Sopenharmony_ci	shl	\$12,$r0
162862306a36Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
162962306a36Sopenharmony_ci	shr	\$14,$h1
163062306a36Sopenharmony_ci	or	$r0,$h0
163162306a36Sopenharmony_ci	shl	\$24,$h2
163262306a36Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
163362306a36Sopenharmony_ci	shr	\$40,$r1
163462306a36Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
163562306a36Sopenharmony_ci	or	$r1,$h2			# h[4]
163662306a36Sopenharmony_ci
163762306a36Sopenharmony_ci	test	%r15,%r15
163862306a36Sopenharmony_ci	jz	.Lstore_base2_26_avx2$suffix
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	vmovd	%rax#d,%x#$H0
164162306a36Sopenharmony_ci	vmovd	%rdx#d,%x#$H1
164262306a36Sopenharmony_ci	vmovd	$h0#d,%x#$H2
164362306a36Sopenharmony_ci	vmovd	$h1#d,%x#$H3
164462306a36Sopenharmony_ci	vmovd	$h2#d,%x#$H4
164562306a36Sopenharmony_ci	jmp	.Lproceed_avx2$suffix
164662306a36Sopenharmony_ci
164762306a36Sopenharmony_ci.align	32
164862306a36Sopenharmony_ci.Lstore_base2_64_avx2$suffix:
164962306a36Sopenharmony_ci	mov	$h0,0($ctx)
165062306a36Sopenharmony_ci	mov	$h1,8($ctx)
165162306a36Sopenharmony_ci	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
165262306a36Sopenharmony_ci	jmp	.Ldone_avx2$suffix
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci.align	16
165562306a36Sopenharmony_ci.Lstore_base2_26_avx2$suffix:
165662306a36Sopenharmony_ci	mov	%rax#d,0($ctx)		# store hash value base 2^26
165762306a36Sopenharmony_ci	mov	%rdx#d,4($ctx)
165862306a36Sopenharmony_ci	mov	$h0#d,8($ctx)
165962306a36Sopenharmony_ci	mov	$h1#d,12($ctx)
166062306a36Sopenharmony_ci	mov	$h2#d,16($ctx)
166162306a36Sopenharmony_ci.align	16
166262306a36Sopenharmony_ci.Ldone_avx2$suffix:
166362306a36Sopenharmony_ci	pop 		%r15
166462306a36Sopenharmony_ci.cfi_restore	%r15
166562306a36Sopenharmony_ci	pop 		%r14
166662306a36Sopenharmony_ci.cfi_restore	%r14
166762306a36Sopenharmony_ci	pop 		%r13
166862306a36Sopenharmony_ci.cfi_restore	%r13
166962306a36Sopenharmony_ci	pop 		%r12
167062306a36Sopenharmony_ci.cfi_restore	%r12
167162306a36Sopenharmony_ci	pop 		%rbx
167262306a36Sopenharmony_ci.cfi_restore	%rbx
167362306a36Sopenharmony_ci	pop 		%rbp
167462306a36Sopenharmony_ci.cfi_restore 	%rbp
167562306a36Sopenharmony_ci.Lno_data_avx2$suffix:
167662306a36Sopenharmony_ci.Lblocks_avx2_epilogue$suffix:
167762306a36Sopenharmony_ci	RET
167862306a36Sopenharmony_ci.cfi_endproc
167962306a36Sopenharmony_ci
168062306a36Sopenharmony_ci.align	32
168162306a36Sopenharmony_ci.Lbase2_64_avx2$suffix:
168262306a36Sopenharmony_ci.cfi_startproc
168362306a36Sopenharmony_ci	push	%rbp
168462306a36Sopenharmony_ci.cfi_push	%rbp
168562306a36Sopenharmony_ci	mov 	%rsp,%rbp
168662306a36Sopenharmony_ci	push	%rbx
168762306a36Sopenharmony_ci.cfi_push	%rbx
168862306a36Sopenharmony_ci	push	%r12
168962306a36Sopenharmony_ci.cfi_push	%r12
169062306a36Sopenharmony_ci	push	%r13
169162306a36Sopenharmony_ci.cfi_push	%r13
169262306a36Sopenharmony_ci	push	%r14
169362306a36Sopenharmony_ci.cfi_push	%r14
169462306a36Sopenharmony_ci	push	%r15
169562306a36Sopenharmony_ci.cfi_push	%r15
169662306a36Sopenharmony_ci.Lbase2_64_avx2_body$suffix:
169762306a36Sopenharmony_ci
169862306a36Sopenharmony_ci	mov	$len,%r15		# reassign $len
169962306a36Sopenharmony_ci
170062306a36Sopenharmony_ci	mov	24($ctx),$r0		# load r
170162306a36Sopenharmony_ci	mov	32($ctx),$s1
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci	mov	0($ctx),$h0		# load hash value
170462306a36Sopenharmony_ci	mov	8($ctx),$h1
170562306a36Sopenharmony_ci	mov	16($ctx),$h2#d
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_ci	mov	$s1,$r1
170862306a36Sopenharmony_ci	mov	$s1,%rax
170962306a36Sopenharmony_ci	shr	\$2,$s1
171062306a36Sopenharmony_ci	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
171162306a36Sopenharmony_ci
171262306a36Sopenharmony_ci	test	\$63,$len
171362306a36Sopenharmony_ci	jz	.Linit_avx2$suffix
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_ci.Lbase2_64_pre_avx2$suffix:
171662306a36Sopenharmony_ci	add	0($inp),$h0		# accumulate input
171762306a36Sopenharmony_ci	adc	8($inp),$h1
171862306a36Sopenharmony_ci	lea	16($inp),$inp
171962306a36Sopenharmony_ci	adc	$padbit,$h2
172062306a36Sopenharmony_ci	sub	\$16,%r15
172162306a36Sopenharmony_ci
172262306a36Sopenharmony_ci	call	__poly1305_block
172362306a36Sopenharmony_ci	mov	$r1,%rax
172462306a36Sopenharmony_ci
172562306a36Sopenharmony_ci	test	\$63,%r15
172662306a36Sopenharmony_ci	jnz	.Lbase2_64_pre_avx2$suffix
172762306a36Sopenharmony_ci
172862306a36Sopenharmony_ci.Linit_avx2$suffix:
172962306a36Sopenharmony_ci	################################# base 2^64 -> base 2^26
173062306a36Sopenharmony_ci	mov	$h0,%rax
173162306a36Sopenharmony_ci	mov	$h0,%rdx
173262306a36Sopenharmony_ci	shr	\$52,$h0
173362306a36Sopenharmony_ci	mov	$h1,$d1
173462306a36Sopenharmony_ci	mov	$h1,$d2
173562306a36Sopenharmony_ci	shr	\$26,%rdx
173662306a36Sopenharmony_ci	and	\$0x3ffffff,%rax	# h[0]
173762306a36Sopenharmony_ci	shl	\$12,$d1
173862306a36Sopenharmony_ci	and	\$0x3ffffff,%rdx	# h[1]
173962306a36Sopenharmony_ci	shr	\$14,$h1
174062306a36Sopenharmony_ci	or	$d1,$h0
174162306a36Sopenharmony_ci	shl	\$24,$h2
174262306a36Sopenharmony_ci	and	\$0x3ffffff,$h0		# h[2]
174362306a36Sopenharmony_ci	shr	\$40,$d2
174462306a36Sopenharmony_ci	and	\$0x3ffffff,$h1		# h[3]
174562306a36Sopenharmony_ci	or	$d2,$h2			# h[4]
174662306a36Sopenharmony_ci
174762306a36Sopenharmony_ci	vmovd	%rax#d,%x#$H0
174862306a36Sopenharmony_ci	vmovd	%rdx#d,%x#$H1
174962306a36Sopenharmony_ci	vmovd	$h0#d,%x#$H2
175062306a36Sopenharmony_ci	vmovd	$h1#d,%x#$H3
175162306a36Sopenharmony_ci	vmovd	$h2#d,%x#$H4
175262306a36Sopenharmony_ci	movl	\$1,20($ctx)		# set is_base2_26
175362306a36Sopenharmony_ci
175462306a36Sopenharmony_ci	call	__poly1305_init_avx
175562306a36Sopenharmony_ci
175662306a36Sopenharmony_ci.Lproceed_avx2$suffix:
175762306a36Sopenharmony_ci	mov	%r15,$len			# restore $len
175862306a36Sopenharmony_ci___
175962306a36Sopenharmony_ci$code.=<<___ if (!$kernel);
176062306a36Sopenharmony_ci	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
176162306a36Sopenharmony_ci	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
176262306a36Sopenharmony_ci___
176362306a36Sopenharmony_ci$code.=<<___;
176462306a36Sopenharmony_ci	pop 		%r15
176562306a36Sopenharmony_ci.cfi_restore	%r15
176662306a36Sopenharmony_ci	pop 		%r14
176762306a36Sopenharmony_ci.cfi_restore	%r14
176862306a36Sopenharmony_ci	pop 		%r13
176962306a36Sopenharmony_ci.cfi_restore	%r13
177062306a36Sopenharmony_ci	pop 		%r12
177162306a36Sopenharmony_ci.cfi_restore	%r12
177262306a36Sopenharmony_ci	pop 		%rbx
177362306a36Sopenharmony_ci.cfi_restore	%rbx
177462306a36Sopenharmony_ci	pop 		%rbp
177562306a36Sopenharmony_ci.cfi_restore 	%rbp
177662306a36Sopenharmony_ci.Lbase2_64_avx2_epilogue$suffix:
177762306a36Sopenharmony_ci	jmp	.Ldo_avx2$suffix
177862306a36Sopenharmony_ci.cfi_endproc
177962306a36Sopenharmony_ci
178062306a36Sopenharmony_ci.align	32
178162306a36Sopenharmony_ci.Leven_avx2$suffix:
178262306a36Sopenharmony_ci.cfi_startproc
178362306a36Sopenharmony_ci___
178462306a36Sopenharmony_ci$code.=<<___ if (!$kernel);
178562306a36Sopenharmony_ci	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
178662306a36Sopenharmony_ci___
178762306a36Sopenharmony_ci$code.=<<___;
178862306a36Sopenharmony_ci	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
178962306a36Sopenharmony_ci	vmovd		4*1($ctx),%x#$H1
179062306a36Sopenharmony_ci	vmovd		4*2($ctx),%x#$H2
179162306a36Sopenharmony_ci	vmovd		4*3($ctx),%x#$H3
179262306a36Sopenharmony_ci	vmovd		4*4($ctx),%x#$H4
179362306a36Sopenharmony_ci
179462306a36Sopenharmony_ci.Ldo_avx2$suffix:
179562306a36Sopenharmony_ci___
179662306a36Sopenharmony_ci$code.=<<___		if (!$kernel && $avx>2);
179762306a36Sopenharmony_ci	cmp		\$512,$len
179862306a36Sopenharmony_ci	jb		.Lskip_avx512
179962306a36Sopenharmony_ci	and		%r11d,%r9d
180062306a36Sopenharmony_ci	test		\$`1<<16`,%r9d		# check for AVX512F
180162306a36Sopenharmony_ci	jnz		.Lblocks_avx512
180262306a36Sopenharmony_ci.Lskip_avx512$suffix:
180362306a36Sopenharmony_ci___
180462306a36Sopenharmony_ci$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
180562306a36Sopenharmony_ci	cmp		\$512,$len
180662306a36Sopenharmony_ci	jae		.Lblocks_avx512
180762306a36Sopenharmony_ci___
180862306a36Sopenharmony_ci$code.=<<___	if (!$win64);
180962306a36Sopenharmony_ci	lea		8(%rsp),%r10
181062306a36Sopenharmony_ci.cfi_def_cfa_register	%r10
181162306a36Sopenharmony_ci	sub		\$0x128,%rsp
181262306a36Sopenharmony_ci___
181362306a36Sopenharmony_ci$code.=<<___	if ($win64);
181462306a36Sopenharmony_ci	lea		8(%rsp),%r10
181562306a36Sopenharmony_ci	sub		\$0x1c8,%rsp
181662306a36Sopenharmony_ci	vmovdqa		%xmm6,-0xb0(%r10)
181762306a36Sopenharmony_ci	vmovdqa		%xmm7,-0xa0(%r10)
181862306a36Sopenharmony_ci	vmovdqa		%xmm8,-0x90(%r10)
181962306a36Sopenharmony_ci	vmovdqa		%xmm9,-0x80(%r10)
182062306a36Sopenharmony_ci	vmovdqa		%xmm10,-0x70(%r10)
182162306a36Sopenharmony_ci	vmovdqa		%xmm11,-0x60(%r10)
182262306a36Sopenharmony_ci	vmovdqa		%xmm12,-0x50(%r10)
182362306a36Sopenharmony_ci	vmovdqa		%xmm13,-0x40(%r10)
182462306a36Sopenharmony_ci	vmovdqa		%xmm14,-0x30(%r10)
182562306a36Sopenharmony_ci	vmovdqa		%xmm15,-0x20(%r10)
182662306a36Sopenharmony_ci.Ldo_avx2_body$suffix:
182762306a36Sopenharmony_ci___
182862306a36Sopenharmony_ci$code.=<<___;
182962306a36Sopenharmony_ci	lea		.Lconst(%rip),%rcx
183062306a36Sopenharmony_ci	lea		48+64($ctx),$ctx	# size optimization
183162306a36Sopenharmony_ci	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
183262306a36Sopenharmony_ci
183362306a36Sopenharmony_ci	# expand and copy pre-calculated table to stack
183462306a36Sopenharmony_ci	vmovdqu		`16*0-64`($ctx),%x#$T2
183562306a36Sopenharmony_ci	and		\$-512,%rsp
183662306a36Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),%x#$T3
183762306a36Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),%x#$T4
183862306a36Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),%x#$D0
183962306a36Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),%x#$D1
184062306a36Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),%x#$D2
184162306a36Sopenharmony_ci	lea		0x90(%rsp),%rax		# size optimization
184262306a36Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),%x#$D3
184362306a36Sopenharmony_ci	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
184462306a36Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),%x#$D4
184562306a36Sopenharmony_ci	vpermd		$T3,$T0,$T3
184662306a36Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),%x#$MASK
184762306a36Sopenharmony_ci	vpermd		$T4,$T0,$T4
184862306a36Sopenharmony_ci	vmovdqa		$T2,0x00(%rsp)
184962306a36Sopenharmony_ci	vpermd		$D0,$T0,$D0
185062306a36Sopenharmony_ci	vmovdqa		$T3,0x20-0x90(%rax)
185162306a36Sopenharmony_ci	vpermd		$D1,$T0,$D1
185262306a36Sopenharmony_ci	vmovdqa		$T4,0x40-0x90(%rax)
185362306a36Sopenharmony_ci	vpermd		$D2,$T0,$D2
185462306a36Sopenharmony_ci	vmovdqa		$D0,0x60-0x90(%rax)
185562306a36Sopenharmony_ci	vpermd		$D3,$T0,$D3
185662306a36Sopenharmony_ci	vmovdqa		$D1,0x80-0x90(%rax)
185762306a36Sopenharmony_ci	vpermd		$D4,$T0,$D4
185862306a36Sopenharmony_ci	vmovdqa		$D2,0xa0-0x90(%rax)
185962306a36Sopenharmony_ci	vpermd		$MASK,$T0,$MASK
186062306a36Sopenharmony_ci	vmovdqa		$D3,0xc0-0x90(%rax)
186162306a36Sopenharmony_ci	vmovdqa		$D4,0xe0-0x90(%rax)
186262306a36Sopenharmony_ci	vmovdqa		$MASK,0x100-0x90(%rax)
186362306a36Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ci	################################################################
186662306a36Sopenharmony_ci	# load input
186762306a36Sopenharmony_ci	vmovdqu		16*0($inp),%x#$T0
186862306a36Sopenharmony_ci	vmovdqu		16*1($inp),%x#$T1
186962306a36Sopenharmony_ci	vinserti128	\$1,16*2($inp),$T0,$T0
187062306a36Sopenharmony_ci	vinserti128	\$1,16*3($inp),$T1,$T1
187162306a36Sopenharmony_ci	lea		16*4($inp),$inp
187262306a36Sopenharmony_ci
187362306a36Sopenharmony_ci	vpsrldq		\$6,$T0,$T2		# splat input
187462306a36Sopenharmony_ci	vpsrldq		\$6,$T1,$T3
187562306a36Sopenharmony_ci	vpunpckhqdq	$T1,$T0,$T4		# 4
187662306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T2		# 2:3
187762306a36Sopenharmony_ci	vpunpcklqdq	$T1,$T0,$T0		# 0:1
187862306a36Sopenharmony_ci
187962306a36Sopenharmony_ci	vpsrlq		\$30,$T2,$T3
188062306a36Sopenharmony_ci	vpsrlq		\$4,$T2,$T2
188162306a36Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
188262306a36Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
188362306a36Sopenharmony_ci	vpand		$MASK,$T2,$T2		# 2
188462306a36Sopenharmony_ci	vpand		$MASK,$T0,$T0		# 0
188562306a36Sopenharmony_ci	vpand		$MASK,$T1,$T1		# 1
188662306a36Sopenharmony_ci	vpand		$MASK,$T3,$T3		# 3
188762306a36Sopenharmony_ci	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
188862306a36Sopenharmony_ci
188962306a36Sopenharmony_ci	vpaddq		$H2,$T2,$H2		# accumulate input
189062306a36Sopenharmony_ci	sub		\$64,$len
189162306a36Sopenharmony_ci	jz		.Ltail_avx2$suffix
189262306a36Sopenharmony_ci	jmp		.Loop_avx2$suffix
189362306a36Sopenharmony_ci
189462306a36Sopenharmony_ci.align	32
189562306a36Sopenharmony_ci.Loop_avx2$suffix:
189662306a36Sopenharmony_ci	################################################################
189762306a36Sopenharmony_ci	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
189862306a36Sopenharmony_ci	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
189962306a36Sopenharmony_ci	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
190062306a36Sopenharmony_ci	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
190162306a36Sopenharmony_ci	#   \________/\__________/
190262306a36Sopenharmony_ci	################################################################
190362306a36Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
190462306a36Sopenharmony_ci	vpaddq		$H0,$T0,$H0
190562306a36Sopenharmony_ci	vmovdqa		`32*0`(%rsp),$T0	# r0^4
190662306a36Sopenharmony_ci	vpaddq		$H1,$T1,$H1
190762306a36Sopenharmony_ci	vmovdqa		`32*1`(%rsp),$T1	# r1^4
190862306a36Sopenharmony_ci	vpaddq		$H3,$T3,$H3
190962306a36Sopenharmony_ci	vmovdqa		`32*3`(%rsp),$T2	# r2^4
191062306a36Sopenharmony_ci	vpaddq		$H4,$T4,$H4
191162306a36Sopenharmony_ci	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
191262306a36Sopenharmony_ci	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
191362306a36Sopenharmony_ci
191462306a36Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
191562306a36Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
191662306a36Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
191762306a36Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
191862306a36Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
191962306a36Sopenharmony_ci	#
192062306a36Sopenharmony_ci	# however, as h2 is "chronologically" first one available pull
192162306a36Sopenharmony_ci	# corresponding operations up, so it's
192262306a36Sopenharmony_ci	#
192362306a36Sopenharmony_ci	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
192462306a36Sopenharmony_ci	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
192562306a36Sopenharmony_ci	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
192662306a36Sopenharmony_ci	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
192762306a36Sopenharmony_ci	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
192862306a36Sopenharmony_ci
192962306a36Sopenharmony_ci	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
193062306a36Sopenharmony_ci	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
193162306a36Sopenharmony_ci	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
193262306a36Sopenharmony_ci	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
193362306a36Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
193462306a36Sopenharmony_ci
193562306a36Sopenharmony_ci	vpmuludq	$H0,$T1,$T4		# h0*r1
193662306a36Sopenharmony_ci	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
193762306a36Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
193862306a36Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
193962306a36Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*r1
194062306a36Sopenharmony_ci	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
194162306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
194262306a36Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
194362306a36Sopenharmony_ci	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
194462306a36Sopenharmony_ci
194562306a36Sopenharmony_ci	vpmuludq	$H0,$T0,$T4		# h0*r0
194662306a36Sopenharmony_ci	vpmuludq	$H1,$T0,$H2		# h1*r0
194762306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
194862306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
194962306a36Sopenharmony_ci	vpmuludq	$H3,$T0,$T4		# h3*r0
195062306a36Sopenharmony_ci	vpmuludq	$H4,$T0,$H2		# h4*r0
195162306a36Sopenharmony_ci	 vmovdqu	16*0($inp),%x#$T0	# load input
195262306a36Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
195362306a36Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
195462306a36Sopenharmony_ci	 vinserti128	\$1,16*2($inp),$T0,$T0
195562306a36Sopenharmony_ci
195662306a36Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*s2
195762306a36Sopenharmony_ci	vpmuludq	$H4,$T1,$H2		# h4*s2
195862306a36Sopenharmony_ci	 vmovdqu	16*1($inp),%x#$T1
195962306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
196062306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
196162306a36Sopenharmony_ci	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
196262306a36Sopenharmony_ci	vpmuludq	$H1,$T2,$T4		# h1*r2
196362306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r2
196462306a36Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
196562306a36Sopenharmony_ci	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
196662306a36Sopenharmony_ci	 vinserti128	\$1,16*3($inp),$T1,$T1
196762306a36Sopenharmony_ci	 lea		16*4($inp),$inp
196862306a36Sopenharmony_ci
196962306a36Sopenharmony_ci	vpmuludq	$H1,$H2,$T4		# h1*r3
197062306a36Sopenharmony_ci	vpmuludq	$H0,$H2,$H2		# h0*r3
197162306a36Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2		# splat input
197262306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
197362306a36Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
197462306a36Sopenharmony_ci	vpmuludq	$H3,$T3,$T4		# h3*s3
197562306a36Sopenharmony_ci	vpmuludq	$H4,$T3,$H2		# h4*s3
197662306a36Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3
197762306a36Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
197862306a36Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
197962306a36Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
198062306a36Sopenharmony_ci
198162306a36Sopenharmony_ci	vpmuludq	$H3,$S4,$H3		# h3*s4
198262306a36Sopenharmony_ci	vpmuludq	$H4,$S4,$H4		# h4*s4
198362306a36Sopenharmony_ci	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
198462306a36Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
198562306a36Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
198662306a36Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
198762306a36Sopenharmony_ci	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
198862306a36Sopenharmony_ci	vpmuludq	$H1,$S4,$H0		# h1*s4
198962306a36Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
199062306a36Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
199162306a36Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
199262306a36Sopenharmony_ci
199362306a36Sopenharmony_ci	################################################################
199462306a36Sopenharmony_ci	# lazy reduction (interleaved with tail of input splat)
199562306a36Sopenharmony_ci
199662306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
199762306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
199862306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
199962306a36Sopenharmony_ci
200062306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
200162306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
200262306a36Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
200362306a36Sopenharmony_ci
200462306a36Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
200562306a36Sopenharmony_ci	vpand		$MASK,$H4,$H4
200662306a36Sopenharmony_ci
200762306a36Sopenharmony_ci	 vpsrlq		\$4,$T3,$T2
200862306a36Sopenharmony_ci
200962306a36Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
201062306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1
201162306a36Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
201262306a36Sopenharmony_ci
201362306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0
201462306a36Sopenharmony_ci	vpsllq		\$2,$D4,$D4
201562306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
201662306a36Sopenharmony_ci
201762306a36Sopenharmony_ci	 vpand		$MASK,$T2,$T2		# 2
201862306a36Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
201962306a36Sopenharmony_ci
202062306a36Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
202162306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2
202262306a36Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
202362306a36Sopenharmony_ci
202462306a36Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
202562306a36Sopenharmony_ci	 vpsrlq		\$30,$T3,$T3
202662306a36Sopenharmony_ci
202762306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
202862306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
202962306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
203062306a36Sopenharmony_ci
203162306a36Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
203262306a36Sopenharmony_ci
203362306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
203462306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
203562306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
203662306a36Sopenharmony_ci
203762306a36Sopenharmony_ci	 vpand		$MASK,$T0,$T0		# 0
203862306a36Sopenharmony_ci	 vpand		$MASK,$T1,$T1		# 1
203962306a36Sopenharmony_ci	 vpand		$MASK,$T3,$T3		# 3
204062306a36Sopenharmony_ci	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
204162306a36Sopenharmony_ci
204262306a36Sopenharmony_ci	sub		\$64,$len
204362306a36Sopenharmony_ci	jnz		.Loop_avx2$suffix
204462306a36Sopenharmony_ci
204562306a36Sopenharmony_ci	.byte		0x66,0x90
204662306a36Sopenharmony_ci.Ltail_avx2$suffix:
204762306a36Sopenharmony_ci	################################################################
204862306a36Sopenharmony_ci	# while above multiplications were by r^4 in all lanes, in last
204962306a36Sopenharmony_ci	# iteration we multiply least significant lane by r^4 and most
205062306a36Sopenharmony_ci	# significant one by r, so copy of above except that references
205162306a36Sopenharmony_ci	# to the precomputed table are displaced by 4...
205262306a36Sopenharmony_ci
205362306a36Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
205462306a36Sopenharmony_ci	vpaddq		$H0,$T0,$H0
205562306a36Sopenharmony_ci	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
205662306a36Sopenharmony_ci	vpaddq		$H1,$T1,$H1
205762306a36Sopenharmony_ci	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
205862306a36Sopenharmony_ci	vpaddq		$H3,$T3,$H3
205962306a36Sopenharmony_ci	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
206062306a36Sopenharmony_ci	vpaddq		$H4,$T4,$H4
206162306a36Sopenharmony_ci	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
206262306a36Sopenharmony_ci	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
206362306a36Sopenharmony_ci
206462306a36Sopenharmony_ci	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
206562306a36Sopenharmony_ci	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
206662306a36Sopenharmony_ci	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
206762306a36Sopenharmony_ci	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
206862306a36Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
206962306a36Sopenharmony_ci
207062306a36Sopenharmony_ci	vpmuludq	$H0,$T1,$T4		# h0*r1
207162306a36Sopenharmony_ci	vpmuludq	$H1,$T1,$H2		# h1*r1
207262306a36Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
207362306a36Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
207462306a36Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*r1
207562306a36Sopenharmony_ci	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
207662306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
207762306a36Sopenharmony_ci	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
207862306a36Sopenharmony_ci
207962306a36Sopenharmony_ci	vpmuludq	$H0,$T0,$T4		# h0*r0
208062306a36Sopenharmony_ci	vpmuludq	$H1,$T0,$H2		# h1*r0
208162306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
208262306a36Sopenharmony_ci	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
208362306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
208462306a36Sopenharmony_ci	vpmuludq	$H3,$T0,$T4		# h3*r0
208562306a36Sopenharmony_ci	vpmuludq	$H4,$T0,$H2		# h4*r0
208662306a36Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
208762306a36Sopenharmony_ci	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
208862306a36Sopenharmony_ci
208962306a36Sopenharmony_ci	vpmuludq	$H3,$T1,$T4		# h3*s2
209062306a36Sopenharmony_ci	vpmuludq	$H4,$T1,$H2		# h4*s2
209162306a36Sopenharmony_ci	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
209262306a36Sopenharmony_ci	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
209362306a36Sopenharmony_ci	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
209462306a36Sopenharmony_ci	vpmuludq	$H1,$T2,$T4		# h1*r2
209562306a36Sopenharmony_ci	vpmuludq	$H0,$T2,$T2		# h0*r2
209662306a36Sopenharmony_ci	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
209762306a36Sopenharmony_ci	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
209862306a36Sopenharmony_ci
209962306a36Sopenharmony_ci	vpmuludq	$H1,$H2,$T4		# h1*r3
210062306a36Sopenharmony_ci	vpmuludq	$H0,$H2,$H2		# h0*r3
210162306a36Sopenharmony_ci	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
210262306a36Sopenharmony_ci	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
210362306a36Sopenharmony_ci	vpmuludq	$H3,$T3,$T4		# h3*s3
210462306a36Sopenharmony_ci	vpmuludq	$H4,$T3,$H2		# h4*s3
210562306a36Sopenharmony_ci	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
210662306a36Sopenharmony_ci	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
210762306a36Sopenharmony_ci
210862306a36Sopenharmony_ci	vpmuludq	$H3,$S4,$H3		# h3*s4
210962306a36Sopenharmony_ci	vpmuludq	$H4,$S4,$H4		# h4*s4
211062306a36Sopenharmony_ci	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
211162306a36Sopenharmony_ci	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
211262306a36Sopenharmony_ci	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
211362306a36Sopenharmony_ci	vpmuludq	$H1,$S4,$H0		# h1*s4
211462306a36Sopenharmony_ci	vmovdqa		64(%rcx),$MASK		# .Lmask26
211562306a36Sopenharmony_ci	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
211662306a36Sopenharmony_ci	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
211762306a36Sopenharmony_ci
211862306a36Sopenharmony_ci	################################################################
211962306a36Sopenharmony_ci	# horizontal addition
212062306a36Sopenharmony_ci
212162306a36Sopenharmony_ci	vpsrldq		\$8,$D1,$T1
212262306a36Sopenharmony_ci	vpsrldq		\$8,$H2,$T2
212362306a36Sopenharmony_ci	vpsrldq		\$8,$H3,$T3
212462306a36Sopenharmony_ci	vpsrldq		\$8,$H4,$T4
212562306a36Sopenharmony_ci	vpsrldq		\$8,$H0,$T0
212662306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1
212762306a36Sopenharmony_ci	vpaddq		$T2,$H2,$H2
212862306a36Sopenharmony_ci	vpaddq		$T3,$H3,$H3
212962306a36Sopenharmony_ci	vpaddq		$T4,$H4,$H4
213062306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
213162306a36Sopenharmony_ci
213262306a36Sopenharmony_ci	vpermq		\$0x2,$H3,$T3
213362306a36Sopenharmony_ci	vpermq		\$0x2,$H4,$T4
213462306a36Sopenharmony_ci	vpermq		\$0x2,$H0,$T0
213562306a36Sopenharmony_ci	vpermq		\$0x2,$D1,$T1
213662306a36Sopenharmony_ci	vpermq		\$0x2,$H2,$T2
213762306a36Sopenharmony_ci	vpaddq		$T3,$H3,$H3
213862306a36Sopenharmony_ci	vpaddq		$T4,$H4,$H4
213962306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
214062306a36Sopenharmony_ci	vpaddq		$T1,$D1,$D1
214162306a36Sopenharmony_ci	vpaddq		$T2,$H2,$H2
214262306a36Sopenharmony_ci
214362306a36Sopenharmony_ci	################################################################
214462306a36Sopenharmony_ci	# lazy reduction
214562306a36Sopenharmony_ci
214662306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
214762306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
214862306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
214962306a36Sopenharmony_ci
215062306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
215162306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
215262306a36Sopenharmony_ci	vpaddq		$D0,$D1,$H1		# h0 -> h1
215362306a36Sopenharmony_ci
215462306a36Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
215562306a36Sopenharmony_ci	vpand		$MASK,$H4,$H4
215662306a36Sopenharmony_ci
215762306a36Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
215862306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1
215962306a36Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
216062306a36Sopenharmony_ci
216162306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0
216262306a36Sopenharmony_ci	vpsllq		\$2,$D4,$D4
216362306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
216462306a36Sopenharmony_ci
216562306a36Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
216662306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2
216762306a36Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
216862306a36Sopenharmony_ci
216962306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
217062306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
217162306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
217262306a36Sopenharmony_ci
217362306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
217462306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
217562306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
217662306a36Sopenharmony_ci
217762306a36Sopenharmony_ci	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
217862306a36Sopenharmony_ci	vmovd		%x#$H1,`4*1-48-64`($ctx)
217962306a36Sopenharmony_ci	vmovd		%x#$H2,`4*2-48-64`($ctx)
218062306a36Sopenharmony_ci	vmovd		%x#$H3,`4*3-48-64`($ctx)
218162306a36Sopenharmony_ci	vmovd		%x#$H4,`4*4-48-64`($ctx)
218262306a36Sopenharmony_ci___
218362306a36Sopenharmony_ci$code.=<<___	if ($win64);	# Win64-only part of the epilogue
218462306a36Sopenharmony_ci	vmovdqa		-0xb0(%r10),%xmm6	# reload %xmm6-%xmm15 from the slots below %r10
218562306a36Sopenharmony_ci	vmovdqa		-0xa0(%r10),%xmm7
218662306a36Sopenharmony_ci	vmovdqa		-0x90(%r10),%xmm8
218762306a36Sopenharmony_ci	vmovdqa		-0x80(%r10),%xmm9
218862306a36Sopenharmony_ci	vmovdqa		-0x70(%r10),%xmm10
218962306a36Sopenharmony_ci	vmovdqa		-0x60(%r10),%xmm11
219062306a36Sopenharmony_ci	vmovdqa		-0x50(%r10),%xmm12
219162306a36Sopenharmony_ci	vmovdqa		-0x40(%r10),%xmm13
219262306a36Sopenharmony_ci	vmovdqa		-0x30(%r10),%xmm14
219362306a36Sopenharmony_ci	vmovdqa		-0x20(%r10),%xmm15
219462306a36Sopenharmony_ci	lea		-8(%r10),%rsp		# %rsp = %r10-8, drops the local frame
219562306a36Sopenharmony_ci.Ldo_avx2_epilogue$suffix:
219662306a36Sopenharmony_ci___
219762306a36Sopenharmony_ci$code.=<<___	if (!$win64);	# non-Win64: nothing to reload, only unwind the stack
219862306a36Sopenharmony_ci	lea		-8(%r10),%rsp		# %rsp = %r10-8, drops the local frame
219962306a36Sopenharmony_ci.cfi_def_cfa_register	%rsp
220062306a36Sopenharmony_ci___
220162306a36Sopenharmony_ci$code.=<<___;	# common return path
220262306a36Sopenharmony_ci	vzeroupper				# zero the upper vector-register bits before returning
220362306a36Sopenharmony_ci	RET
220462306a36Sopenharmony_ci.cfi_endproc
220562306a36Sopenharmony_ci___
220662306a36Sopenharmony_ciif($avx > 2 && $avx512) {
220762306a36Sopenharmony_cimy ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));	# %zmm16-24: key-power limbs; S1-S4 later hold 5*R1-R4
220862306a36Sopenharmony_cimy ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));	# %zmm25-29: temporaries for partial products
220962306a36Sopenharmony_cimy $PADBIT="%zmm30";	# receives the broadcast of the .L129 constant
221062306a36Sopenharmony_ci
221162306a36Sopenharmony_cimap(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
221262306a36Sopenharmony_cimap(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));		# (rewrites the register names in place)
221362306a36Sopenharmony_cimap(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
221462306a36Sopenharmony_cimap(s/%y/%z/,($MASK));
221562306a36Sopenharmony_ci
221662306a36Sopenharmony_ci$code.=<<___;	# entry of the AVX-512 path
221762306a36Sopenharmony_ci.cfi_startproc
221862306a36Sopenharmony_ci.Lblocks_avx512:
221962306a36Sopenharmony_ci	mov		\$15,%eax
222062306a36Sopenharmony_ci	kmovw		%eax,%k2		# %k2 = 0x000f, write-mask for the {%k2} table stores
222162306a36Sopenharmony_ci___
222262306a36Sopenharmony_ci$code.=<<___	if (!$win64);	# non-Win64 frame setup
222362306a36Sopenharmony_ci	lea		8(%rsp),%r10		# %r10 = %rsp+8, anchor used to restore %rsp later
222462306a36Sopenharmony_ci.cfi_def_cfa_register	%r10
222562306a36Sopenharmony_ci	sub		\$0x128,%rsp		# reserve 0x128 bytes of stack
222662306a36Sopenharmony_ci___
222762306a36Sopenharmony_ci$code.=<<___	if ($win64);	# Win64 frame setup, also preserves %xmm6-%xmm15
222862306a36Sopenharmony_ci	lea		8(%rsp),%r10		# %r10 = %rsp+8, anchor used to restore %rsp later
222962306a36Sopenharmony_ci	sub		\$0x1c8,%rsp		# 0x128 + 0xa0 bytes: the extra 0xa0 holds ten %xmm slots
223062306a36Sopenharmony_ci	vmovdqa		%xmm6,-0xb0(%r10)	# spill %xmm6-%xmm15 into the slots below %r10
223162306a36Sopenharmony_ci	vmovdqa		%xmm7,-0xa0(%r10)
223262306a36Sopenharmony_ci	vmovdqa		%xmm8,-0x90(%r10)
223362306a36Sopenharmony_ci	vmovdqa		%xmm9,-0x80(%r10)
223462306a36Sopenharmony_ci	vmovdqa		%xmm10,-0x70(%r10)
223562306a36Sopenharmony_ci	vmovdqa		%xmm11,-0x60(%r10)
223662306a36Sopenharmony_ci	vmovdqa		%xmm12,-0x50(%r10)
223762306a36Sopenharmony_ci	vmovdqa		%xmm13,-0x40(%r10)
223862306a36Sopenharmony_ci	vmovdqa		%xmm14,-0x30(%r10)
223962306a36Sopenharmony_ci	vmovdqa		%xmm15,-0x20(%r10)
224062306a36Sopenharmony_ci.Ldo_avx512_body:
224162306a36Sopenharmony_ci___
224262306a36Sopenharmony_ci$code.=<<___;
224362306a36Sopenharmony_ci	lea		.Lconst(%rip),%rcx
224462306a36Sopenharmony_ci	lea		48+64($ctx),$ctx	# size optimization
224562306a36Sopenharmony_ci	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci	# expand pre-calculated table
224862306a36Sopenharmony_ci	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
224962306a36Sopenharmony_ci	and		\$-512,%rsp
225062306a36Sopenharmony_ci	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
225162306a36Sopenharmony_ci	mov		\$0x20,%rax
225262306a36Sopenharmony_ci	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
225362306a36Sopenharmony_ci	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
225462306a36Sopenharmony_ci	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
225562306a36Sopenharmony_ci	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
225662306a36Sopenharmony_ci	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
225762306a36Sopenharmony_ci	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
225862306a36Sopenharmony_ci	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
225962306a36Sopenharmony_ci	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
226062306a36Sopenharmony_ci	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
226162306a36Sopenharmony_ci	vpermd		$D1,$T2,$R1
226262306a36Sopenharmony_ci	vpermd		$T0,$T2,$S1
226362306a36Sopenharmony_ci	vpermd		$D2,$T2,$R2
226462306a36Sopenharmony_ci	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
226562306a36Sopenharmony_ci	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
226662306a36Sopenharmony_ci	vpermd		$T1,$T2,$S2
226762306a36Sopenharmony_ci	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
226862306a36Sopenharmony_ci	 vpsrlq		\$32,$R1,$T1
226962306a36Sopenharmony_ci	vpermd		$D3,$T2,$R3
227062306a36Sopenharmony_ci	vmovdqa64	$S1,0x40(%rsp){%k2}
227162306a36Sopenharmony_ci	vpermd		$T3,$T2,$S3
227262306a36Sopenharmony_ci	vpermd		$D4,$T2,$R4
227362306a36Sopenharmony_ci	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
227462306a36Sopenharmony_ci	vpermd		$T4,$T2,$S4
227562306a36Sopenharmony_ci	vmovdqa64	$S2,0x80(%rsp){%k2}
227662306a36Sopenharmony_ci	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
227762306a36Sopenharmony_ci	vmovdqa64	$S3,0xc0(%rsp){%k2}
227862306a36Sopenharmony_ci	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
227962306a36Sopenharmony_ci	vmovdqa64	$S4,0x100(%rsp){%k2}
228062306a36Sopenharmony_ci
228162306a36Sopenharmony_ci	################################################################
228262306a36Sopenharmony_ci	# calculate 5th through 8th powers of the key
228362306a36Sopenharmony_ci	#
228462306a36Sopenharmony_ci	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
228562306a36Sopenharmony_ci	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
228662306a36Sopenharmony_ci	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
228762306a36Sopenharmony_ci	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
228862306a36Sopenharmony_ci	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
228962306a36Sopenharmony_ci
229062306a36Sopenharmony_ci	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
229162306a36Sopenharmony_ci	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
229262306a36Sopenharmony_ci	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
229362306a36Sopenharmony_ci	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
229462306a36Sopenharmony_ci	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
229562306a36Sopenharmony_ci	 vpsrlq		\$32,$R2,$T2
229662306a36Sopenharmony_ci
229762306a36Sopenharmony_ci	vpmuludq	$T1,$S4,$M0
229862306a36Sopenharmony_ci	vpmuludq	$T1,$R0,$M1
229962306a36Sopenharmony_ci	vpmuludq	$T1,$R1,$M2
230062306a36Sopenharmony_ci	vpmuludq	$T1,$R2,$M3
230162306a36Sopenharmony_ci	vpmuludq	$T1,$R3,$M4
230262306a36Sopenharmony_ci	 vpsrlq		\$32,$R3,$T3
230362306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
230462306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
230562306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
230662306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
230762306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
230862306a36Sopenharmony_ci
230962306a36Sopenharmony_ci	vpmuludq	$T2,$S3,$M0
231062306a36Sopenharmony_ci	vpmuludq	$T2,$S4,$M1
231162306a36Sopenharmony_ci	vpmuludq	$T2,$R1,$M3
231262306a36Sopenharmony_ci	vpmuludq	$T2,$R2,$M4
231362306a36Sopenharmony_ci	vpmuludq	$T2,$R0,$M2
231462306a36Sopenharmony_ci	 vpsrlq		\$32,$R4,$T4
231562306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
231662306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
231762306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
231862306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
231962306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
232062306a36Sopenharmony_ci
232162306a36Sopenharmony_ci	vpmuludq	$T3,$S2,$M0
232262306a36Sopenharmony_ci	vpmuludq	$T3,$R0,$M3
232362306a36Sopenharmony_ci	vpmuludq	$T3,$R1,$M4
232462306a36Sopenharmony_ci	vpmuludq	$T3,$S3,$M1
232562306a36Sopenharmony_ci	vpmuludq	$T3,$S4,$M2
232662306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
232762306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
232862306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
232962306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
233062306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
233162306a36Sopenharmony_ci
233262306a36Sopenharmony_ci	vpmuludq	$T4,$S4,$M3
233362306a36Sopenharmony_ci	vpmuludq	$T4,$R0,$M4
233462306a36Sopenharmony_ci	vpmuludq	$T4,$S1,$M0
233562306a36Sopenharmony_ci	vpmuludq	$T4,$S2,$M1
233662306a36Sopenharmony_ci	vpmuludq	$T4,$S3,$M2
233762306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
233862306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
233962306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
234062306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
234162306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
234262306a36Sopenharmony_ci
234362306a36Sopenharmony_ci	################################################################
234462306a36Sopenharmony_ci	# load input
234562306a36Sopenharmony_ci	vmovdqu64	16*0($inp),%z#$T3
234662306a36Sopenharmony_ci	vmovdqu64	16*4($inp),%z#$T4
234762306a36Sopenharmony_ci	lea		16*8($inp),$inp
234862306a36Sopenharmony_ci
234962306a36Sopenharmony_ci	################################################################
235062306a36Sopenharmony_ci	# lazy reduction
235162306a36Sopenharmony_ci
235262306a36Sopenharmony_ci	vpsrlq		\$26,$D3,$M3
235362306a36Sopenharmony_ci	vpandq		$MASK,$D3,$D3
235462306a36Sopenharmony_ci	vpaddq		$M3,$D4,$D4		# d3 -> d4
235562306a36Sopenharmony_ci
235662306a36Sopenharmony_ci	vpsrlq		\$26,$D0,$M0
235762306a36Sopenharmony_ci	vpandq		$MASK,$D0,$D0
235862306a36Sopenharmony_ci	vpaddq		$M0,$D1,$D1		# d0 -> d1
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_ci	vpsrlq		\$26,$D4,$M4
236162306a36Sopenharmony_ci	vpandq		$MASK,$D4,$D4
236262306a36Sopenharmony_ci
236362306a36Sopenharmony_ci	vpsrlq		\$26,$D1,$M1
236462306a36Sopenharmony_ci	vpandq		$MASK,$D1,$D1
236562306a36Sopenharmony_ci	vpaddq		$M1,$D2,$D2		# d1 -> d2
236662306a36Sopenharmony_ci
236762306a36Sopenharmony_ci	vpaddq		$M4,$D0,$D0
236862306a36Sopenharmony_ci	vpsllq		\$2,$M4,$M4
236962306a36Sopenharmony_ci	vpaddq		$M4,$D0,$D0		# d4 -> d0
237062306a36Sopenharmony_ci
237162306a36Sopenharmony_ci	vpsrlq		\$26,$D2,$M2
237262306a36Sopenharmony_ci	vpandq		$MASK,$D2,$D2
237362306a36Sopenharmony_ci	vpaddq		$M2,$D3,$D3		# d2 -> d3
237462306a36Sopenharmony_ci
237562306a36Sopenharmony_ci	vpsrlq		\$26,$D0,$M0
237662306a36Sopenharmony_ci	vpandq		$MASK,$D0,$D0
237762306a36Sopenharmony_ci	vpaddq		$M0,$D1,$D1		# d0 -> d1
237862306a36Sopenharmony_ci
237962306a36Sopenharmony_ci	vpsrlq		\$26,$D3,$M3
238062306a36Sopenharmony_ci	vpandq		$MASK,$D3,$D3
238162306a36Sopenharmony_ci	vpaddq		$M3,$D4,$D4		# d3 -> d4
238262306a36Sopenharmony_ci
238362306a36Sopenharmony_ci	################################################################
238462306a36Sopenharmony_ci	# at this point we have 14243444 in $R0-$S4 and 05060708 in
238562306a36Sopenharmony_ci	# $D0-$D4, ...
238662306a36Sopenharmony_ci
238762306a36Sopenharmony_ci	vpunpcklqdq	$T4,$T3,$T0	# transpose input
238862306a36Sopenharmony_ci	vpunpckhqdq	$T4,$T3,$T4
238962306a36Sopenharmony_ci
239062306a36Sopenharmony_ci	# ... since input 64-bit lanes are ordered as 73625140, we could
239162306a36Sopenharmony_ci	# "vperm" it to 76543210 (here and in each loop iteration), *or*
239262306a36Sopenharmony_ci	# we could just flow along, hence the goal for $R0-$S4 is
239362306a36Sopenharmony_ci	# 1858286838784888 ...
239462306a36Sopenharmony_ci
239562306a36Sopenharmony_ci	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
239662306a36Sopenharmony_ci	mov		\$0x7777,%eax
239762306a36Sopenharmony_ci	kmovw		%eax,%k1
239862306a36Sopenharmony_ci
239962306a36Sopenharmony_ci	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
240062306a36Sopenharmony_ci	vpermd		$R1,$M0,$R1
240162306a36Sopenharmony_ci	vpermd		$R2,$M0,$R2
240262306a36Sopenharmony_ci	vpermd		$R3,$M0,$R3
240362306a36Sopenharmony_ci	vpermd		$R4,$M0,$R4
240462306a36Sopenharmony_ci
240562306a36Sopenharmony_ci	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
240662306a36Sopenharmony_ci	vpermd		$D1,$M0,${R1}{%k1}
240762306a36Sopenharmony_ci	vpermd		$D2,$M0,${R2}{%k1}
240862306a36Sopenharmony_ci	vpermd		$D3,$M0,${R3}{%k1}
240962306a36Sopenharmony_ci	vpermd		$D4,$M0,${R4}{%k1}
241062306a36Sopenharmony_ci
241162306a36Sopenharmony_ci	vpslld		\$2,$R1,$S1		# *5
241262306a36Sopenharmony_ci	vpslld		\$2,$R2,$S2
241362306a36Sopenharmony_ci	vpslld		\$2,$R3,$S3
241462306a36Sopenharmony_ci	vpslld		\$2,$R4,$S4
241562306a36Sopenharmony_ci	vpaddd		$R1,$S1,$S1
241662306a36Sopenharmony_ci	vpaddd		$R2,$S2,$S2
241762306a36Sopenharmony_ci	vpaddd		$R3,$S3,$S3
241862306a36Sopenharmony_ci	vpaddd		$R4,$S4,$S4
241962306a36Sopenharmony_ci
242062306a36Sopenharmony_ci	vpbroadcastq	32(%rcx),$PADBIT	# .L129
242162306a36Sopenharmony_ci
242262306a36Sopenharmony_ci	vpsrlq		\$52,$T0,$T2		# splat input
242362306a36Sopenharmony_ci	vpsllq		\$12,$T4,$T3
242462306a36Sopenharmony_ci	vporq		$T3,$T2,$T2
242562306a36Sopenharmony_ci	vpsrlq		\$26,$T0,$T1
242662306a36Sopenharmony_ci	vpsrlq		\$14,$T4,$T3
242762306a36Sopenharmony_ci	vpsrlq		\$40,$T4,$T4		# 4
242862306a36Sopenharmony_ci	vpandq		$MASK,$T2,$T2		# 2
242962306a36Sopenharmony_ci	vpandq		$MASK,$T0,$T0		# 0
243062306a36Sopenharmony_ci	#vpandq		$MASK,$T1,$T1		# 1
243162306a36Sopenharmony_ci	#vpandq		$MASK,$T3,$T3		# 3
243262306a36Sopenharmony_ci	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
243362306a36Sopenharmony_ci
243462306a36Sopenharmony_ci	vpaddq		$H2,$T2,$H2		# accumulate input
243562306a36Sopenharmony_ci	sub		\$192,$len
243662306a36Sopenharmony_ci	jbe		.Ltail_avx512
243762306a36Sopenharmony_ci	jmp		.Loop_avx512
243862306a36Sopenharmony_ci
243962306a36Sopenharmony_ci.align	32
244062306a36Sopenharmony_ci.Loop_avx512:
244162306a36Sopenharmony_ci	################################################################
244262306a36Sopenharmony_ci	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
244362306a36Sopenharmony_ci	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
244462306a36Sopenharmony_ci	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
244562306a36Sopenharmony_ci	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
244662306a36Sopenharmony_ci	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
244762306a36Sopenharmony_ci	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
244862306a36Sopenharmony_ci	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
244962306a36Sopenharmony_ci	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
245062306a36Sopenharmony_ci	#   \________/\___________/
245162306a36Sopenharmony_ci	################################################################
245262306a36Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
245362306a36Sopenharmony_ci
245462306a36Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
245562306a36Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
245662306a36Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
245762306a36Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
245862306a36Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
245962306a36Sopenharmony_ci	#
246062306a36Sopenharmony_ci	# however, as h2 is "chronologically" first one available pull
246162306a36Sopenharmony_ci	# corresponding operations up, so it's
246262306a36Sopenharmony_ci	#
246362306a36Sopenharmony_ci	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
246462306a36Sopenharmony_ci	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
246562306a36Sopenharmony_ci	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
246662306a36Sopenharmony_ci	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
246762306a36Sopenharmony_ci	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
246862306a36Sopenharmony_ci
246962306a36Sopenharmony_ci	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
247062306a36Sopenharmony_ci	 vpaddq		$H0,$T0,$H0
247162306a36Sopenharmony_ci	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
247262306a36Sopenharmony_ci	 vpandq		$MASK,$T1,$T1		# 1
247362306a36Sopenharmony_ci	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
247462306a36Sopenharmony_ci	 vpandq		$MASK,$T3,$T3		# 3
247562306a36Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
247662306a36Sopenharmony_ci	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
247762306a36Sopenharmony_ci	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
247862306a36Sopenharmony_ci	 vpaddq		$H1,$T1,$H1		# accumulate input
247962306a36Sopenharmony_ci	 vpaddq		$H3,$T3,$H3
248062306a36Sopenharmony_ci	 vpaddq		$H4,$T4,$H4
248162306a36Sopenharmony_ci
248262306a36Sopenharmony_ci	  vmovdqu64	16*0($inp),$T3		# load input
248362306a36Sopenharmony_ci	  vmovdqu64	16*4($inp),$T4
248462306a36Sopenharmony_ci	  lea		16*8($inp),$inp
248562306a36Sopenharmony_ci	vpmuludq	$H0,$R3,$M3
248662306a36Sopenharmony_ci	vpmuludq	$H0,$R4,$M4
248762306a36Sopenharmony_ci	vpmuludq	$H0,$R0,$M0
248862306a36Sopenharmony_ci	vpmuludq	$H0,$R1,$M1
248962306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
249062306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
249162306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
249262306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
249362306a36Sopenharmony_ci
249462306a36Sopenharmony_ci	vpmuludq	$H1,$R2,$M3
249562306a36Sopenharmony_ci	vpmuludq	$H1,$R3,$M4
249662306a36Sopenharmony_ci	vpmuludq	$H1,$S4,$M0
249762306a36Sopenharmony_ci	vpmuludq	$H0,$R2,$M2
249862306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
249962306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
250062306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
250162306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
250262306a36Sopenharmony_ci
250362306a36Sopenharmony_ci	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
250462306a36Sopenharmony_ci	  vpunpckhqdq	$T4,$T3,$T4
250562306a36Sopenharmony_ci
250662306a36Sopenharmony_ci	vpmuludq	$H3,$R0,$M3
250762306a36Sopenharmony_ci	vpmuludq	$H3,$R1,$M4
250862306a36Sopenharmony_ci	vpmuludq	$H1,$R0,$M1
250962306a36Sopenharmony_ci	vpmuludq	$H1,$R1,$M2
251062306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
251162306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
251262306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
251362306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
251462306a36Sopenharmony_ci
251562306a36Sopenharmony_ci	vpmuludq	$H4,$S4,$M3
251662306a36Sopenharmony_ci	vpmuludq	$H4,$R0,$M4
251762306a36Sopenharmony_ci	vpmuludq	$H3,$S2,$M0
251862306a36Sopenharmony_ci	vpmuludq	$H3,$S3,$M1
251962306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
252062306a36Sopenharmony_ci	vpmuludq	$H3,$S4,$M2
252162306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
252262306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
252362306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
252462306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
252562306a36Sopenharmony_ci
252662306a36Sopenharmony_ci	vpmuludq	$H4,$S1,$M0
252762306a36Sopenharmony_ci	vpmuludq	$H4,$S2,$M1
252862306a36Sopenharmony_ci	vpmuludq	$H4,$S3,$M2
252962306a36Sopenharmony_ci	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
253062306a36Sopenharmony_ci	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
253162306a36Sopenharmony_ci	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
253262306a36Sopenharmony_ci
253362306a36Sopenharmony_ci	################################################################
253462306a36Sopenharmony_ci	# lazy reduction (interleaved with input splat)
253562306a36Sopenharmony_ci
253662306a36Sopenharmony_ci	 vpsrlq		\$52,$T0,$T2		# splat input
253762306a36Sopenharmony_ci	 vpsllq		\$12,$T4,$T3
253862306a36Sopenharmony_ci
253962306a36Sopenharmony_ci	vpsrlq		\$26,$D3,$H3
254062306a36Sopenharmony_ci	vpandq		$MASK,$D3,$D3
254162306a36Sopenharmony_ci	vpaddq		$H3,$D4,$H4		# h3 -> h4
254262306a36Sopenharmony_ci
254362306a36Sopenharmony_ci	 vporq		$T3,$T2,$T2
254462306a36Sopenharmony_ci
254562306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
254662306a36Sopenharmony_ci	vpandq		$MASK,$H0,$H0
254762306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
254862306a36Sopenharmony_ci
254962306a36Sopenharmony_ci	 vpandq		$MASK,$T2,$T2		# 2
255062306a36Sopenharmony_ci
255162306a36Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
255262306a36Sopenharmony_ci	vpandq		$MASK,$H4,$H4
255362306a36Sopenharmony_ci
255462306a36Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
255562306a36Sopenharmony_ci	vpandq		$MASK,$H1,$H1
255662306a36Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
255762306a36Sopenharmony_ci
255862306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0
255962306a36Sopenharmony_ci	vpsllq		\$2,$D4,$D4
256062306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
256162306a36Sopenharmony_ci
256262306a36Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
256362306a36Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
256462306a36Sopenharmony_ci
256562306a36Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
256662306a36Sopenharmony_ci	vpandq		$MASK,$H2,$H2
256762306a36Sopenharmony_ci	vpaddq		$D2,$D3,$H3		# h2 -> h3
256862306a36Sopenharmony_ci
256962306a36Sopenharmony_ci	 vpsrlq		\$14,$T4,$T3
257062306a36Sopenharmony_ci
257162306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
257262306a36Sopenharmony_ci	vpandq		$MASK,$H0,$H0
257362306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
257462306a36Sopenharmony_ci
257562306a36Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
257662306a36Sopenharmony_ci
257762306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
257862306a36Sopenharmony_ci	vpandq		$MASK,$H3,$H3
257962306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
258062306a36Sopenharmony_ci
258162306a36Sopenharmony_ci	 vpandq		$MASK,$T0,$T0		# 0
258262306a36Sopenharmony_ci	 #vpandq	$MASK,$T1,$T1		# 1
258362306a36Sopenharmony_ci	 #vpandq	$MASK,$T3,$T3		# 3
258462306a36Sopenharmony_ci	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
258562306a36Sopenharmony_ci
258662306a36Sopenharmony_ci	sub		\$128,$len
258762306a36Sopenharmony_ci	ja		.Loop_avx512
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci.Ltail_avx512:
259062306a36Sopenharmony_ci	################################################################
259162306a36Sopenharmony_ci	# while above multiplications were by r^8 in all lanes, in last
259262306a36Sopenharmony_ci	# iteration we multiply least significant lane by r^8 and most
259362306a36Sopenharmony_ci	# significant one by r, that's why table gets shifted...
259462306a36Sopenharmony_ci
259562306a36Sopenharmony_ci	vpsrlq		\$32,$R0,$R0		# 0105020603070408
259662306a36Sopenharmony_ci	vpsrlq		\$32,$R1,$R1
259762306a36Sopenharmony_ci	vpsrlq		\$32,$R2,$R2
259862306a36Sopenharmony_ci	vpsrlq		\$32,$S3,$S3
259962306a36Sopenharmony_ci	vpsrlq		\$32,$S4,$S4
260062306a36Sopenharmony_ci	vpsrlq		\$32,$R3,$R3
260162306a36Sopenharmony_ci	vpsrlq		\$32,$R4,$R4
260262306a36Sopenharmony_ci	vpsrlq		\$32,$S1,$S1
260362306a36Sopenharmony_ci	vpsrlq		\$32,$S2,$S2
260462306a36Sopenharmony_ci
260562306a36Sopenharmony_ci	################################################################
260662306a36Sopenharmony_ci	# load either next or last 64 byte of input
260762306a36Sopenharmony_ci	lea		($inp,$len),$inp
260862306a36Sopenharmony_ci
260962306a36Sopenharmony_ci	#vpaddq		$H2,$T2,$H2		# accumulate input
261062306a36Sopenharmony_ci	vpaddq		$H0,$T0,$H0
261162306a36Sopenharmony_ci
261262306a36Sopenharmony_ci	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
261362306a36Sopenharmony_ci	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
261462306a36Sopenharmony_ci	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
261562306a36Sopenharmony_ci	 vpandq		$MASK,$T1,$T1		# 1
261662306a36Sopenharmony_ci	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
261762306a36Sopenharmony_ci	 vpandq		$MASK,$T3,$T3		# 3
261862306a36Sopenharmony_ci	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
261962306a36Sopenharmony_ci	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
262062306a36Sopenharmony_ci	 vpaddq		$H1,$T1,$H1		# accumulate input
262162306a36Sopenharmony_ci	 vpaddq		$H3,$T3,$H3
262262306a36Sopenharmony_ci	 vpaddq		$H4,$T4,$H4
262362306a36Sopenharmony_ci
262462306a36Sopenharmony_ci	  vmovdqu	16*0($inp),%x#$T0
262562306a36Sopenharmony_ci	vpmuludq	$H0,$R3,$M3
262662306a36Sopenharmony_ci	vpmuludq	$H0,$R4,$M4
262762306a36Sopenharmony_ci	vpmuludq	$H0,$R0,$M0
262862306a36Sopenharmony_ci	vpmuludq	$H0,$R1,$M1
262962306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
263062306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
263162306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
263262306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
263362306a36Sopenharmony_ci
263462306a36Sopenharmony_ci	  vmovdqu	16*1($inp),%x#$T1
263562306a36Sopenharmony_ci	vpmuludq	$H1,$R2,$M3
263662306a36Sopenharmony_ci	vpmuludq	$H1,$R3,$M4
263762306a36Sopenharmony_ci	vpmuludq	$H1,$S4,$M0
263862306a36Sopenharmony_ci	vpmuludq	$H0,$R2,$M2
263962306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
264062306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
264162306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
264262306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
264362306a36Sopenharmony_ci
264462306a36Sopenharmony_ci	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
264562306a36Sopenharmony_ci	vpmuludq	$H3,$R0,$M3
264662306a36Sopenharmony_ci	vpmuludq	$H3,$R1,$M4
264762306a36Sopenharmony_ci	vpmuludq	$H1,$R0,$M1
264862306a36Sopenharmony_ci	vpmuludq	$H1,$R1,$M2
264962306a36Sopenharmony_ci	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
265062306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
265162306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
265262306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
265362306a36Sopenharmony_ci
265462306a36Sopenharmony_ci	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
265562306a36Sopenharmony_ci	vpmuludq	$H4,$S4,$M3
265662306a36Sopenharmony_ci	vpmuludq	$H4,$R0,$M4
265762306a36Sopenharmony_ci	vpmuludq	$H3,$S2,$M0
265862306a36Sopenharmony_ci	vpmuludq	$H3,$S3,$M1
265962306a36Sopenharmony_ci	vpmuludq	$H3,$S4,$M2
266062306a36Sopenharmony_ci	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
266162306a36Sopenharmony_ci	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
266262306a36Sopenharmony_ci	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
266362306a36Sopenharmony_ci	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
266462306a36Sopenharmony_ci	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
266562306a36Sopenharmony_ci
266662306a36Sopenharmony_ci	vpmuludq	$H4,$S1,$M0
266762306a36Sopenharmony_ci	vpmuludq	$H4,$S2,$M1
266862306a36Sopenharmony_ci	vpmuludq	$H4,$S3,$M2
266962306a36Sopenharmony_ci	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
267062306a36Sopenharmony_ci	vpaddq		$M1,$D1,$H1		# h1 = d2 + h4*s2
267162306a36Sopenharmony_ci	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
267262306a36Sopenharmony_ci
267362306a36Sopenharmony_ci	################################################################
267462306a36Sopenharmony_ci	# horizontal addition
267562306a36Sopenharmony_ci
267662306a36Sopenharmony_ci	mov		\$1,%eax
267762306a36Sopenharmony_ci	vpermq		\$0xb1,$H3,$D3
267862306a36Sopenharmony_ci	vpermq		\$0xb1,$D4,$H4
267962306a36Sopenharmony_ci	vpermq		\$0xb1,$H0,$D0
268062306a36Sopenharmony_ci	vpermq		\$0xb1,$H1,$D1
268162306a36Sopenharmony_ci	vpermq		\$0xb1,$H2,$D2
268262306a36Sopenharmony_ci	vpaddq		$D3,$H3,$H3
268362306a36Sopenharmony_ci	vpaddq		$D4,$H4,$H4
268462306a36Sopenharmony_ci	vpaddq		$D0,$H0,$H0
268562306a36Sopenharmony_ci	vpaddq		$D1,$H1,$H1
268662306a36Sopenharmony_ci	vpaddq		$D2,$H2,$H2
268762306a36Sopenharmony_ci
268862306a36Sopenharmony_ci	kmovw		%eax,%k3
268962306a36Sopenharmony_ci	vpermq		\$0x2,$H3,$D3
269062306a36Sopenharmony_ci	vpermq		\$0x2,$H4,$D4
269162306a36Sopenharmony_ci	vpermq		\$0x2,$H0,$D0
269262306a36Sopenharmony_ci	vpermq		\$0x2,$H1,$D1
269362306a36Sopenharmony_ci	vpermq		\$0x2,$H2,$D2
269462306a36Sopenharmony_ci	vpaddq		$D3,$H3,$H3
269562306a36Sopenharmony_ci	vpaddq		$D4,$H4,$H4
269662306a36Sopenharmony_ci	vpaddq		$D0,$H0,$H0
269762306a36Sopenharmony_ci	vpaddq		$D1,$H1,$H1
269862306a36Sopenharmony_ci	vpaddq		$D2,$H2,$H2
269962306a36Sopenharmony_ci
270062306a36Sopenharmony_ci	vextracti64x4	\$0x1,$H3,%y#$D3
270162306a36Sopenharmony_ci	vextracti64x4	\$0x1,$H4,%y#$D4
270262306a36Sopenharmony_ci	vextracti64x4	\$0x1,$H0,%y#$D0
270362306a36Sopenharmony_ci	vextracti64x4	\$0x1,$H1,%y#$D1
270462306a36Sopenharmony_ci	vextracti64x4	\$0x1,$H2,%y#$D2
270562306a36Sopenharmony_ci	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
270662306a36Sopenharmony_ci	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
270762306a36Sopenharmony_ci	vpaddq		$D0,$H0,${H0}{%k3}{z}
270862306a36Sopenharmony_ci	vpaddq		$D1,$H1,${H1}{%k3}{z}
270962306a36Sopenharmony_ci	vpaddq		$D2,$H2,${H2}{%k3}{z}
271062306a36Sopenharmony_ci___
271162306a36Sopenharmony_cimap(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
271262306a36Sopenharmony_cimap(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
271362306a36Sopenharmony_ci$code.=<<___;
271462306a36Sopenharmony_ci	################################################################
271562306a36Sopenharmony_ci	# lazy reduction (interleaved with input splat)
271662306a36Sopenharmony_ci
271762306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
271862306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
271962306a36Sopenharmony_ci	 vpsrldq	\$6,$T0,$T2		# splat input
272062306a36Sopenharmony_ci	 vpsrldq	\$6,$T1,$T3
272162306a36Sopenharmony_ci	 vpunpckhqdq	$T1,$T0,$T4		# 4
272262306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
272362306a36Sopenharmony_ci
272462306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
272562306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
272662306a36Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
272762306a36Sopenharmony_ci	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
272862306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
272962306a36Sopenharmony_ci
273062306a36Sopenharmony_ci	vpsrlq		\$26,$H4,$D4
273162306a36Sopenharmony_ci	vpand		$MASK,$H4,$H4
273262306a36Sopenharmony_ci
273362306a36Sopenharmony_ci	vpsrlq		\$26,$H1,$D1
273462306a36Sopenharmony_ci	vpand		$MASK,$H1,$H1
273562306a36Sopenharmony_ci	 vpsrlq		\$30,$T2,$T3
273662306a36Sopenharmony_ci	 vpsrlq		\$4,$T2,$T2
273762306a36Sopenharmony_ci	vpaddq		$D1,$H2,$H2		# h1 -> h2
273862306a36Sopenharmony_ci
273962306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0
274062306a36Sopenharmony_ci	vpsllq		\$2,$D4,$D4
274162306a36Sopenharmony_ci	 vpsrlq		\$26,$T0,$T1
274262306a36Sopenharmony_ci	 vpsrlq		\$40,$T4,$T4		# 4
274362306a36Sopenharmony_ci	vpaddq		$D4,$H0,$H0		# h4 -> h0
274462306a36Sopenharmony_ci
274562306a36Sopenharmony_ci	vpsrlq		\$26,$H2,$D2
274662306a36Sopenharmony_ci	vpand		$MASK,$H2,$H2
274762306a36Sopenharmony_ci	 vpand		$MASK,$T2,$T2		# 2
274862306a36Sopenharmony_ci	 vpand		$MASK,$T0,$T0		# 0
274962306a36Sopenharmony_ci	vpaddq		$D2,$H3,$H3		# h2 -> h3
275062306a36Sopenharmony_ci
275162306a36Sopenharmony_ci	vpsrlq		\$26,$H0,$D0
275262306a36Sopenharmony_ci	vpand		$MASK,$H0,$H0
275362306a36Sopenharmony_ci	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
275462306a36Sopenharmony_ci	 vpand		$MASK,$T1,$T1		# 1
275562306a36Sopenharmony_ci	vpaddq		$D0,$H1,$H1		# h0 -> h1
275662306a36Sopenharmony_ci
275762306a36Sopenharmony_ci	vpsrlq		\$26,$H3,$D3
275862306a36Sopenharmony_ci	vpand		$MASK,$H3,$H3
275962306a36Sopenharmony_ci	 vpand		$MASK,$T3,$T3		# 3
276062306a36Sopenharmony_ci	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
276162306a36Sopenharmony_ci	vpaddq		$D3,$H4,$H4		# h3 -> h4
276262306a36Sopenharmony_ci
276362306a36Sopenharmony_ci	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
276462306a36Sopenharmony_ci	add		\$64,$len
276562306a36Sopenharmony_ci	jnz		.Ltail_avx2$suffix
276662306a36Sopenharmony_ci
276762306a36Sopenharmony_ci	vpsubq		$T2,$H2,$H2		# undo input accumulation
276862306a36Sopenharmony_ci	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
276962306a36Sopenharmony_ci	vmovd		%x#$H1,`4*1-48-64`($ctx)
277062306a36Sopenharmony_ci	vmovd		%x#$H2,`4*2-48-64`($ctx)
277162306a36Sopenharmony_ci	vmovd		%x#$H3,`4*3-48-64`($ctx)
277262306a36Sopenharmony_ci	vmovd		%x#$H4,`4*4-48-64`($ctx)
277362306a36Sopenharmony_ci	vzeroall
277462306a36Sopenharmony_ci___
277562306a36Sopenharmony_ci$code.=<<___	if ($win64);
277662306a36Sopenharmony_ci	movdqa		-0xb0(%r10),%xmm6
277762306a36Sopenharmony_ci	movdqa		-0xa0(%r10),%xmm7
277862306a36Sopenharmony_ci	movdqa		-0x90(%r10),%xmm8
277962306a36Sopenharmony_ci	movdqa		-0x80(%r10),%xmm9
278062306a36Sopenharmony_ci	movdqa		-0x70(%r10),%xmm10
278162306a36Sopenharmony_ci	movdqa		-0x60(%r10),%xmm11
278262306a36Sopenharmony_ci	movdqa		-0x50(%r10),%xmm12
278362306a36Sopenharmony_ci	movdqa		-0x40(%r10),%xmm13
278462306a36Sopenharmony_ci	movdqa		-0x30(%r10),%xmm14
278562306a36Sopenharmony_ci	movdqa		-0x20(%r10),%xmm15
278662306a36Sopenharmony_ci	lea		-8(%r10),%rsp
278762306a36Sopenharmony_ci.Ldo_avx512_epilogue:
278862306a36Sopenharmony_ci___
278962306a36Sopenharmony_ci$code.=<<___	if (!$win64);
279062306a36Sopenharmony_ci	lea		-8(%r10),%rsp
279162306a36Sopenharmony_ci.cfi_def_cfa_register	%rsp
279262306a36Sopenharmony_ci___
279362306a36Sopenharmony_ci$code.=<<___;
279462306a36Sopenharmony_ci	RET
279562306a36Sopenharmony_ci.cfi_endproc
279662306a36Sopenharmony_ci___
279762306a36Sopenharmony_ci
279862306a36Sopenharmony_ci}
279962306a36Sopenharmony_ci
280062306a36Sopenharmony_ci}
280162306a36Sopenharmony_ci
# Emit the AVX2 entry point. declare_function() opens the symbol
# poly1305_blocks_avx2, poly1305_blocks_avxN(0) appends the code-path body
# shared with the AVX-512 flavour (called here with argument 0), and
# end_function() closes the symbol.
# NOTE(review): the meaning of the extra declare_function() arguments (32, 4)
# is defined outside this chunk - presumably alignment and argument count;
# confirm against the helper's definition.
280262306a36Sopenharmony_ci&declare_function("poly1305_blocks_avx2", 32, 4);
280362306a36Sopenharmony_cipoly1305_blocks_avxN(0);
280462306a36Sopenharmony_ci&end_function("poly1305_blocks_avx2");
280562306a36Sopenharmony_ci
280662306a36Sopenharmony_ci#######################################################################
280762306a36Sopenharmony_ciif ($avx>2) {
280862306a36Sopenharmony_ci# On entry the input length is divisible by 64. But since the inner loop
280962306a36Sopenharmony_ci# processes 128 bytes per iteration, lengths that are not divisible
281062306a36Sopenharmony_ci# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
281162306a36Sopenharmony_ci# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
281262306a36Sopenharmony_ci# for this tail, we wouldn't even have to allocate a stack frame...
281362306a36Sopenharmony_ci
# Emit the AVX-512 entry point. For kernel builds ($kernel true) the generated
# function is bracketed by "#ifdef CONFIG_AS_AVX512" / "#endif" lines appended
# to $code, so the emitted text is conditional on that preprocessor symbol.
# The body comes from the same generator used for AVX2, poly1305_blocks_avxN(),
# called here with argument 1 instead of 0.
# NOTE(review): the meaning of the declare_function() arguments (32, 4) is
# defined outside this chunk - presumably alignment and argument count; confirm.
281462306a36Sopenharmony_ciif($kernel) {
281562306a36Sopenharmony_ci	$code .= "#ifdef CONFIG_AS_AVX512\n";
281662306a36Sopenharmony_ci}
281762306a36Sopenharmony_ci
281862306a36Sopenharmony_ci&declare_function("poly1305_blocks_avx512", 32, 4);
281962306a36Sopenharmony_cipoly1305_blocks_avxN(1);
282062306a36Sopenharmony_ci&end_function("poly1305_blocks_avx512");
282162306a36Sopenharmony_ci
282262306a36Sopenharmony_ciif ($kernel) {
282362306a36Sopenharmony_ci	$code .= "#endif\n";
282462306a36Sopenharmony_ci}
282562306a36Sopenharmony_ci
282662306a36Sopenharmony_ciif (!$kernel && $avx>3) {
282762306a36Sopenharmony_ci########################################################################
282862306a36Sopenharmony_ci# VPMADD52 version using 2^44 radix.
282962306a36Sopenharmony_ci#
283062306a36Sopenharmony_ci# One can argue that base 2^52 would be more natural. Well, even though
283162306a36Sopenharmony_ci# some operations would be more natural, one has to recognize a couple of
283262306a36Sopenharmony_ci# things. First, base 2^52 provides no advantage over base 2^44 in terms
283362306a36Sopenharmony_ci# of the number of multiply-n-accumulate operations. Secondly, it makes it
283462306a36Sopenharmony_ci# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
283562306a36Sopenharmony_ci# reference implementations], which means that more such operations
283662306a36Sopenharmony_ci# would have to be performed in the inner loop, which in turn makes the
283762306a36Sopenharmony_ci# critical path longer. In other words, even though base 2^44 reduction
283862306a36Sopenharmony_ci# might look less elegant, the overall critical path is actually shorter...
283962306a36Sopenharmony_ci
284062306a36Sopenharmony_ci########################################################################
284162306a36Sopenharmony_ci# The layout of the opaque area is as follows.
284262306a36Sopenharmony_ci#
284362306a36Sopenharmony_ci#	unsigned __int64 h[3];		# current hash value base 2^44
284462306a36Sopenharmony_ci#	unsigned __int64 s[2];		# key value*20 base 2^44
284562306a36Sopenharmony_ci#	unsigned __int64 r[3];		# key value base 2^44
284662306a36Sopenharmony_ci#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
284762306a36Sopenharmony_ci#					# r^n positions reflect
284862306a36Sopenharmony_ci#					# placement in register, not
284962306a36Sopenharmony_ci#					# memory, R[3] is R[1]*20
285062306a36Sopenharmony_ci
# Emit poly1305_init_base2_44, the key-setup routine for the base 2^44
# (VPMADD52) code path. The generated code:
#   * zeroes the three hash qwords at 0/8/16($ctx);
#   * clamps the two key qwords read from 0($inp) and 8($inp) with the masks
#     0x0ffffffc0fffffff and 0x0ffffffc0ffffffc;
#   * splits the clamped 128-bit value into limbs r0 = bits 0..43,
#     r1 = bits 44..87 and r2 = bits 88 and up, stored at 40/48/56($ctx);
#   * stores s1 = r1*20 and s2 = r2*20 (the "*5" lea followed by "<<2") at
#     24/32($ctx);
#   * writes -1 to 64($ctx), the slot the blocks routines test for sign to
#     decide whether the key powers still have to be computed;
#   * stores the addresses of poly1305_blocks_vpmadd52 and
#     poly1305_emit_base2_44 through %rdx and returns 1 in %eax.
# The label .Linit_base2_44 sits after the hash zeroing; whatever jumps to it
# is outside this chunk.
# NOTE(review): $ctx and $inp are bound outside this chunk, and %rdx is
# presumably the third argument (a function-pointer table) - confirm.
285162306a36Sopenharmony_ci$code.=<<___;
285262306a36Sopenharmony_ci.type	poly1305_init_base2_44,\@function,3
285362306a36Sopenharmony_ci.align	32
285462306a36Sopenharmony_cipoly1305_init_base2_44:
285562306a36Sopenharmony_ci	xor	%eax,%eax
285662306a36Sopenharmony_ci	mov	%rax,0($ctx)		# initialize hash value
285762306a36Sopenharmony_ci	mov	%rax,8($ctx)
285862306a36Sopenharmony_ci	mov	%rax,16($ctx)
285962306a36Sopenharmony_ci
286062306a36Sopenharmony_ci.Linit_base2_44:
286162306a36Sopenharmony_ci	lea	poly1305_blocks_vpmadd52(%rip),%r10
286262306a36Sopenharmony_ci	lea	poly1305_emit_base2_44(%rip),%r11
286362306a36Sopenharmony_ci
286462306a36Sopenharmony_ci	mov	\$0x0ffffffc0fffffff,%rax
286562306a36Sopenharmony_ci	mov	\$0x0ffffffc0ffffffc,%rcx
286662306a36Sopenharmony_ci	and	0($inp),%rax
286762306a36Sopenharmony_ci	mov	\$0x00000fffffffffff,%r8
286862306a36Sopenharmony_ci	and	8($inp),%rcx
286962306a36Sopenharmony_ci	mov	\$0x00000fffffffffff,%r9
287062306a36Sopenharmony_ci	and	%rax,%r8
287162306a36Sopenharmony_ci	shrd	\$44,%rcx,%rax
287262306a36Sopenharmony_ci	mov	%r8,40($ctx)		# r0
287362306a36Sopenharmony_ci	and	%r9,%rax
287462306a36Sopenharmony_ci	shr	\$24,%rcx
287562306a36Sopenharmony_ci	mov	%rax,48($ctx)		# r1
287662306a36Sopenharmony_ci	lea	(%rax,%rax,4),%rax	# *5
287762306a36Sopenharmony_ci	mov	%rcx,56($ctx)		# r2
287862306a36Sopenharmony_ci	shl	\$2,%rax		# magic <<2
287962306a36Sopenharmony_ci	lea	(%rcx,%rcx,4),%rcx	# *5
288062306a36Sopenharmony_ci	shl	\$2,%rcx		# magic <<2
288162306a36Sopenharmony_ci	mov	%rax,24($ctx)		# s1
288262306a36Sopenharmony_ci	mov	%rcx,32($ctx)		# s2
288362306a36Sopenharmony_ci	movq	\$-1,64($ctx)		# write impossible value
288462306a36Sopenharmony_ci___
# Flavours other than elf32: store the two function addresses as 64-bit
# values at offsets 0 and 8 from %rdx.
288562306a36Sopenharmony_ci$code.=<<___	if ($flavour !~ /elf32/);
288662306a36Sopenharmony_ci	mov	%r10,0(%rdx)
288762306a36Sopenharmony_ci	mov	%r11,8(%rdx)
288862306a36Sopenharmony_ci___
# elf32 flavours: store them as 32-bit values at offsets 0 and 4 instead.
288962306a36Sopenharmony_ci$code.=<<___	if ($flavour =~ /elf32/);
289062306a36Sopenharmony_ci	mov	%r10d,0(%rdx)
289162306a36Sopenharmony_ci	mov	%r11d,4(%rdx)
289262306a36Sopenharmony_ci___
# Common epilogue: return 1 in %eax and close the symbol with .size.
289362306a36Sopenharmony_ci$code.=<<___;
289462306a36Sopenharmony_ci	mov	\$1,%eax
289562306a36Sopenharmony_ci	RET
289662306a36Sopenharmony_ci.size	poly1305_init_base2_44,.-poly1305_init_base2_44
289762306a36Sopenharmony_ci___
289862306a36Sopenharmony_ci{
# Register names for poly1305_blocks_vpmadd52, the one-block-per-iteration
# VPMADD52 routine. All of them are %ymm registers:
#   $H0..$H2      - the three hash limbs, each replicated by vpermq under %k7;
#   $r2r1r0, $r1r0s2, $r0s2s1
#                 - key vectors loaded from 40/32/24($ctx) under mask %k7;
#   $Dlo/$Dhi     - accumulators for vpmadd52luq / vpmadd52huq; $Dlo also
#                   holds the hash value between iterations.
289962306a36Sopenharmony_cimy ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
# $T0 is the input/scratch vector; $inp_permd and $inp_shift are loaded from
# the table at .L2_44_inp_permd (offsets 0 and 32); $PAD carries the pad bit.
290062306a36Sopenharmony_cimy ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
# Reduction constants, loaded from offsets 64/96/128 of the same table.
# NOTE(review): the list (22..25) yields four registers for three names, so
# %ymm25 is generated but not bound to anything here.
290162306a36Sopenharmony_cimy ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
290262306a36Sopenharmony_ci
# Emit poly1305_blocks_vpmadd52. Outline of the generated code:
#   * $len is shifted right by 4 (bytes -> 16-byte blocks); zero blocks
#     returns immediately. $padbit is shifted left by 40.
#   * 64($ctx) is read into %r8. %rax becomes the number of blocks to handle
#     in the single-block loop: $len & 3 when %r8 is negative (the value -1
#     that poly1305_init_base2_44 writes) and $len < 4, otherwise $len & 1.
#     If that count is zero, control jumps to .Lblocks_vpmadd52_4x.
#   * %k7 = 7 selects the low three qwords of a vector, %k1 = 1 the lowest.
#   * .Loop_vpmadd52 processes one 16-byte block per iteration: split the
#     input into limbs (vpermd/vpsrlvq/vpandq), OR in $PAD, add to the hash,
#     multiply-accumulate against the three key vectors, then reduce.
#   * After the loop the hash is stored back to 0($ctx) under %k7 and, if
#     blocks remain in $len, control continues at .Lblocks_vpmadd52_4x.
# NOTE(review): $ctx, $inp, $len and $padbit are bound outside this chunk;
# the contents of the .L2_44_* constant tables are not visible here either.
290362306a36Sopenharmony_ci$code.=<<___;
290462306a36Sopenharmony_ci.type	poly1305_blocks_vpmadd52,\@function,4
290562306a36Sopenharmony_ci.align	32
290662306a36Sopenharmony_cipoly1305_blocks_vpmadd52:
290762306a36Sopenharmony_ci	shr	\$4,$len
290862306a36Sopenharmony_ci	jz	.Lno_data_vpmadd52		# too short
290962306a36Sopenharmony_ci
291062306a36Sopenharmony_ci	shl	\$40,$padbit
291162306a36Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
291262306a36Sopenharmony_ci
291362306a36Sopenharmony_ci	# if powers of the key are not calculated yet, process up to 3
291462306a36Sopenharmony_ci	# blocks with this single-block subroutine, otherwise ensure that
291562306a36Sopenharmony_ci	# length is divisible by 2 blocks and pass the rest down to next
291662306a36Sopenharmony_ci	# subroutine...
291762306a36Sopenharmony_ci
291862306a36Sopenharmony_ci	mov	\$3,%rax
291962306a36Sopenharmony_ci	mov	\$1,%r10
292062306a36Sopenharmony_ci	cmp	\$4,$len			# is input long
292162306a36Sopenharmony_ci	cmovae	%r10,%rax
292262306a36Sopenharmony_ci	test	%r8,%r8				# is power value impossible?
292362306a36Sopenharmony_ci	cmovns	%r10,%rax
292462306a36Sopenharmony_ci
292562306a36Sopenharmony_ci	and	$len,%rax			# is input of favourable length?
292662306a36Sopenharmony_ci	jz	.Lblocks_vpmadd52_4x
292762306a36Sopenharmony_ci
292862306a36Sopenharmony_ci	sub		%rax,$len
292962306a36Sopenharmony_ci	mov		\$7,%r10d
293062306a36Sopenharmony_ci	mov		\$1,%r11d
293162306a36Sopenharmony_ci	kmovw		%r10d,%k7
293262306a36Sopenharmony_ci	lea		.L2_44_inp_permd(%rip),%r10
293362306a36Sopenharmony_ci	kmovw		%r11d,%k1
293462306a36Sopenharmony_ci
293562306a36Sopenharmony_ci	vmovq		$padbit,%x#$PAD
293662306a36Sopenharmony_ci	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
293762306a36Sopenharmony_ci	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
293862306a36Sopenharmony_ci	vpermq		\$0xcf,$PAD,$PAD
293962306a36Sopenharmony_ci	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
294062306a36Sopenharmony_ci
294162306a36Sopenharmony_ci	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
294262306a36Sopenharmony_ci	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
294362306a36Sopenharmony_ci	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
294462306a36Sopenharmony_ci	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
294562306a36Sopenharmony_ci
294662306a36Sopenharmony_ci	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
294762306a36Sopenharmony_ci	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
294862306a36Sopenharmony_ci
294962306a36Sopenharmony_ci	jmp		.Loop_vpmadd52
295062306a36Sopenharmony_ci
295162306a36Sopenharmony_ci.align	32
295262306a36Sopenharmony_ci.Loop_vpmadd52:
295362306a36Sopenharmony_ci	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
295462306a36Sopenharmony_ci	lea		16($inp),$inp
295562306a36Sopenharmony_ci
295662306a36Sopenharmony_ci	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
295762306a36Sopenharmony_ci	vpsrlvq		$inp_shift,$T0,$T0
295862306a36Sopenharmony_ci	vpandq		$reduc_mask,$T0,$T0
295962306a36Sopenharmony_ci	vporq		$PAD,$T0,$T0
296062306a36Sopenharmony_ci
296162306a36Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
296262306a36Sopenharmony_ci
296362306a36Sopenharmony_ci	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
296462306a36Sopenharmony_ci	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
296562306a36Sopenharmony_ci	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
296662306a36Sopenharmony_ci
296762306a36Sopenharmony_ci	vpxord		$Dlo,$Dlo,$Dlo
296862306a36Sopenharmony_ci	vpxord		$Dhi,$Dhi,$Dhi
296962306a36Sopenharmony_ci
297062306a36Sopenharmony_ci	vpmadd52luq	$r2r1r0,$H0,$Dlo
297162306a36Sopenharmony_ci	vpmadd52huq	$r2r1r0,$H0,$Dhi
297262306a36Sopenharmony_ci
297362306a36Sopenharmony_ci	vpmadd52luq	$r1r0s2,$H1,$Dlo
297462306a36Sopenharmony_ci	vpmadd52huq	$r1r0s2,$H1,$Dhi
297562306a36Sopenharmony_ci
297662306a36Sopenharmony_ci	vpmadd52luq	$r0s2s1,$H2,$Dlo
297762306a36Sopenharmony_ci	vpmadd52huq	$r0s2s1,$H2,$Dhi
297862306a36Sopenharmony_ci
297962306a36Sopenharmony_ci	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
298062306a36Sopenharmony_ci	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
298162306a36Sopenharmony_ci	vpandq		$reduc_mask,$Dlo,$Dlo
298262306a36Sopenharmony_ci
298362306a36Sopenharmony_ci	vpaddq		$T0,$Dhi,$Dhi
298462306a36Sopenharmony_ci
298562306a36Sopenharmony_ci	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
298662306a36Sopenharmony_ci
298762306a36Sopenharmony_ci	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
298862306a36Sopenharmony_ci
298962306a36Sopenharmony_ci	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
299062306a36Sopenharmony_ci	vpandq		$reduc_mask,$Dlo,$Dlo
299162306a36Sopenharmony_ci
299262306a36Sopenharmony_ci	vpermq		\$0b10010011,$T0,$T0
299362306a36Sopenharmony_ci
299462306a36Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
299562306a36Sopenharmony_ci
299662306a36Sopenharmony_ci	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
299762306a36Sopenharmony_ci
299862306a36Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
299962306a36Sopenharmony_ci	vpsllq		\$2,$T0,$T0
300062306a36Sopenharmony_ci
300162306a36Sopenharmony_ci	vpaddq		$T0,$Dlo,$Dlo
300262306a36Sopenharmony_ci
300362306a36Sopenharmony_ci	dec		%rax			# len-=16
300462306a36Sopenharmony_ci	jnz		.Loop_vpmadd52
300562306a36Sopenharmony_ci
300662306a36Sopenharmony_ci	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
300762306a36Sopenharmony_ci
300862306a36Sopenharmony_ci	test		$len,$len
300962306a36Sopenharmony_ci	jnz		.Lblocks_vpmadd52_4x
301062306a36Sopenharmony_ci
301162306a36Sopenharmony_ci.Lno_data_vpmadd52:
301262306a36Sopenharmony_ci	RET
301362306a36Sopenharmony_ci.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
301462306a36Sopenharmony_ci___
301562306a36Sopenharmony_ci}
301662306a36Sopenharmony_ci{
301762306a36Sopenharmony_ci########################################################################
301862306a36Sopenharmony_ci# As implied by its name 4x subroutine processes 4 blocks in parallel
301962306a36Sopenharmony_ci# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
302062306a36Sopenharmony_ci# and is handled in 256-bit %ymm registers.
302162306a36Sopenharmony_ci
302262306a36Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
302362306a36Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
302462306a36Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
302562306a36Sopenharmony_ci
302662306a36Sopenharmony_ci$code.=<<___;
302762306a36Sopenharmony_ci.type	poly1305_blocks_vpmadd52_4x,\@function,4
302862306a36Sopenharmony_ci.align	32
302962306a36Sopenharmony_cipoly1305_blocks_vpmadd52_4x:
303062306a36Sopenharmony_ci	shr	\$4,$len
303162306a36Sopenharmony_ci	jz	.Lno_data_vpmadd52_4x		# too short
303262306a36Sopenharmony_ci
303362306a36Sopenharmony_ci	shl	\$40,$padbit
303462306a36Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
303562306a36Sopenharmony_ci
303662306a36Sopenharmony_ci.Lblocks_vpmadd52_4x:
303762306a36Sopenharmony_ci	vpbroadcastq	$padbit,$PAD
303862306a36Sopenharmony_ci
303962306a36Sopenharmony_ci	vmovdqa64	.Lx_mask44(%rip),$mask44
304062306a36Sopenharmony_ci	mov		\$5,%eax
304162306a36Sopenharmony_ci	vmovdqa64	.Lx_mask42(%rip),$mask42
304262306a36Sopenharmony_ci	kmovw		%eax,%k1		# used in 2x path
304362306a36Sopenharmony_ci
304462306a36Sopenharmony_ci	test		%r8,%r8			# is power value impossible?
304562306a36Sopenharmony_ci	js		.Linit_vpmadd52		# if it is, then init R[4]
304662306a36Sopenharmony_ci
304762306a36Sopenharmony_ci	vmovq		0($ctx),%x#$H0		# load current hash value
304862306a36Sopenharmony_ci	vmovq		8($ctx),%x#$H1
304962306a36Sopenharmony_ci	vmovq		16($ctx),%x#$H2
305062306a36Sopenharmony_ci
305162306a36Sopenharmony_ci	test		\$3,$len		# is length 4*n+2?
305262306a36Sopenharmony_ci	jnz		.Lblocks_vpmadd52_2x_do
305362306a36Sopenharmony_ci
305462306a36Sopenharmony_ci.Lblocks_vpmadd52_4x_do:
305562306a36Sopenharmony_ci	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
305662306a36Sopenharmony_ci	vpbroadcastq	96($ctx),$R1
305762306a36Sopenharmony_ci	vpbroadcastq	128($ctx),$R2
305862306a36Sopenharmony_ci	vpbroadcastq	160($ctx),$S1
305962306a36Sopenharmony_ci
306062306a36Sopenharmony_ci.Lblocks_vpmadd52_4x_key_loaded:
306162306a36Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
306262306a36Sopenharmony_ci	vpaddq		$R2,$S2,$S2
306362306a36Sopenharmony_ci	vpsllq		\$2,$S2,$S2
306462306a36Sopenharmony_ci
306562306a36Sopenharmony_ci	test		\$7,$len		# is len 8*n?
306662306a36Sopenharmony_ci	jz		.Lblocks_vpmadd52_8x
306762306a36Sopenharmony_ci
306862306a36Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
306962306a36Sopenharmony_ci	vmovdqu64	16*2($inp),$T3
307062306a36Sopenharmony_ci	lea		16*4($inp),$inp
307162306a36Sopenharmony_ci
307262306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
307362306a36Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
307462306a36Sopenharmony_ci
307562306a36Sopenharmony_ci	# at this point 64-bit lanes are ordered as 3-1-2-0
307662306a36Sopenharmony_ci
307762306a36Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
307862306a36Sopenharmony_ci	vporq		$PAD,$T2,$T2
307962306a36Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
308062306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T0
308162306a36Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
308262306a36Sopenharmony_ci	vpsllq		\$20,$T3,$T3
308362306a36Sopenharmony_ci	vporq		$T3,$T1,$T1
308462306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T1
308562306a36Sopenharmony_ci
308662306a36Sopenharmony_ci	sub		\$4,$len
308762306a36Sopenharmony_ci	jz		.Ltail_vpmadd52_4x
308862306a36Sopenharmony_ci	jmp		.Loop_vpmadd52_4x
308962306a36Sopenharmony_ci	ud2
309062306a36Sopenharmony_ci
309162306a36Sopenharmony_ci.align	32
309262306a36Sopenharmony_ci.Linit_vpmadd52:
309362306a36Sopenharmony_ci	vmovq		24($ctx),%x#$S1		# load key
309462306a36Sopenharmony_ci	vmovq		56($ctx),%x#$H2
309562306a36Sopenharmony_ci	vmovq		32($ctx),%x#$S2
309662306a36Sopenharmony_ci	vmovq		40($ctx),%x#$R0
309762306a36Sopenharmony_ci	vmovq		48($ctx),%x#$R1
309862306a36Sopenharmony_ci
309962306a36Sopenharmony_ci	vmovdqa		$R0,$H0
310062306a36Sopenharmony_ci	vmovdqa		$R1,$H1
310162306a36Sopenharmony_ci	vmovdqa		$H2,$R2
310262306a36Sopenharmony_ci
310362306a36Sopenharmony_ci	mov		\$2,%eax
310462306a36Sopenharmony_ci
310562306a36Sopenharmony_ci.Lmul_init_vpmadd52:
310662306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
310762306a36Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
310862306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
310962306a36Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
311062306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
311162306a36Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
311262306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
311362306a36Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
311462306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
311562306a36Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
311662306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
311762306a36Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
311862306a36Sopenharmony_ci
311962306a36Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
312062306a36Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
312162306a36Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
312262306a36Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
312362306a36Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
312462306a36Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
312562306a36Sopenharmony_ci
312662306a36Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
312762306a36Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
312862306a36Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
312962306a36Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
313062306a36Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
313162306a36Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
313262306a36Sopenharmony_ci
313362306a36Sopenharmony_ci	################################################################
313462306a36Sopenharmony_ci	# partial reduction
313562306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
313662306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
313762306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
313862306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
314162306a36Sopenharmony_ci
314262306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
314362306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
314462306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
314562306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
314662306a36Sopenharmony_ci
314762306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
314862306a36Sopenharmony_ci
314962306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
315062306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
315162306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
315262306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
315362306a36Sopenharmony_ci
315462306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
315562306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
315662306a36Sopenharmony_ci
315762306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
315862306a36Sopenharmony_ci
315962306a36Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
316062306a36Sopenharmony_ci	vpandq		$mask44,$H0,$H0
316162306a36Sopenharmony_ci
316262306a36Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
316362306a36Sopenharmony_ci
316462306a36Sopenharmony_ci	dec		%eax
316562306a36Sopenharmony_ci	jz		.Ldone_init_vpmadd52
316662306a36Sopenharmony_ci
316762306a36Sopenharmony_ci	vpunpcklqdq	$R1,$H1,$R1		# 1,2
316862306a36Sopenharmony_ci	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
316962306a36Sopenharmony_ci	vpunpcklqdq	$R2,$H2,$R2
317062306a36Sopenharmony_ci	vpbroadcastq	%x#$H2,%x#$H2
317162306a36Sopenharmony_ci	vpunpcklqdq	$R0,$H0,$R0
317262306a36Sopenharmony_ci	vpbroadcastq	%x#$H0,%x#$H0
317362306a36Sopenharmony_ci
317462306a36Sopenharmony_ci	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
317562306a36Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
317662306a36Sopenharmony_ci	vpaddq		$R1,$S1,$S1
317762306a36Sopenharmony_ci	vpaddq		$R2,$S2,$S2
317862306a36Sopenharmony_ci	vpsllq		\$2,$S1,$S1
317962306a36Sopenharmony_ci	vpsllq		\$2,$S2,$S2
318062306a36Sopenharmony_ci
318162306a36Sopenharmony_ci	jmp		.Lmul_init_vpmadd52
318262306a36Sopenharmony_ci	ud2
318362306a36Sopenharmony_ci
318462306a36Sopenharmony_ci.align	32
318562306a36Sopenharmony_ci.Ldone_init_vpmadd52:
318662306a36Sopenharmony_ci	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
318762306a36Sopenharmony_ci	vinserti128	\$1,%x#$R2,$H2,$R2
318862306a36Sopenharmony_ci	vinserti128	\$1,%x#$R0,$H0,$R0
318962306a36Sopenharmony_ci
319062306a36Sopenharmony_ci	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
319162306a36Sopenharmony_ci	vpermq		\$0b11011000,$R2,$R2
319262306a36Sopenharmony_ci	vpermq		\$0b11011000,$R0,$R0
319362306a36Sopenharmony_ci
319462306a36Sopenharmony_ci	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
319562306a36Sopenharmony_ci	vpaddq		$R1,$S1,$S1
319662306a36Sopenharmony_ci	vpsllq		\$2,$S1,$S1
319762306a36Sopenharmony_ci
319862306a36Sopenharmony_ci	vmovq		0($ctx),%x#$H0		# load current hash value
319962306a36Sopenharmony_ci	vmovq		8($ctx),%x#$H1
320062306a36Sopenharmony_ci	vmovq		16($ctx),%x#$H2
320162306a36Sopenharmony_ci
320262306a36Sopenharmony_ci	test		\$3,$len		# is length 4*n+2?
320362306a36Sopenharmony_ci	jnz		.Ldone_init_vpmadd52_2x
320462306a36Sopenharmony_ci
320562306a36Sopenharmony_ci	vmovdqu64	$R0,64($ctx)		# save key powers
320662306a36Sopenharmony_ci	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
320762306a36Sopenharmony_ci	vmovdqu64	$R1,96($ctx)
320862306a36Sopenharmony_ci	vpbroadcastq	%x#$R1,$R1
320962306a36Sopenharmony_ci	vmovdqu64	$R2,128($ctx)
321062306a36Sopenharmony_ci	vpbroadcastq	%x#$R2,$R2
321162306a36Sopenharmony_ci	vmovdqu64	$S1,160($ctx)
321262306a36Sopenharmony_ci	vpbroadcastq	%x#$S1,$S1
321362306a36Sopenharmony_ci
321462306a36Sopenharmony_ci	jmp		.Lblocks_vpmadd52_4x_key_loaded
321562306a36Sopenharmony_ci	ud2
321662306a36Sopenharmony_ci
321762306a36Sopenharmony_ci.align	32
321862306a36Sopenharmony_ci.Ldone_init_vpmadd52_2x:
321962306a36Sopenharmony_ci	vmovdqu64	$R0,64($ctx)		# save key powers
322062306a36Sopenharmony_ci	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
322162306a36Sopenharmony_ci	vmovdqu64	$R1,96($ctx)
322262306a36Sopenharmony_ci	vpsrldq		\$8,$R1,$R1
322362306a36Sopenharmony_ci	vmovdqu64	$R2,128($ctx)
322462306a36Sopenharmony_ci	vpsrldq		\$8,$R2,$R2
322562306a36Sopenharmony_ci	vmovdqu64	$S1,160($ctx)
322662306a36Sopenharmony_ci	vpsrldq		\$8,$S1,$S1
322762306a36Sopenharmony_ci	jmp		.Lblocks_vpmadd52_2x_key_loaded
322862306a36Sopenharmony_ci	ud2
322962306a36Sopenharmony_ci
323062306a36Sopenharmony_ci.align	32
323162306a36Sopenharmony_ci.Lblocks_vpmadd52_2x_do:
323262306a36Sopenharmony_ci	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
323362306a36Sopenharmony_ci	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
323462306a36Sopenharmony_ci	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
323562306a36Sopenharmony_ci	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
323662306a36Sopenharmony_ci
323762306a36Sopenharmony_ci.Lblocks_vpmadd52_2x_key_loaded:
323862306a36Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
323962306a36Sopenharmony_ci	vpxorq		$T3,$T3,$T3
324062306a36Sopenharmony_ci	lea		16*2($inp),$inp
324162306a36Sopenharmony_ci
324262306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
324362306a36Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
324462306a36Sopenharmony_ci
324562306a36Sopenharmony_ci	# at this point 64-bit lanes are ordered as x-1-x-0
324662306a36Sopenharmony_ci
324762306a36Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
324862306a36Sopenharmony_ci	vporq		$PAD,$T2,$T2
324962306a36Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
325062306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T0
325162306a36Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
325262306a36Sopenharmony_ci	vpsllq		\$20,$T3,$T3
325362306a36Sopenharmony_ci	vporq		$T3,$T1,$T1
325462306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T1
325562306a36Sopenharmony_ci
325662306a36Sopenharmony_ci	jmp		.Ltail_vpmadd52_2x
325762306a36Sopenharmony_ci	ud2
325862306a36Sopenharmony_ci
325962306a36Sopenharmony_ci.align	32
326062306a36Sopenharmony_ci.Loop_vpmadd52_4x:
326162306a36Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
326262306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
326362306a36Sopenharmony_ci	vpaddq		$T1,$H1,$H1
326462306a36Sopenharmony_ci
326562306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
326662306a36Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
326762306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
326862306a36Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
326962306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
327062306a36Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
327162306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
327262306a36Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
327362306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
327462306a36Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
327562306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
327662306a36Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
327762306a36Sopenharmony_ci
327862306a36Sopenharmony_ci	 vmovdqu64	16*0($inp),$T2		# load data
327962306a36Sopenharmony_ci	 vmovdqu64	16*2($inp),$T3
328062306a36Sopenharmony_ci	 lea		16*4($inp),$inp
328162306a36Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
328262306a36Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
328362306a36Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
328462306a36Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
328562306a36Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
328662306a36Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
328762306a36Sopenharmony_ci
328862306a36Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
328962306a36Sopenharmony_ci	 vpunpckhqdq	$T3,$T2,$T3
329062306a36Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
329162306a36Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
329262306a36Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
329362306a36Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
329462306a36Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
329562306a36Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
329662306a36Sopenharmony_ci
329762306a36Sopenharmony_ci	################################################################
329862306a36Sopenharmony_ci	# partial reduction (interleaved with data splat)
329962306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
330062306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
330162306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
330262306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
330362306a36Sopenharmony_ci
330462306a36Sopenharmony_ci	 vpsrlq		\$24,$T3,$T2
330562306a36Sopenharmony_ci	 vporq		$PAD,$T2,$T2
330662306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
330762306a36Sopenharmony_ci
330862306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
330962306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
331062306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
331162306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
331262306a36Sopenharmony_ci
331362306a36Sopenharmony_ci	 vpandq		$mask44,$T1,$T0
331462306a36Sopenharmony_ci	 vpsrlq		\$44,$T1,$T1
331562306a36Sopenharmony_ci	 vpsllq		\$20,$T3,$T3
331662306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
331762306a36Sopenharmony_ci
331862306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
331962306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
332062306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
332162306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
332262306a36Sopenharmony_ci
332362306a36Sopenharmony_ci	  vpaddq	$T2,$H2,$H2		# accumulate input
332462306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
332562306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
332662306a36Sopenharmony_ci
332762306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
332862306a36Sopenharmony_ci	 vporq		$T3,$T1,$T1
332962306a36Sopenharmony_ci	 vpandq		$mask44,$T1,$T1
333062306a36Sopenharmony_ci
333162306a36Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
333262306a36Sopenharmony_ci	vpandq		$mask44,$H0,$H0
333362306a36Sopenharmony_ci
333462306a36Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
333562306a36Sopenharmony_ci
333662306a36Sopenharmony_ci	sub		\$4,$len		# len-=64
333762306a36Sopenharmony_ci	jnz		.Loop_vpmadd52_4x
333862306a36Sopenharmony_ci
333962306a36Sopenharmony_ci.Ltail_vpmadd52_4x:
334062306a36Sopenharmony_ci	vmovdqu64	128($ctx),$R2		# load all key powers
334162306a36Sopenharmony_ci	vmovdqu64	160($ctx),$S1
334262306a36Sopenharmony_ci	vmovdqu64	64($ctx),$R0
334362306a36Sopenharmony_ci	vmovdqu64	96($ctx),$R1
334462306a36Sopenharmony_ci
334562306a36Sopenharmony_ci.Ltail_vpmadd52_2x:
334662306a36Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
334762306a36Sopenharmony_ci	vpaddq		$R2,$S2,$S2
334862306a36Sopenharmony_ci	vpsllq		\$2,$S2,$S2
334962306a36Sopenharmony_ci
335062306a36Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
335162306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
335262306a36Sopenharmony_ci	vpaddq		$T1,$H1,$H1
335362306a36Sopenharmony_ci
335462306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
335562306a36Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
335662306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
335762306a36Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
335862306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
335962306a36Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
336062306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
336162306a36Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
336262306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
336362306a36Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
336462306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
336562306a36Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
336662306a36Sopenharmony_ci
336762306a36Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
336862306a36Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
336962306a36Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
337062306a36Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
337162306a36Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
337262306a36Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
337362306a36Sopenharmony_ci
337462306a36Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
337562306a36Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
337662306a36Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
337762306a36Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
337862306a36Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
337962306a36Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
338062306a36Sopenharmony_ci
338162306a36Sopenharmony_ci	################################################################
338262306a36Sopenharmony_ci	# horizontal addition
338362306a36Sopenharmony_ci
338462306a36Sopenharmony_ci	mov		\$1,%eax
338562306a36Sopenharmony_ci	kmovw		%eax,%k1
338662306a36Sopenharmony_ci	vpsrldq		\$8,$D0lo,$T0
338762306a36Sopenharmony_ci	vpsrldq		\$8,$D0hi,$H0
338862306a36Sopenharmony_ci	vpsrldq		\$8,$D1lo,$T1
338962306a36Sopenharmony_ci	vpsrldq		\$8,$D1hi,$H1
339062306a36Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
339162306a36Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
339262306a36Sopenharmony_ci	vpsrldq		\$8,$D2lo,$T2
339362306a36Sopenharmony_ci	vpsrldq		\$8,$D2hi,$H2
339462306a36Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
339562306a36Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
339662306a36Sopenharmony_ci	 vpermq		\$0x2,$D0lo,$T0
339762306a36Sopenharmony_ci	 vpermq		\$0x2,$D0hi,$H0
339862306a36Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
339962306a36Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
340062306a36Sopenharmony_ci
340162306a36Sopenharmony_ci	vpermq		\$0x2,$D1lo,$T1
340262306a36Sopenharmony_ci	vpermq		\$0x2,$D1hi,$H1
340362306a36Sopenharmony_ci	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
340462306a36Sopenharmony_ci	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
340562306a36Sopenharmony_ci	vpermq		\$0x2,$D2lo,$T2
340662306a36Sopenharmony_ci	vpermq		\$0x2,$D2hi,$H2
340762306a36Sopenharmony_ci	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
340862306a36Sopenharmony_ci	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
340962306a36Sopenharmony_ci	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
341062306a36Sopenharmony_ci	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
341162306a36Sopenharmony_ci
341262306a36Sopenharmony_ci	################################################################
341362306a36Sopenharmony_ci	# partial reduction
341462306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
341562306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
341662306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
341762306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
341862306a36Sopenharmony_ci
341962306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
342062306a36Sopenharmony_ci
342162306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
342262306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
342362306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
342462306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
342562306a36Sopenharmony_ci
342662306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
342762306a36Sopenharmony_ci
342862306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
342962306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
343062306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
343162306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
343262306a36Sopenharmony_ci
343362306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
343462306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
343562306a36Sopenharmony_ci
343662306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
343762306a36Sopenharmony_ci
343862306a36Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
343962306a36Sopenharmony_ci	vpandq		$mask44,$H0,$H0
344062306a36Sopenharmony_ci
344162306a36Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
344262306a36Sopenharmony_ci						# at this point $len is
344362306a36Sopenharmony_ci						# either 4*n+2 or 0...
344462306a36Sopenharmony_ci	sub		\$2,$len		# len-=32
344562306a36Sopenharmony_ci	ja		.Lblocks_vpmadd52_4x_do
344662306a36Sopenharmony_ci
344762306a36Sopenharmony_ci	vmovq		%x#$H0,0($ctx)
344862306a36Sopenharmony_ci	vmovq		%x#$H1,8($ctx)
344962306a36Sopenharmony_ci	vmovq		%x#$H2,16($ctx)
345062306a36Sopenharmony_ci	vzeroall
345162306a36Sopenharmony_ci
345262306a36Sopenharmony_ci.Lno_data_vpmadd52_4x:
345362306a36Sopenharmony_ci	RET
345462306a36Sopenharmony_ci.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
345562306a36Sopenharmony_ci___
345662306a36Sopenharmony_ci}
345762306a36Sopenharmony_ci{
345862306a36Sopenharmony_ci########################################################################
345962306a36Sopenharmony_ci# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
346062306a36Sopenharmony_ci# This is an intermediate version, as it's used only in cases when the input
346162306a36Sopenharmony_ci# length is either 8*n, 8*n+1 or 8*n+2...
346262306a36Sopenharmony_ci
346362306a36Sopenharmony_cimy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
346462306a36Sopenharmony_cimy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
346562306a36Sopenharmony_cimy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
346662306a36Sopenharmony_cimy ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
346762306a36Sopenharmony_ci
346862306a36Sopenharmony_ci$code.=<<___;
346962306a36Sopenharmony_ci.type	poly1305_blocks_vpmadd52_8x,\@function,4
347062306a36Sopenharmony_ci.align	32
347162306a36Sopenharmony_cipoly1305_blocks_vpmadd52_8x:
347262306a36Sopenharmony_ci	shr	\$4,$len
347362306a36Sopenharmony_ci	jz	.Lno_data_vpmadd52_8x		# too short
347462306a36Sopenharmony_ci
347562306a36Sopenharmony_ci	shl	\$40,$padbit
347662306a36Sopenharmony_ci	mov	64($ctx),%r8			# peek on power of the key
347762306a36Sopenharmony_ci
347862306a36Sopenharmony_ci	vmovdqa64	.Lx_mask44(%rip),$mask44
347962306a36Sopenharmony_ci	vmovdqa64	.Lx_mask42(%rip),$mask42
348062306a36Sopenharmony_ci
348162306a36Sopenharmony_ci	test	%r8,%r8				# is power value impossible?
348262306a36Sopenharmony_ci	js	.Linit_vpmadd52			# if it is, then init R[4]
348362306a36Sopenharmony_ci
348462306a36Sopenharmony_ci	vmovq	0($ctx),%x#$H0			# load current hash value
348562306a36Sopenharmony_ci	vmovq	8($ctx),%x#$H1
348662306a36Sopenharmony_ci	vmovq	16($ctx),%x#$H2
348762306a36Sopenharmony_ci
348862306a36Sopenharmony_ci.Lblocks_vpmadd52_8x:
348962306a36Sopenharmony_ci	################################################################
349062306a36Sopenharmony_ci	# first we calculate more key powers
349162306a36Sopenharmony_ci
349262306a36Sopenharmony_ci	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
349362306a36Sopenharmony_ci	vmovdqu64	160($ctx),$S1
349462306a36Sopenharmony_ci	vmovdqu64	64($ctx),$R0
349562306a36Sopenharmony_ci	vmovdqu64	96($ctx),$R1
349662306a36Sopenharmony_ci
349762306a36Sopenharmony_ci	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
349862306a36Sopenharmony_ci	vpaddq		$R2,$S2,$S2
349962306a36Sopenharmony_ci	vpsllq		\$2,$S2,$S2
350062306a36Sopenharmony_ci
350162306a36Sopenharmony_ci	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
350262306a36Sopenharmony_ci	vpbroadcastq	%x#$R0,$RR0
350362306a36Sopenharmony_ci	vpbroadcastq	%x#$R1,$RR1
350462306a36Sopenharmony_ci
350562306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
350662306a36Sopenharmony_ci	vpmadd52luq	$RR2,$S1,$D0lo
350762306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
350862306a36Sopenharmony_ci	vpmadd52huq	$RR2,$S1,$D0hi
350962306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
351062306a36Sopenharmony_ci	vpmadd52luq	$RR2,$S2,$D1lo
351162306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
351262306a36Sopenharmony_ci	vpmadd52huq	$RR2,$S2,$D1hi
351362306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
351462306a36Sopenharmony_ci	vpmadd52luq	$RR2,$R0,$D2lo
351562306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
351662306a36Sopenharmony_ci	vpmadd52huq	$RR2,$R0,$D2hi
351762306a36Sopenharmony_ci
351862306a36Sopenharmony_ci	vpmadd52luq	$RR0,$R0,$D0lo
351962306a36Sopenharmony_ci	vpmadd52huq	$RR0,$R0,$D0hi
352062306a36Sopenharmony_ci	vpmadd52luq	$RR0,$R1,$D1lo
352162306a36Sopenharmony_ci	vpmadd52huq	$RR0,$R1,$D1hi
352262306a36Sopenharmony_ci	vpmadd52luq	$RR0,$R2,$D2lo
352362306a36Sopenharmony_ci	vpmadd52huq	$RR0,$R2,$D2hi
352462306a36Sopenharmony_ci
352562306a36Sopenharmony_ci	vpmadd52luq	$RR1,$S2,$D0lo
352662306a36Sopenharmony_ci	vpmadd52huq	$RR1,$S2,$D0hi
352762306a36Sopenharmony_ci	vpmadd52luq	$RR1,$R0,$D1lo
352862306a36Sopenharmony_ci	vpmadd52huq	$RR1,$R0,$D1hi
352962306a36Sopenharmony_ci	vpmadd52luq	$RR1,$R1,$D2lo
353062306a36Sopenharmony_ci	vpmadd52huq	$RR1,$R1,$D2hi
353162306a36Sopenharmony_ci
353262306a36Sopenharmony_ci	################################################################
353362306a36Sopenharmony_ci	# partial reduction
353462306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
353562306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
353662306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$RR0
353762306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
353862306a36Sopenharmony_ci
353962306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
354062306a36Sopenharmony_ci
354162306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
354262306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
354362306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$RR1
354462306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
354562306a36Sopenharmony_ci
354662306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
354762306a36Sopenharmony_ci
354862306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
354962306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
355062306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$RR2
355162306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
355262306a36Sopenharmony_ci
355362306a36Sopenharmony_ci	vpaddq		$D2hi,$RR0,$RR0
355462306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
355562306a36Sopenharmony_ci
355662306a36Sopenharmony_ci	vpaddq		$D2hi,$RR0,$RR0
355762306a36Sopenharmony_ci
355862306a36Sopenharmony_ci	vpsrlq		\$44,$RR0,$tmp		# additional step
355962306a36Sopenharmony_ci	vpandq		$mask44,$RR0,$RR0
356062306a36Sopenharmony_ci
356162306a36Sopenharmony_ci	vpaddq		$tmp,$RR1,$RR1
356262306a36Sopenharmony_ci
356362306a36Sopenharmony_ci	################################################################
356462306a36Sopenharmony_ci	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
356562306a36Sopenharmony_ci	# is 15263748, which reflects how data is loaded...
356662306a36Sopenharmony_ci
356762306a36Sopenharmony_ci	vpunpcklqdq	$R2,$RR2,$T2		# 3748
356862306a36Sopenharmony_ci	vpunpckhqdq	$R2,$RR2,$R2		# 1526
356962306a36Sopenharmony_ci	vpunpcklqdq	$R0,$RR0,$T0
357062306a36Sopenharmony_ci	vpunpckhqdq	$R0,$RR0,$R0
357162306a36Sopenharmony_ci	vpunpcklqdq	$R1,$RR1,$T1
357262306a36Sopenharmony_ci	vpunpckhqdq	$R1,$RR1,$R1
357362306a36Sopenharmony_ci___
357462306a36Sopenharmony_ci######## switch to %zmm
357562306a36Sopenharmony_cimap(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
357662306a36Sopenharmony_cimap(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
357762306a36Sopenharmony_cimap(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
357862306a36Sopenharmony_cimap(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
357962306a36Sopenharmony_ci
358062306a36Sopenharmony_ci$code.=<<___;
358162306a36Sopenharmony_ci	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
358262306a36Sopenharmony_ci	vshufi64x2	\$0x44,$R0,$T0,$RR0
358362306a36Sopenharmony_ci	vshufi64x2	\$0x44,$R1,$T1,$RR1
358462306a36Sopenharmony_ci
358562306a36Sopenharmony_ci	vmovdqu64	16*0($inp),$T2		# load data
358662306a36Sopenharmony_ci	vmovdqu64	16*4($inp),$T3
358762306a36Sopenharmony_ci	lea		16*8($inp),$inp
358862306a36Sopenharmony_ci
358962306a36Sopenharmony_ci	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
359062306a36Sopenharmony_ci	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
359162306a36Sopenharmony_ci	vpaddq		$RR2,$SS2,$SS2
359262306a36Sopenharmony_ci	vpaddq		$RR1,$SS1,$SS1
359362306a36Sopenharmony_ci	vpsllq		\$2,$SS2,$SS2
359462306a36Sopenharmony_ci	vpsllq		\$2,$SS1,$SS1
359562306a36Sopenharmony_ci
359662306a36Sopenharmony_ci	vpbroadcastq	$padbit,$PAD
359762306a36Sopenharmony_ci	vpbroadcastq	%x#$mask44,$mask44
359862306a36Sopenharmony_ci	vpbroadcastq	%x#$mask42,$mask42
359962306a36Sopenharmony_ci
360062306a36Sopenharmony_ci	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
360162306a36Sopenharmony_ci	vpbroadcastq	%x#$SS2,$S2
360262306a36Sopenharmony_ci	vpbroadcastq	%x#$RR0,$R0
360362306a36Sopenharmony_ci	vpbroadcastq	%x#$RR1,$R1
360462306a36Sopenharmony_ci	vpbroadcastq	%x#$RR2,$R2
360562306a36Sopenharmony_ci
360662306a36Sopenharmony_ci	vpunpcklqdq	$T3,$T2,$T1		# transpose data
360762306a36Sopenharmony_ci	vpunpckhqdq	$T3,$T2,$T3
360862306a36Sopenharmony_ci
360962306a36Sopenharmony_ci	# at this point 64-bit lanes are ordered as 73625140
361062306a36Sopenharmony_ci
361162306a36Sopenharmony_ci	vpsrlq		\$24,$T3,$T2		# splat the data
361262306a36Sopenharmony_ci	vporq		$PAD,$T2,$T2
361362306a36Sopenharmony_ci	 vpaddq		$T2,$H2,$H2		# accumulate input
361462306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T0
361562306a36Sopenharmony_ci	vpsrlq		\$44,$T1,$T1
361662306a36Sopenharmony_ci	vpsllq		\$20,$T3,$T3
361762306a36Sopenharmony_ci	vporq		$T3,$T1,$T1
361862306a36Sopenharmony_ci	vpandq		$mask44,$T1,$T1
361962306a36Sopenharmony_ci
362062306a36Sopenharmony_ci	sub		\$8,$len
362162306a36Sopenharmony_ci	jz		.Ltail_vpmadd52_8x
362262306a36Sopenharmony_ci	jmp		.Loop_vpmadd52_8x
362362306a36Sopenharmony_ci
362462306a36Sopenharmony_ci.align	32
362562306a36Sopenharmony_ci.Loop_vpmadd52_8x:
362662306a36Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
362762306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
362862306a36Sopenharmony_ci	vpaddq		$T1,$H1,$H1
362962306a36Sopenharmony_ci
363062306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
363162306a36Sopenharmony_ci	vpmadd52luq	$H2,$S1,$D0lo
363262306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
363362306a36Sopenharmony_ci	vpmadd52huq	$H2,$S1,$D0hi
363462306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
363562306a36Sopenharmony_ci	vpmadd52luq	$H2,$S2,$D1lo
363662306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
363762306a36Sopenharmony_ci	vpmadd52huq	$H2,$S2,$D1hi
363862306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
363962306a36Sopenharmony_ci	vpmadd52luq	$H2,$R0,$D2lo
364062306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
364162306a36Sopenharmony_ci	vpmadd52huq	$H2,$R0,$D2hi
364262306a36Sopenharmony_ci
364362306a36Sopenharmony_ci	 vmovdqu64	16*0($inp),$T2		# load data
364462306a36Sopenharmony_ci	 vmovdqu64	16*4($inp),$T3
364562306a36Sopenharmony_ci	 lea		16*8($inp),$inp
364662306a36Sopenharmony_ci	vpmadd52luq	$H0,$R0,$D0lo
364762306a36Sopenharmony_ci	vpmadd52huq	$H0,$R0,$D0hi
364862306a36Sopenharmony_ci	vpmadd52luq	$H0,$R1,$D1lo
364962306a36Sopenharmony_ci	vpmadd52huq	$H0,$R1,$D1hi
365062306a36Sopenharmony_ci	vpmadd52luq	$H0,$R2,$D2lo
365162306a36Sopenharmony_ci	vpmadd52huq	$H0,$R2,$D2hi
365262306a36Sopenharmony_ci
365362306a36Sopenharmony_ci	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
365462306a36Sopenharmony_ci	 vpunpckhqdq	$T3,$T2,$T3
365562306a36Sopenharmony_ci	vpmadd52luq	$H1,$S2,$D0lo
365662306a36Sopenharmony_ci	vpmadd52huq	$H1,$S2,$D0hi
365762306a36Sopenharmony_ci	vpmadd52luq	$H1,$R0,$D1lo
365862306a36Sopenharmony_ci	vpmadd52huq	$H1,$R0,$D1hi
365962306a36Sopenharmony_ci	vpmadd52luq	$H1,$R1,$D2lo
366062306a36Sopenharmony_ci	vpmadd52huq	$H1,$R1,$D2hi
366162306a36Sopenharmony_ci
366262306a36Sopenharmony_ci	################################################################
366362306a36Sopenharmony_ci	# partial reduction (interleaved with data splat)
366462306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
366562306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
366662306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
366762306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
366862306a36Sopenharmony_ci
366962306a36Sopenharmony_ci	 vpsrlq		\$24,$T3,$T2
367062306a36Sopenharmony_ci	 vporq		$PAD,$T2,$T2
367162306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
367262306a36Sopenharmony_ci
367362306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
367462306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
367562306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
367662306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
367762306a36Sopenharmony_ci
367862306a36Sopenharmony_ci	 vpandq		$mask44,$T1,$T0
367962306a36Sopenharmony_ci	 vpsrlq		\$44,$T1,$T1
368062306a36Sopenharmony_ci	 vpsllq		\$20,$T3,$T3
368162306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
368262306a36Sopenharmony_ci
368362306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
368462306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
368562306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
368662306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
368762306a36Sopenharmony_ci
368862306a36Sopenharmony_ci	  vpaddq	$T2,$H2,$H2		# accumulate input
368962306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
369062306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
369162306a36Sopenharmony_ci
369262306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
369362306a36Sopenharmony_ci	 vporq		$T3,$T1,$T1
369462306a36Sopenharmony_ci	 vpandq		$mask44,$T1,$T1
369562306a36Sopenharmony_ci
369662306a36Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
369762306a36Sopenharmony_ci	vpandq		$mask44,$H0,$H0
369862306a36Sopenharmony_ci
369962306a36Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
370062306a36Sopenharmony_ci
370162306a36Sopenharmony_ci	sub		\$8,$len		# len-=128
370262306a36Sopenharmony_ci	jnz		.Loop_vpmadd52_8x
370362306a36Sopenharmony_ci
370462306a36Sopenharmony_ci.Ltail_vpmadd52_8x:
370562306a36Sopenharmony_ci	#vpaddq		$T2,$H2,$H2		# accumulate input
370662306a36Sopenharmony_ci	vpaddq		$T0,$H0,$H0
370762306a36Sopenharmony_ci	vpaddq		$T1,$H1,$H1
370862306a36Sopenharmony_ci
370962306a36Sopenharmony_ci	vpxorq		$D0lo,$D0lo,$D0lo
371062306a36Sopenharmony_ci	vpmadd52luq	$H2,$SS1,$D0lo
371162306a36Sopenharmony_ci	vpxorq		$D0hi,$D0hi,$D0hi
371262306a36Sopenharmony_ci	vpmadd52huq	$H2,$SS1,$D0hi
371362306a36Sopenharmony_ci	vpxorq		$D1lo,$D1lo,$D1lo
371462306a36Sopenharmony_ci	vpmadd52luq	$H2,$SS2,$D1lo
371562306a36Sopenharmony_ci	vpxorq		$D1hi,$D1hi,$D1hi
371662306a36Sopenharmony_ci	vpmadd52huq	$H2,$SS2,$D1hi
371762306a36Sopenharmony_ci	vpxorq		$D2lo,$D2lo,$D2lo
371862306a36Sopenharmony_ci	vpmadd52luq	$H2,$RR0,$D2lo
371962306a36Sopenharmony_ci	vpxorq		$D2hi,$D2hi,$D2hi
372062306a36Sopenharmony_ci	vpmadd52huq	$H2,$RR0,$D2hi
372162306a36Sopenharmony_ci
372262306a36Sopenharmony_ci	vpmadd52luq	$H0,$RR0,$D0lo
372362306a36Sopenharmony_ci	vpmadd52huq	$H0,$RR0,$D0hi
372462306a36Sopenharmony_ci	vpmadd52luq	$H0,$RR1,$D1lo
372562306a36Sopenharmony_ci	vpmadd52huq	$H0,$RR1,$D1hi
372662306a36Sopenharmony_ci	vpmadd52luq	$H0,$RR2,$D2lo
372762306a36Sopenharmony_ci	vpmadd52huq	$H0,$RR2,$D2hi
372862306a36Sopenharmony_ci
372962306a36Sopenharmony_ci	vpmadd52luq	$H1,$SS2,$D0lo
373062306a36Sopenharmony_ci	vpmadd52huq	$H1,$SS2,$D0hi
373162306a36Sopenharmony_ci	vpmadd52luq	$H1,$RR0,$D1lo
373262306a36Sopenharmony_ci	vpmadd52huq	$H1,$RR0,$D1hi
373362306a36Sopenharmony_ci	vpmadd52luq	$H1,$RR1,$D2lo
373462306a36Sopenharmony_ci	vpmadd52huq	$H1,$RR1,$D2hi
373562306a36Sopenharmony_ci
373662306a36Sopenharmony_ci	################################################################
373762306a36Sopenharmony_ci	# horizontal addition
373862306a36Sopenharmony_ci
373962306a36Sopenharmony_ci	mov		\$1,%eax
374062306a36Sopenharmony_ci	kmovw		%eax,%k1
374162306a36Sopenharmony_ci	vpsrldq		\$8,$D0lo,$T0
374262306a36Sopenharmony_ci	vpsrldq		\$8,$D0hi,$H0
374362306a36Sopenharmony_ci	vpsrldq		\$8,$D1lo,$T1
374462306a36Sopenharmony_ci	vpsrldq		\$8,$D1hi,$H1
374562306a36Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
374662306a36Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
374762306a36Sopenharmony_ci	vpsrldq		\$8,$D2lo,$T2
374862306a36Sopenharmony_ci	vpsrldq		\$8,$D2hi,$H2
374962306a36Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
375062306a36Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
375162306a36Sopenharmony_ci	 vpermq		\$0x2,$D0lo,$T0
375262306a36Sopenharmony_ci	 vpermq		\$0x2,$D0hi,$H0
375362306a36Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
375462306a36Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
375562306a36Sopenharmony_ci
375662306a36Sopenharmony_ci	vpermq		\$0x2,$D1lo,$T1
375762306a36Sopenharmony_ci	vpermq		\$0x2,$D1hi,$H1
375862306a36Sopenharmony_ci	vpaddq		$T0,$D0lo,$D0lo
375962306a36Sopenharmony_ci	vpaddq		$H0,$D0hi,$D0hi
376062306a36Sopenharmony_ci	vpermq		\$0x2,$D2lo,$T2
376162306a36Sopenharmony_ci	vpermq		\$0x2,$D2hi,$H2
376262306a36Sopenharmony_ci	vpaddq		$T1,$D1lo,$D1lo
376362306a36Sopenharmony_ci	vpaddq		$H1,$D1hi,$D1hi
376462306a36Sopenharmony_ci	 vextracti64x4	\$1,$D0lo,%y#$T0
376562306a36Sopenharmony_ci	 vextracti64x4	\$1,$D0hi,%y#$H0
376662306a36Sopenharmony_ci	vpaddq		$T2,$D2lo,$D2lo
376762306a36Sopenharmony_ci	vpaddq		$H2,$D2hi,$D2hi
376862306a36Sopenharmony_ci
376962306a36Sopenharmony_ci	vextracti64x4	\$1,$D1lo,%y#$T1
377062306a36Sopenharmony_ci	vextracti64x4	\$1,$D1hi,%y#$H1
377162306a36Sopenharmony_ci	vextracti64x4	\$1,$D2lo,%y#$T2
377262306a36Sopenharmony_ci	vextracti64x4	\$1,$D2hi,%y#$H2
377362306a36Sopenharmony_ci___
377462306a36Sopenharmony_ci######## switch back to %ymm
377562306a36Sopenharmony_cimap(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
377662306a36Sopenharmony_cimap(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
377762306a36Sopenharmony_cimap(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
377862306a36Sopenharmony_ci
377962306a36Sopenharmony_ci$code.=<<___;
378062306a36Sopenharmony_ci	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
378162306a36Sopenharmony_ci	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
378262306a36Sopenharmony_ci	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
378362306a36Sopenharmony_ci	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
378462306a36Sopenharmony_ci	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
378562306a36Sopenharmony_ci	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
378662306a36Sopenharmony_ci
378762306a36Sopenharmony_ci	################################################################
378862306a36Sopenharmony_ci	# partial reduction
378962306a36Sopenharmony_ci	vpsrlq		\$44,$D0lo,$tmp
379062306a36Sopenharmony_ci	vpsllq		\$8,$D0hi,$D0hi
379162306a36Sopenharmony_ci	vpandq		$mask44,$D0lo,$H0
379262306a36Sopenharmony_ci	vpaddq		$tmp,$D0hi,$D0hi
379362306a36Sopenharmony_ci
379462306a36Sopenharmony_ci	vpaddq		$D0hi,$D1lo,$D1lo
379562306a36Sopenharmony_ci
379662306a36Sopenharmony_ci	vpsrlq		\$44,$D1lo,$tmp
379762306a36Sopenharmony_ci	vpsllq		\$8,$D1hi,$D1hi
379862306a36Sopenharmony_ci	vpandq		$mask44,$D1lo,$H1
379962306a36Sopenharmony_ci	vpaddq		$tmp,$D1hi,$D1hi
380062306a36Sopenharmony_ci
380162306a36Sopenharmony_ci	vpaddq		$D1hi,$D2lo,$D2lo
380262306a36Sopenharmony_ci
380362306a36Sopenharmony_ci	vpsrlq		\$42,$D2lo,$tmp
380462306a36Sopenharmony_ci	vpsllq		\$10,$D2hi,$D2hi
380562306a36Sopenharmony_ci	vpandq		$mask42,$D2lo,$H2
380662306a36Sopenharmony_ci	vpaddq		$tmp,$D2hi,$D2hi
380762306a36Sopenharmony_ci
380862306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
380962306a36Sopenharmony_ci	vpsllq		\$2,$D2hi,$D2hi
381062306a36Sopenharmony_ci
381162306a36Sopenharmony_ci	vpaddq		$D2hi,$H0,$H0
381262306a36Sopenharmony_ci
381362306a36Sopenharmony_ci	vpsrlq		\$44,$H0,$tmp		# additional step
381462306a36Sopenharmony_ci	vpandq		$mask44,$H0,$H0
381562306a36Sopenharmony_ci
381662306a36Sopenharmony_ci	vpaddq		$tmp,$H1,$H1
381762306a36Sopenharmony_ci
381862306a36Sopenharmony_ci	################################################################
381962306a36Sopenharmony_ci
382062306a36Sopenharmony_ci	vmovq		%x#$H0,0($ctx)
382162306a36Sopenharmony_ci	vmovq		%x#$H1,8($ctx)
382262306a36Sopenharmony_ci	vmovq		%x#$H2,16($ctx)
382362306a36Sopenharmony_ci	vzeroall
382462306a36Sopenharmony_ci
382562306a36Sopenharmony_ci.Lno_data_vpmadd52_8x:
382662306a36Sopenharmony_ci	RET
382762306a36Sopenharmony_ci.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
382862306a36Sopenharmony_ci___
382962306a36Sopenharmony_ci}
# poly1305_emit_base2_44: finalization routine for the base 2^44 code path.
#
# The emitted assembly
#   1. loads the three hash limbs stored at 0/8/16($ctx);
#   2. packs them into one value held in %r8 (low), %r9 and %r10 (high):
#      limb 1 is added in at bit 44 and limb 2 at bit 88;
#   3. computes value+5 and, when that sum has any bit set at position 130
#      or above, uses its low 128 bits instead of those of the original
#      value (the comparison against the modulus);
#   4. adds the 128-bit quantity at 0/8($nonce) and stores the 16-byte
#      result at 0/8($mac).
#
# $ctx, $nonce and $mac are register names interpolated into the heredoc;
# they are assigned outside this statement.
$code.=<<___;
.type	poly1305_emit_base2_44,\@function,3
.align	32
poly1305_emit_base2_44:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r9,%rax
	shr	\$20,%r9
	shl	\$44,%rax
	mov	%r10,%rcx
	shr	\$40,%r10
	shl	\$24,%rcx

	add	%rax,%r8
	adc	%rcx,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
386662306a36Sopenharmony_ci}	}	}
386762306a36Sopenharmony_ci}
386862306a36Sopenharmony_ci
# chacha20-poly1305 helpers; emitted only when $kernel is false.
#
# Both routines are declared \@abi-omnipotent, so the four argument
# registers (out, inp, otp, len) are selected here by hand for the Win64 or
# the Unix calling convention.
#
# xor128_encrypt_n_pad:
#   XORs len bytes of inp with the bytes at otp and stores the result both
#   at out and back into otp.  Whole 16-byte blocks go through %xmm0, the
#   remainder is handled byte by byte.  When len is not a multiple of 16
#   the otp buffer is then zero-filled up to the next 16-byte boundary.
#   Returns the advanced otp pointer in %rax.
#
# xor128_decrypt_n_pad:
#   Same walk, but the XOR result is stored only at out, while the
#   unmodified input bytes are stored into otp.  Padding and return value
#   are as above.
#
# otp is accessed with movdqa, so it has to be 16-byte aligned; inp and out
# are accessed with movdqu.
#
# NOTE(review): with len == 0 control reaches the byte loop with a zero
# counter in %r10, which is decremented before it is tested.  Callers are
# presumably required to pass len > 0 -- confirm against the call sites.
if (!$kernel)
{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
$code.=<<___;
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,\@abi-omnipotent
.align	16
xor128_encrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	($inp,$otp),%xmm0
	pxor	($otp),%xmm0
	movdqu	%xmm0,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_enc_xmm

	and	\$15,%r10		# len % 16
	jz	.Ldone_enc

.Ltail_enc:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
.Loop_enc_byte:
	mov	($inp,$otp),%al
	xor	($otp),%al
	mov	%al,($out,$otp)
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_enc_byte

	xor	%eax,%eax
.Loop_enc_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_enc_pad

.Ldone_enc:
	mov	$otp,%rax
	RET
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,\@abi-omnipotent
.align	16
xor128_decrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	($inp,$otp),%xmm0
	movdqa	($otp),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	and	\$15,%r10		# len % 16
	jz	.Ldone_dec

.Ltail_dec:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
	xor	%r11d,%r11d
.Loop_dec_byte:
	mov	($inp,$otp),%r11b
	mov	($otp),%al
	xor	%r11b,%al
	mov	%al,($out,$otp)
	mov	%r11b,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_dec_byte

	xor	%eax,%eax
.Loop_dec_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_dec_pad

.Ldone_dec:
	mov	$otp,%rax
	RET
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
397362306a36Sopenharmony_ci
# Win64 structured-exception support; emitted only when $win64 is set.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two language-specific handlers are generated, sharing .Lcommon_seh_tail:
#
#   se_handler   compares context->Rip with the two RVAs stored in
#                HandlerData[] (see the .xdata entries below).  Below the
#                first label the frame base is taken from context->Rax; at or
#                above the second label it is context->Rsp.  In between,
#                %rbx, %rbp and %r12-%r15 are read back from the stack and
#                written into the context record.
#   avx_handler  performs the same range check, but inside the range takes
#                the frame base from context->R11 and copies 20 quadwords
#                from 0x50 above it into context.Xmm6 onwards.
#
# The common tail restores Rsp/Rsi/Rdi in the context, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
#
# The .pdata section lists one (begin, end, unwind-info) triple per covered
# code range; the AVX, AVX2 and AVX-512 entries are only emitted for the
# corresponding $avx level.  The .xdata section holds the matching unwind
# info: a handler reference followed by the HandlerData[] label pair.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	RET
.size	avx_handler,.-avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_init_x86_64
	.rva	.LSEH_end_poly1305_init_x86_64
	.rva	.LSEH_info_poly1305_init_x86_64

	.rva	.LSEH_begin_poly1305_blocks_x86_64
	.rva	.LSEH_end_poly1305_blocks_x86_64
	.rva	.LSEH_info_poly1305_blocks_x86_64

	.rva	.LSEH_begin_poly1305_emit_x86_64
	.rva	.LSEH_end_poly1305_emit_x86_64
	.rva	.LSEH_info_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_poly1305_blocks_avx512
	.rva	.LSEH_end_poly1305_blocks_avx512
	.rva	.LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_poly1305_init_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64

.LSEH_info_poly1305_blocks_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
___
}
422562306a36Sopenharmony_ci
# Copy this script's own leading comment block (licence and design notes)
# to the output, rewriting the leading '#' of each line to '//'.  The
# shebang line is skipped, empty lines are passed through, and copying stops
# at the first line that is neither a comment nor empty.
#
# The script is opened with the three-argument form so that the file name in
# $0 can never be interpreted as an open mode.  Failure to read the header
# is reported but is not fatal, as the generated code does not depend on it.
if (open(my $self,'<',$0)) {
	while (my $line = <$self>) {
		next if ($line =~ /^#!/);
		last if (!($line =~ s/^#/\/\//) and $line !~ /^$/);
		print $line;
	}
	close $self;
} else {
	warn "$0: cannot read own header comment: $!\n";
}
423362306a36Sopenharmony_ci
# Post-process the accumulated assembly text line by line and print it:
#   - a `expr` span is replaced by the result of evaluating expr as Perl;
#   - a "#d" suffix selects the 32-bit name of a general-purpose register
#     (%rax#d -> %eax, %r8#d -> %r8d);
#   - a "%x#", "%y#" or "%z#" prefix forces the xmm/ymm/zmm view of the
#     vector register that follows it (%x#%ymm0 -> %xmm0).
# For kernel builds the trailing argument count is dropped from .type
# directives, \@abi-omnipotent is rewritten to \@function, and lines starting
# with .cfi are omitted.
foreach (split(/\n/,$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	if ($kernel) {
		s/(^\.type.*),[0-9]+$/$1/;
		s/(^\.type.*),\@abi-omnipotent+$/$1,\@function/;
		next if /^\.cfi.*/;
	}

	print $_,"\n";
}
# Buffered write errors (a full disk, a closed pipe) only surface when the
# handle is closed; fail loudly instead of leaving a truncated output file.
close STDOUT or die "error closing STDOUT: $!";
4249