11cb0ef41Sopenharmony_ci#! /usr/bin/env perl
21cb0ef41Sopenharmony_ci# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
31cb0ef41Sopenharmony_ci#
41cb0ef41Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
51cb0ef41Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
61cb0ef41Sopenharmony_ci# in the file LICENSE in the source distribution or at
71cb0ef41Sopenharmony_ci# https://www.openssl.org/source/license.html
81cb0ef41Sopenharmony_ci
91cb0ef41Sopenharmony_ci#
101cb0ef41Sopenharmony_ci# ====================================================================
111cb0ef41Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
121cb0ef41Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
131cb0ef41Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
141cb0ef41Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
151cb0ef41Sopenharmony_ci# ====================================================================
161cb0ef41Sopenharmony_ci#
171cb0ef41Sopenharmony_ci# This module implements Poly1305 hash for x86.
181cb0ef41Sopenharmony_ci#
191cb0ef41Sopenharmony_ci# April 2015
201cb0ef41Sopenharmony_ci#
211cb0ef41Sopenharmony_ci# Numbers are cycles per processed byte with poly1305_blocks alone,
221cb0ef41Sopenharmony_ci# measured with rdtsc at fixed clock frequency.
231cb0ef41Sopenharmony_ci#
241cb0ef41Sopenharmony_ci#		IALU/gcc-3.4(*)	SSE2(**)	AVX2
251cb0ef41Sopenharmony_ci# Pentium	15.7/+80%	-
261cb0ef41Sopenharmony_ci# PIII		6.21/+90%	-
271cb0ef41Sopenharmony_ci# P4		19.8/+40%	3.24
281cb0ef41Sopenharmony_ci# Core 2	4.85/+90%	1.80
291cb0ef41Sopenharmony_ci# Westmere	4.58/+100%	1.43
301cb0ef41Sopenharmony_ci# Sandy Bridge	3.90/+100%	1.36
311cb0ef41Sopenharmony_ci# Haswell	3.88/+70%	1.18		0.72
321cb0ef41Sopenharmony_ci# Skylake	3.10/+60%	1.14		0.62
331cb0ef41Sopenharmony_ci# Silvermont	11.0/+40%	4.80
341cb0ef41Sopenharmony_ci# Goldmont	4.10/+200%	2.10
351cb0ef41Sopenharmony_ci# VIA Nano	6.71/+90%	2.47
361cb0ef41Sopenharmony_ci# Sledgehammer	3.51/+180%	4.27
371cb0ef41Sopenharmony_ci# Bulldozer	4.53/+140%	1.31
381cb0ef41Sopenharmony_ci#
391cb0ef41Sopenharmony_ci# (*)	gcc 4.8 for some reason generated worse code;
# (**)	besides SSE2 there are floating-point and AVX options; FP
#	is deemed unnecessary, because pre-SSE2 processors are too
#	old to care about, while it's not the fastest option on
#	SSE2-capable ones; AVX is omitted, because it doesn't give
#	a lot of improvement, 5-10% depending on processor;
451cb0ef41Sopenharmony_ci
# Make the shared perlasm helpers reachable: search the directory this
# script lives in and ../../perlasm relative to it.  When $0 carries no
# directory component the match fails and $dir interpolates as "".
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; the generated
# code is written through STDOUT.  Three-argument open keeps the file
# name from ever being parsed as part of the open mode, and a failure to
# create the file is fatal instead of silently leaving the output on the
# inherited STDOUT.  "-" keeps the meaning it has for two-argument open
# (">-"), i.e. STDOUT is left untouched.
$output=pop;
if ($output && $output ne "-") {
	open(STDOUT,">",$output) || die "can't open $output: $!";
}

# First remaining argument is handed to asm_init as is; the second
# asm_init argument is true when the last remaining argument is "386".
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $sse2	- set when -DOPENSSL_IA32_SSE2 is among the arguments, enables the
#	  vector code paths;
# $avx	- 0, 1 or 2 depending on the detected assembler version;
#	  poly1305_init wires in the AVX2 block routine only if $avx>1.
$sse2=$avx=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

if ($sse2) {
	&static_label("const_sse2");
	&static_label("enter_blocks");
	&static_label("enter_emit");
	&external_label("OPENSSL_ia32cap_P");

	# GNU assembler, queried through the compiler driver named by $CC
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22);
	}

	# NASM, only consulted for the win32n flavour
	if (!$avx && $ARGV[0] eq "win32n" &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.09) + ($1>=2.10);
	}

	# clang/LLVM integrated assembler
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
}
771cb0ef41Sopenharmony_ci
781cb0ef41Sopenharmony_ci########################################################################
791cb0ef41Sopenharmony_ci# Layout of opaque area is following.
801cb0ef41Sopenharmony_ci#
811cb0ef41Sopenharmony_ci#	unsigned __int32 h[5];		# current hash value base 2^32
821cb0ef41Sopenharmony_ci#	unsigned __int32 pad;		# is_base2_26 in vector context
831cb0ef41Sopenharmony_ci#	unsigned __int32 r[4];		# key value base 2^32
841cb0ef41Sopenharmony_ci
&align(64);
&function_begin("poly1305_init");
{
	# Registers holding the three arguments; eax/ebx/ecx/edx are scratch.
	my ($ctx,$key,$table) = ("edi","esi","ebp");

	&mov	($ctx,&wparam(0));		# context
	&mov	($key,&wparam(1));		# key
	&mov	($table,&wparam(2));		# function table

	# zero hash value h[0..4] and the is_base2_26 word that follows it
	&xor	("eax","eax");
	foreach my $word (0..5) {
		&mov	(&DWP(4*$word,$ctx),"eax");
	}

	# without a key there is nothing more to do, eax is still 0
	&cmp	($key,0);
	&je	(&label("nokey"));

	if ($sse2) {
		# operand "symbol-pic_point(%ebx)", i.e. address of symbol
		# relative to the program counter captured in ebx
		my $pcrel = sub { &DWP($_[0]."-".&label("pic_point"),"ebx") };
		my $simd  = 1<<26|1<<24;	# SSE2 and XMM?

		&call	(&label("pic_point"));
	&set_label("pic_point");
		&blindpop("ebx");

		# default to the integer-only routines...
		&lea	("eax",$pcrel->("poly1305_blocks"));
		&lea	("edx",$pcrel->("poly1305_emit"));

		&picmeup($ctx,"OPENSSL_ia32cap_P","ebx",&label("pic_point"));
		&mov	("ecx",&DWP(0,$ctx));
		&and	("ecx",$simd);
		&cmp	("ecx",$simd);
		&jne	(&label("no_sse2"));

		# ...upgrade to SSE2 when both capability bits are set...
		&lea	("eax",$pcrel->("_poly1305_blocks_sse2"));
		&lea	("edx",$pcrel->("_poly1305_emit_sse2"));

		if ($avx>1) {
			# ...and to AVX2 for the block routine if available
			&mov	("ecx",&DWP(8,$ctx));
			&test	("ecx",1<<5);		# AVX2?
			&jz	(&label("no_sse2"));

			&lea	("eax",$pcrel->("_poly1305_blocks_avx2"));
		}
	&set_label("no_sse2");
		&mov	($ctx,&wparam(0));		# reload context
		&mov	(&DWP(0,$table),"eax");		# fill function table
		&mov	(&DWP(4,$table),"edx");
	}

	# load input key, clamp it and store it as r[0..3] at ctx+4*6
	my @limb  = ("eax","ebx","ecx","edx");
	my @clamp = (0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);

	foreach my $i (0..3) { &mov	($limb[$i],&DWP(4*$i,$key)); }
	foreach my $i (0..3) { &and	($limb[$i],$clamp[$i]); }
	foreach my $i (0..3) { &mov	(&DWP(4*(6+$i),$ctx),$limb[$i]); }

	# return value: 1 if the function table was filled in, else 0
	&mov	("eax",$sse2);
&set_label("nokey");
}
&function_end("poly1305_init");
1481cb0ef41Sopenharmony_ci
# Byte offsets of the sixteen 32-bit scratch slots poly1305_blocks keeps
# in the frame it allocates with stack_push(16):
#
#	h0-h4	hash value with the current block added in (only h0, h3
#		and h4 are actually spilled, h1/h2 stay in ebx/ecx);
#	d0-d3	finished low words of the product (d3 is not referenced
#		by the code below);
#	r0-r3	key words copied from the context;
#	s1-s3	r1-r3 plus r1-r3 shifted right by 2.
($h0,$h1,$h2,$h3,$h4,
 $d0,$d1,$d2,$d3,
 $r0,$r1,$r2,$r3,
     $s1,$s2,$s3)=map(4*$_,(0..15));

########################################################################
# poly1305_blocks: integer-only block routine.
#
# Arguments, in wparam order: ctx, inp, len, padbit.  For every 16 bytes
# of input the block (plus padbit as the 129th bit) is added to the hash
# value, the sum is multiplied by the key and partially reduced:
#
#	d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1
#	d1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1
#	d2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2
#	d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3
#	d4 = h4*r0
#
# The updated hash value is written back to ctx[0..4] on exit.  The
# "enter_blocks" label lets other code join after the argument loads.
&function_begin("poly1305_blocks");
	&mov	("edi",&wparam(0));		# ctx
	&mov	("esi",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
&set_label("enter_blocks");
	# Round len down to a whole number of 16-byte blocks.  The loop
	# below advances in 16-byte steps and stops only on exact equality
	# with the end pointer, so every one of the low four bits has to
	# be cleared here.
	&and	("ecx",-16);
	&jz	(&label("nodata"));

	&stack_push(16);
	&mov	("eax",&DWP(4*6,"edi"));	# r0
	&mov	("ebx",&DWP(4*7,"edi"));	# r1
	 &lea	("ebp",&DWP(0,"esi","ecx"));	# end of input
	&mov	("ecx",&DWP(4*8,"edi"));	# r2
	&mov	("edx",&DWP(4*9,"edi"));	# r3

	# the len argument slot is recycled to hold the end-of-input
	# pointer, ebp becomes the running input pointer
	&mov	(&wparam(2),"ebp");
	&mov	("ebp","esi");

	# copy r0-r3 to the frame and derive s1-s3 = r + (r>>2)
	&mov	(&DWP($r0,"esp"),"eax");	# r0
	&mov	("eax","ebx");
	&shr	("eax",2);
	&mov	(&DWP($r1,"esp"),"ebx");	# r1
	&add	("eax","ebx");			# s1
	&mov	("ebx","ecx");
	&shr	("ebx",2);
	&mov	(&DWP($r2,"esp"),"ecx");	# r2
	&add	("ebx","ecx");			# s2
	&mov	("ecx","edx");
	&shr	("ecx",2);
	&mov	(&DWP($r3,"esp"),"edx");	# r3
	&add	("ecx","edx");			# s3
	&mov	(&DWP($s1,"esp"),"eax");	# s1
	&mov	(&DWP($s2,"esp"),"ebx");	# s2
	&mov	(&DWP($s3,"esp"),"ecx");	# s3

	&mov	("eax",&DWP(4*0,"edi"));	# load hash value
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("esi",&DWP(4*3,"edi"));
	&mov	("edi",&DWP(4*4,"edi"));
	&jmp	(&label("loop"));

&set_label("loop",32);
	&add	("eax",&DWP(4*0,"ebp"));	# accumulate input
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("esi",&DWP(4*3,"ebp"));
	&lea	("ebp",&DWP(4*4,"ebp"));	# lea leaves the carry intact
	&adc	("edi",&wparam(3));		# padbit

	&mov	(&DWP($h0,"esp"),"eax");	# put aside hash[+inp]
	&mov	(&DWP($h3,"esp"),"esi");

	# d0, accumulated in esi:edi
	&mul	(&DWP($r0,"esp"));		# h0*r0
	 &mov	(&DWP($h4,"esp"),"edi");
	&mov	("edi","eax");
	&mov	("eax","ebx");			# h1
	&mov	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h1*s3
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($s2,"esp"));		# h2*s2
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s1,"esp"));		# h3*s1
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi","edx");

	# d1, accumulated in edi:esi on top of the carry out of d0
	&mul	(&DWP($r1,"esp"));		# h0*r1
	 &mov	(&DWP($d0,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h1*r0
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($s3,"esp"));		# h2*s3
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($s2,"esp"));		# h3*s2
	&add	("esi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("edi","edx");
	&imul	("eax",&DWP($s1,"esp"));	# h4*s1
	&add	("esi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("edi",0);

	# d2, accumulated in esi:edi on top of the carry out of d1
	&mul	(&DWP($r2,"esp"));		# h0*r2
	 &mov	(&DWP($d1,"esp"),"esi");
	&xor	("esi","esi");
	&add	("edi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("esi","edx");
	&mul	(&DWP($r1,"esp"));		# h1*r1
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($r0,"esp"));		# h2*r0
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h3*s3
	&add	("edi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("esi","edx");
	&imul	("eax",&DWP($s2,"esp"));	# h4*s2
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi",0);

	# d3, accumulated in edi:esi on top of the carry out of d2
	&mul	(&DWP($r3,"esp"));		# h0*r3
	 &mov	(&DWP($d2,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r2,"esp"));		# h1*r2
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($r1,"esp"));		# h2*r1
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h3*r0
	&add	("esi","eax");
	 &mov	("ecx",&DWP($h4,"esp"));
	&adc	("edi","edx");

	&mov	("edx","ecx");
	&imul	("ecx",&DWP($s3,"esp"));	# h4*s3
	&add	("esi","ecx");
	 &mov	("eax",&DWP($d0,"esp"));
	&adc	("edi",0);

	# d4 = h4*r0 plus the carry out of d3
	&imul	("edx",&DWP($r0,"esp"));	# h4*r0
	&add	("edx","edi");

	&mov	("ebx",&DWP($d1,"esp"));
	&mov	("ecx",&DWP($d2,"esp"));

	# keep the two low bits of d4 as the new h4 and fold everything
	# above them, multiplied by 5, back into the bottom of the value
	&mov	("edi","edx");			# last reduction step
	&shr	("edx",2);
	&and	("edi",3);
	&lea	("edx",&DWP(0,"edx","edx",4));	# *5
	&add	("eax","edx");
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("esi",0);
	&adc	("edi",0);

	&cmp	("ebp",&wparam(2));		# done yet?
	&jne	(&label("loop"));

	&mov	("edx",&wparam(0));		# ctx
	&stack_pop(16);
	&mov	(&DWP(4*0,"edx"),"eax");	# store hash value
	&mov	(&DWP(4*1,"edx"),"ebx");
	&mov	(&DWP(4*2,"edx"),"ecx");
	&mov	(&DWP(4*3,"edx"),"esi");
	&mov	(&DWP(4*4,"edx"),"edi");
&set_label("nodata");
&function_end("poly1305_blocks");
3241cb0ef41Sopenharmony_ci
&function_begin("poly1305_emit");
{
	# eax:ebx:ecx:edx carry the 128-bit value being assembled, esi
	# first holds h4 and is then turned into the selection mask.
	my @val  = ("eax","ebx","ecx","edx");
	my $pick = "esi";

	&mov	("ebp",&wparam(0));		# context
&set_label("enter_emit");
	&mov	("edi",&wparam(1));		# output

	# load hash value
	foreach my $i (0..3) { &mov	($val[$i],&DWP(4*$i,"ebp")); }
	&mov	($pick,&DWP(4*4,"ebp"));

	# compare to modulus: add 5 and let the carry ripple up into h4
	&add	($val[0],5);
	foreach my $reg (@val[1..3],$pick) { &adc	($reg,0); }
	&shr	($pick,2);			# did it carry/borrow?
	&neg	($pick);			# do we choose hash-modulus?

	# park the masked hash+5 candidate in the output buffer
	foreach my $reg (@val) { &and	($reg,$pick); }
	foreach my $i (0..3) { &mov	(&DWP(4*$i,"edi"),$val[$i]); }

	# mask the original hash value the opposite way and merge the two
	&not	($pick);			# or original hash value?
	foreach my $i (0..3) { &mov	($val[$i],&DWP(4*$i,"ebp")); }
	&mov	("ebp",&wparam(2));		# third argument, added below
	foreach my $reg (@val) { &and	($reg,$pick); }
	foreach my $i (0..3) { &or	($val[$i],&DWP(4*$i,"edi")); }

	# accumulate key
	&add	($val[0],&DWP(4*0,"ebp"));
	foreach my $i (1..3) { &adc	($val[$i],&DWP(4*$i,"ebp")); }

	# write the 16-byte result
	foreach my $i (0..3) { &mov	(&DWP(4*$i,"edi"),$val[$i]); }
}
&function_end("poly1305_emit");
3771cb0ef41Sopenharmony_ci
3781cb0ef41Sopenharmony_ciif ($sse2) {
3791cb0ef41Sopenharmony_ci########################################################################
3801cb0ef41Sopenharmony_ci# Layout of opaque area is following.
3811cb0ef41Sopenharmony_ci#
3821cb0ef41Sopenharmony_ci#	unsigned __int32 h[5];		# current hash value base 2^26
3831cb0ef41Sopenharmony_ci#	unsigned __int32 is_base2_26;
3841cb0ef41Sopenharmony_ci#	unsigned __int32 r[4];		# key value base 2^32
3851cb0ef41Sopenharmony_ci#	unsigned __int32 pad[2];
3861cb0ef41Sopenharmony_ci#	struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
3871cb0ef41Sopenharmony_ci#
3881cb0ef41Sopenharmony_ci# where r^n are base 2^26 digits of degrees of multiplier key. There are
3891cb0ef41Sopenharmony_ci# 5 digits, but last four are interleaved with multiples of 5, totalling
3901cb0ef41Sopenharmony_ci# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
3911cb0ef41Sopenharmony_ci
# Register roles for the SSE2 code: $D0-$D4 accumulate the five limbs,
# $T0-$T2 are temporaries.  $MASK shares xmm7 with $T2, so any code that
# uses $T2 as scratch has to reload the mask afterwards.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
my $MASK=$T2;	# borrow and keep in mind

########################################################################
# _poly1305_init_sse2: entered with edi pointing at the context and ebx
# at a constant area whose offset 64 supplies the limb mask (presumably
# const_sse2 -- the table itself is outside this part of the file).
# Reads the key at 4*6(edi), converts it to five limbs at bit offsets
# 0, 26, 52, 78 and 104, and then runs the "square" loop below twice.
&align	(32);
&function_begin_B("_poly1305_init_sse2");
	&movdqu		($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");			# remember caller's esp
	&sub		("esp",16*(9+5));		# 9+5 16-byte slots...
	&and		("esp",-16);			# ...aligned for movdqa

	#&pand		($D4,&QWP(96,"ebx"));		# magic mask
	&movq		($MASK,&QWP(64,"ebx"));		# 64-bit load, upper half is zero

	&movdqa		($D0,$D4);
	&movdqa		($D1,$D4);
	&movdqa		($D2,$D4);

	&pand		($D0,$MASK);			# -> base 2^26
	&psrlq		($D1,26);			# limb 1: bits 26 and up
	&psrldq		($D2,6);			# drop 6 bytes = 48 bits
	&pand		($D1,$MASK);
	&movdqa		($D3,$D2);
	# Each emitter call is a statement of its own.  Without the
	# terminator this line and the next one merge into a single
	# "call & call" expression that emits the right code only by
	# virtue of operand evaluation order.
	&psrlq		($D2,4);			# limb 2: 48+4  = bit 52
	&psrlq		($D3,30);			# limb 3: 48+30 = bit 78
	&pand		($D2,$MASK);
	&pand		($D3,$MASK);
	&psrldq		($D4,13);			# limb 4: 13 bytes = bit 104

	# esp[0..8] receives r0-r4 followed by 5*r1-5*r4, edx[0..4] the
	# copies with the low quadword replicated; ecx counts two passes
	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	&movdqa		(&QWP(16*0,"esp"),$D0);
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*2,"esp"),$D2);
	&movdqa		(&QWP(16*3,"esp"),$D3);
	&movdqa		(&QWP(16*4,"esp"),$D4);

	&movdqa		($T1,$D1);
	&movdqa		($T0,$D2);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D1);			# *5
	&paddd		($T0,$D2);			# *5
	&movdqa		(&QWP(16*5,"esp"),$T1);
	&movdqa		(&QWP(16*6,"esp"),$T0);
	&movdqa		($T1,$D3);
	&movdqa		($T0,$D4);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D3);			# *5
	&paddd		($T0,$D4);			# *5
	&movdqa		(&QWP(16*7,"esp"),$T1);
	&movdqa		(&QWP(16*8,"esp"),$T0);

	# 0b01000100 copies the low quadword into both halves; $T0 keeps
	# the unshuffled limb 1, which pmuladd consumes first
	&pshufd		($T1,$D0,0b01000100);
	&movdqa		($T0,$D1);
	&pshufd		($D1,$D1,0b01000100);
	&pshufd		($D2,$D2,0b01000100);
	&pshufd		($D3,$D3,0b01000100);
	&pshufd		($D4,$D4,0b01000100);
	&movdqa		(&QWP(16*0,"edx"),$T1);
	&movdqa		(&QWP(16*1,"edx"),$D1);
	&movdqa		(&QWP(16*2,"edx"),$D2);
	&movdqa		(&QWP(16*3,"edx"),$D3);
	&movdqa		(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq	($D4,$D0);			# h4*r0
	&pmuludq	($D3,$D0);			# h3*r0
	&pmuludq	($D2,$D0);			# h2*r0
	&pmuludq	($D1,$D0);			# h1*r0
	&pmuludq	($D0,$T1);			# h0*r0
########################################################################
# pmuladd($load[,$base])
#
# Emits the twenty multiply-accumulate steps of the 5x5 limb product that
# are left once the r0 column has been started by the caller, adding the
# partial products into $D0-$D4.
#
#	$load	code reference invoked as &$load($reg,$i); the calls below
#		pass $i = 5,2,6,3,7,4,8 and, going by their comments, expect
#		s1,r2,s2,r3,s3,r4,s4 to be loaded into $reg;
#	$base	register addressing the five 16-byte slots that are
#		multiplied against (annotated h0-h4 below), "esp" when
#		omitted.
#
# $T0 is consumed as r1 on entry (see the first movdqa/pmuludq pair) and
# $T0-$T2 are clobbered.  Because $MASK is an alias of $T2, the mask is
# reloaded from QWP(64,"ebx") before the last two additions.
sub pmuladd {
my $load = shift;
my $base = shift; $base = "esp" if (!defined($base));

	################################################################
	# As for choice to "rotate" $T0-$T2 in order to move paddq
	# past next multiplication. While it makes code harder to read
	# and doesn't have significant effect on most processors, it
	# makes a lot of difference on Atom, up to 30% improvement.

	# r1 column, finished with s1*h4
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*3,$base));		# r1*h3
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*2,$base));		# r1*h2
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*1,$base));		# r1*h1
	&paddq		($D3,$T1);
	&$load		($T1,5);			# s1
	&pmuludq	($T0,&QWP(16*0,$base));		# r1*h0
	&paddq		($D2,$T2);
	&pmuludq	($T1,&QWP(16*4,$base));		# s1*h4
	 &$load		($T2,2);			# r2^n
	&paddq		($D1,$T0);

	# r2 column, finished with s2*h4 and s2*h3
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*2,$base));		# r2*h2
	 &paddq		($D0,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*1,$base));		# r2*h1
	&paddq		($D4,$T2);
	&$load		($T2,6);			# s2^n
	&pmuludq	($T1,&QWP(16*0,$base));		# r2*h0
	&paddq		($D3,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*4,$base));		# s2*h4
	&paddq		($D2,$T1);
	&pmuludq	($T0,&QWP(16*3,$base));		# s2*h3
	 &$load		($T1,3);			# r3^n
	&paddq		($D1,$T2);

	# r3 column, finished with s3*h4, s3*h3 and s3*h2
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*1,$base));		# r3*h1
	 &paddq		($D0,$T0);
	&$load		($T0,7);			# s3^n
	&pmuludq	($T2,&QWP(16*0,$base));		# r3*h0
	&paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*4,$base));		# s3*h4
	&paddq		($D3,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*3,$base));		# s3*h3
	&paddq		($D2,$T0);
	&pmuludq	($T2,&QWP(16*2,$base));		# s3*h2
	 &$load		($T0,4);			# r4^n
	&paddq		($D1,$T1);

	# r4 column, finished with s4*h4 ... s4*h1
	&$load		($T1,8);			# s4^n
	&pmuludq	($T0,&QWP(16*0,$base));		# r4*h0
	 &paddq		($D0,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*4,$base));		# s4*h4
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*1,$base));		# s4*h1
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*2,$base));		# s4*h2
	&paddq		($D0,$T2);
	&pmuludq	($T1,&QWP(16*3,$base));		# s4*h3
	 &movdqa	($MASK,&QWP(64,"ebx"));		# $T2 is free again, restore mask
	&paddq		($D1,$T0);
	&paddq		($D2,$T1);
}
	# First use: the five slots multiplied against are addressed
	# through edx, while the loader fetches table elements from the
	# esp-based area with aligned loads.
	&pmuladd	(sub {	my ($reg,$i)=@_;
				&movdqa ($reg,&QWP(16*$i,"esp"));
			     },"edx");
5491cb0ef41Sopenharmony_ci
########################################################################
# lazy_reduction([$extra])
#
# Emits the carry propagation that brings the limbs in $D0-$D4 back under
# $MASK after a multiplication.  Two carry chains are interleaved (the
# extra indentation marks the one starting at h3):
#
#	h3 -> h4 -> h0 -> h1		and	h0 -> h1 -> h2 -> h3 -> h4
#
# The carry out of h4 is added to h0 five times over (once directly, and
# once more after being shifted left by 2).
#
#	$extra	optional code reference, called without arguments right
#		after the third instruction so that the caller can slip
#		its own code into the stream at that point.
#
# $T0 and $T1 are clobbered, $MASK has to be valid on entry.
sub lazy_reduction {
my $extra = shift;

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
	#
	# [(*) see discussion in poly1305-armv4 module]

	 &movdqa	($T0,$D3);
	 &pand		($D3,$MASK);
	 &psrlq		($T0,26);
	 &$extra	()				if (defined($extra));
	 &paddq		($T0,$D4);			# h3 -> h4
	&movdqa		($T1,$D0);
	&pand		($D0,$MASK);
	&psrlq		($T1,26);
	 &movdqa	($D4,$T0);
	&paddq		($T1,$D1);			# h0 -> h1
	 &psrlq		($T0,26);
	 &pand		($D4,$MASK);
	&movdqa		($D1,$T1);
	&psrlq		($T1,26);
	 &paddd		($D0,$T0);			# favour paddd when
							# possible, because
							# paddq is "broken"
							# on Atom
	 &psllq		($T0,2);			# carry*4, plus the one above = *5
	&paddq		($T1,$D2);			# h1 -> h2
	 &paddq		($T0,$D0);			# h4 -> h0 (*)
	&pand		($D1,$MASK);
	&movdqa		($D2,$T1);
	&psrlq		($T1,26);
	&pand		($D2,$MASK);
	&paddd		($T1,$D3);			# h2 -> h3
	 &movdqa	($D0,$T0);
	 &psrlq		($T0,26);
	&movdqa		($D3,$T1);
	&psrlq		($T1,26);
	 &pand		($D0,$MASK);
	 &paddd		($D1,$T0);			# h0 -> h1
	&pand		($D3,$MASK);
	&paddd		($D4,$T1);			# h3 -> h4
}
	# reduce the product computed above, no extra code interleaved
	&lazy_reduction	();
5951cb0ef41Sopenharmony_ci
5961cb0ef41Sopenharmony_ci	&dec		("ecx");
5971cb0ef41Sopenharmony_ci	&jz		(&label("square_break"));
5981cb0ef41Sopenharmony_ci
5991cb0ef41Sopenharmony_ci	&punpcklqdq	($D0,&QWP(16*0,"esp"));		# 0:r^1:0:r^2
6001cb0ef41Sopenharmony_ci	&punpcklqdq	($D1,&QWP(16*1,"esp"));
6011cb0ef41Sopenharmony_ci	&punpcklqdq	($D2,&QWP(16*2,"esp"));
6021cb0ef41Sopenharmony_ci	&punpcklqdq	($D3,&QWP(16*3,"esp"));
6031cb0ef41Sopenharmony_ci	&punpcklqdq	($D4,&QWP(16*4,"esp"));
6041cb0ef41Sopenharmony_ci	&jmp		(&label("square"));
6051cb0ef41Sopenharmony_ci
6061cb0ef41Sopenharmony_ci&set_label("square_break");
6071cb0ef41Sopenharmony_ci	&psllq		($D0,32);			# -> r^3:0:r^4:0
6081cb0ef41Sopenharmony_ci	&psllq		($D1,32);
6091cb0ef41Sopenharmony_ci	&psllq		($D2,32);
6101cb0ef41Sopenharmony_ci	&psllq		($D3,32);
6111cb0ef41Sopenharmony_ci	&psllq		($D4,32);
6121cb0ef41Sopenharmony_ci	&por		($D0,&QWP(16*0,"esp"));		# r^3:r^1:r^4:r^2
6131cb0ef41Sopenharmony_ci	&por		($D1,&QWP(16*1,"esp"));
6141cb0ef41Sopenharmony_ci	&por		($D2,&QWP(16*2,"esp"));
6151cb0ef41Sopenharmony_ci	&por		($D3,&QWP(16*3,"esp"));
6161cb0ef41Sopenharmony_ci	&por		($D4,&QWP(16*4,"esp"));
6171cb0ef41Sopenharmony_ci
6181cb0ef41Sopenharmony_ci	&pshufd		($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
6191cb0ef41Sopenharmony_ci	&pshufd		($D1,$D1,0b10001101);
6201cb0ef41Sopenharmony_ci	&pshufd		($D2,$D2,0b10001101);
6211cb0ef41Sopenharmony_ci	&pshufd		($D3,$D3,0b10001101);
6221cb0ef41Sopenharmony_ci	&pshufd		($D4,$D4,0b10001101);
6231cb0ef41Sopenharmony_ci
6241cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*0,"edi"),$D0);		# save the table
6251cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*1,"edi"),$D1);
6261cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*2,"edi"),$D2);
6271cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*3,"edi"),$D3);
6281cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*4,"edi"),$D4);
6291cb0ef41Sopenharmony_ci
6301cb0ef41Sopenharmony_ci	&movdqa		($T1,$D1);
6311cb0ef41Sopenharmony_ci	&movdqa		($T0,$D2);
6321cb0ef41Sopenharmony_ci	&pslld		($T1,2);
6331cb0ef41Sopenharmony_ci	&pslld		($T0,2);
6341cb0ef41Sopenharmony_ci	&paddd		($T1,$D1);			# *5
6351cb0ef41Sopenharmony_ci	&paddd		($T0,$D2);			# *5
6361cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*5,"edi"),$T1);
6371cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*6,"edi"),$T0);
6381cb0ef41Sopenharmony_ci	&movdqa		($T1,$D3);
6391cb0ef41Sopenharmony_ci	&movdqa		($T0,$D4);
6401cb0ef41Sopenharmony_ci	&pslld		($T1,2);
6411cb0ef41Sopenharmony_ci	&pslld		($T0,2);
6421cb0ef41Sopenharmony_ci	&paddd		($T1,$D3);			# *5
6431cb0ef41Sopenharmony_ci	&paddd		($T0,$D4);			# *5
6441cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*7,"edi"),$T1);
6451cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*8,"edi"),$T0);
6461cb0ef41Sopenharmony_ci
6471cb0ef41Sopenharmony_ci	&mov		("esp","ebp");
6481cb0ef41Sopenharmony_ci	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
6491cb0ef41Sopenharmony_ci	&ret		();
6501cb0ef41Sopenharmony_ci&function_end_B("_poly1305_init_sse2");
6511cb0ef41Sopenharmony_ci
# _poly1305_blocks_sse2(ctx, inp, len, padbit): SSE2 path for absorbing whole
# 16-byte blocks into the hash state.  Register roles in the emitted code:
#   edi = ctx, esi = inp, ecx = remaining length, ebx = const_sse2 table,
#   ebp = caller's esp while the 16-byte-aligned scratch frame is in use.
6521cb0ef41Sopenharmony_ci&align	(32);
6531cb0ef41Sopenharmony_ci&function_begin("_poly1305_blocks_sse2");
6541cb0ef41Sopenharmony_ci	&mov	("edi",&wparam(0));			# ctx
6551cb0ef41Sopenharmony_ci	&mov	("esi",&wparam(1));			# inp
6561cb0ef41Sopenharmony_ci	&mov	("ecx",&wparam(2));			# len
6571cb0ef41Sopenharmony_ci
6581cb0ef41Sopenharmony_ci	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
	# Round len down to whole blocks; if nothing is left, leave via "nodata".
6591cb0ef41Sopenharmony_ci	&and	("ecx",-16);
6601cb0ef41Sopenharmony_ci	&jz	(&label("nodata"));
	# Dispatch: 64 bytes or more always take the SSE2 path.  Shorter input
	# takes it only when the hash is already in base 2^26; otherwise control
	# goes to "enter_blocks", a label not defined in this function
	# (NOTE(review): presumably the integer-only block routine -- confirm).
6611cb0ef41Sopenharmony_ci	&cmp	("ecx",64);
6621cb0ef41Sopenharmony_ci	&jae	(&label("enter_sse2"));
6631cb0ef41Sopenharmony_ci	&test	("eax","eax");				# is_base2_26?
6641cb0ef41Sopenharmony_ci	&jz	(&label("enter_blocks"));
6651cb0ef41Sopenharmony_ci
6661cb0ef41Sopenharmony_ci&set_label("enter_sse2",16);
	# Position-independent addressing: call/pop yields the run-time address
	# of pic_point in ebx, and the lea rebases it onto const_sse2.
6671cb0ef41Sopenharmony_ci	&call	(&label("pic_point"));
6681cb0ef41Sopenharmony_ci&set_label("pic_point");
6691cb0ef41Sopenharmony_ci	&blindpop("ebx");
6701cb0ef41Sopenharmony_ci	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
6711cb0ef41Sopenharmony_ci
6721cb0ef41Sopenharmony_ci	&test	("eax","eax");				# is_base2_26?
6731cb0ef41Sopenharmony_ci	&jnz	(&label("base2_26"));
6741cb0ef41Sopenharmony_ci
	# First SSE2 use for this context: run the one-time initialisation.
	# NOTE(review): this path never loads $MASK itself (the base2_26 path
	# below does), so it appears to rely on _poly1305_init_sse2 leaving
	# $MASK set -- confirm against that routine.
6751cb0ef41Sopenharmony_ci	&call	("_poly1305_init_sse2");
6761cb0ef41Sopenharmony_ci
6771cb0ef41Sopenharmony_ci	################################################# base 2^32 -> base 2^26
	# Overlapping unaligned dword loads at byte offsets 0,3,6,9,13, combined
	# with right shifts of 0,2,4,6,0 bits, pick up the hash at bit positions
	# 0,26,52,78,104, i.e. the five 26-bit limbs.
6781cb0ef41Sopenharmony_ci	&mov	("eax",&DWP(0,"edi"));
6791cb0ef41Sopenharmony_ci	&mov	("ecx",&DWP(3,"edi"));
6801cb0ef41Sopenharmony_ci	&mov	("edx",&DWP(6,"edi"));
6811cb0ef41Sopenharmony_ci	&mov	("esi",&DWP(9,"edi"));
6821cb0ef41Sopenharmony_ci	&mov	("ebp",&DWP(13,"edi"));
6831cb0ef41Sopenharmony_ci	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
6841cb0ef41Sopenharmony_ci
	# esi needs no mask: after the 6-bit shift only 26 bits remain.  ebp is
	# the top limb and is taken as loaded.
6851cb0ef41Sopenharmony_ci	&shr	("ecx",2);
6861cb0ef41Sopenharmony_ci	&and	("eax",0x3ffffff);
6871cb0ef41Sopenharmony_ci	&shr	("edx",4);
6881cb0ef41Sopenharmony_ci	&and	("ecx",0x3ffffff);
6891cb0ef41Sopenharmony_ci	&shr	("esi",6);
6901cb0ef41Sopenharmony_ci	&and	("edx",0x3ffffff);
6911cb0ef41Sopenharmony_ci
6921cb0ef41Sopenharmony_ci	&movd	($D0,"eax");
6931cb0ef41Sopenharmony_ci	&movd	($D1,"ecx");
6941cb0ef41Sopenharmony_ci	&movd	($D2,"edx");
6951cb0ef41Sopenharmony_ci	&movd	($D3,"esi");
6961cb0ef41Sopenharmony_ci	&movd	($D4,"ebp");
6971cb0ef41Sopenharmony_ci
	# esi and ecx were used as scratch above, so fetch the arguments again.
6981cb0ef41Sopenharmony_ci	&mov	("esi",&wparam(1));			# [reload] inp
6991cb0ef41Sopenharmony_ci	&mov	("ecx",&wparam(2));			# [reload] len
7001cb0ef41Sopenharmony_ci	&jmp	(&label("base2_32"));
7011cb0ef41Sopenharmony_ci
7021cb0ef41Sopenharmony_ci&set_label("base2_26",16);
	# Hash is already stored as five 32-bit words holding 26-bit limbs.
7031cb0ef41Sopenharmony_ci	&movd	($D0,&DWP(4*0,"edi"));			# load hash value
7041cb0ef41Sopenharmony_ci	&movd	($D1,&DWP(4*1,"edi"));
7051cb0ef41Sopenharmony_ci	&movd	($D2,&DWP(4*2,"edi"));
7061cb0ef41Sopenharmony_ci	&movd	($D3,&DWP(4*3,"edi"));
7071cb0ef41Sopenharmony_ci	&movd	($D4,&DWP(4*4,"edi"));
7081cb0ef41Sopenharmony_ci	&movdqa	($MASK,&QWP(64,"ebx"));
7091cb0ef41Sopenharmony_ci
7101cb0ef41Sopenharmony_ci&set_label("base2_32");
	# Both paths join here with $D0..$D4 = h0..h4.  Keep the caller's esp in
	# ebp and carve out a 16-byte-aligned frame so movdqa can be used on it.
7111cb0ef41Sopenharmony_ci	&mov	("eax",&wparam(3));			# padbit
7121cb0ef41Sopenharmony_ci	&mov	("ebp","esp");
7131cb0ef41Sopenharmony_ci
7141cb0ef41Sopenharmony_ci	&sub	("esp",16*(5+5+5+9+9));
7151cb0ef41Sopenharmony_ci	&and	("esp",-16);
7161cb0ef41Sopenharmony_ci
	# edi is biased by 16*3 from here on; the final hash store compensates
	# with -16*3 displacements.
7171cb0ef41Sopenharmony_ci	&lea	("edi",&DWP(16*3,"edi"));		# size optimization
	# Bit 128 of a block falls at bit 24 of the top limb (128 - 4*26).
7181cb0ef41Sopenharmony_ci	&shl	("eax",24);				# padbit
7191cb0ef41Sopenharmony_ci
	# len is a multiple of 16, so (len & 31) != 0 means an odd block count:
	# fall through to consume one block first, otherwise skip to "even".
7201cb0ef41Sopenharmony_ci	&test	("ecx",31);
7211cb0ef41Sopenharmony_ci	&jz	(&label("even"));
7221cb0ef41Sopenharmony_ci
7231cb0ef41Sopenharmony_ci	################################################################
7241cb0ef41Sopenharmony_ci	# process single block, with SSE2, because it's still faster
7251cb0ef41Sopenharmony_ci	# even though half of result is discarded
	#
	# The 16 input bytes are split into five 26-bit limbs and added to
	# $D0..$D4 (h0..h4).  $T0/$T1 alternate as scratch:
	#   limb 0: bits   0.. 25  (mask only)
	#   limb 1: bits  26.. 51  (psrlq 26)
	#   limb 2: bits  52.. 77  (psrldq 6 bytes = 48 bits, then psrlq 4)
	#   limb 3: bits  78..103  (same 48-bit shifted copy, psrlq 30)
	#   limb 4: bits 104..127  (further psrldq 7 bytes, 13 bytes in total)
7261cb0ef41Sopenharmony_ci
7271cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP(0,"esi"));		# input
7281cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16,"esi"));
7291cb0ef41Sopenharmony_ci
7301cb0ef41Sopenharmony_ci	&movdqa		($T0,$T1);			# -> base 2^26 ...
7311cb0ef41Sopenharmony_ci	&pand		($T1,$MASK);
7321cb0ef41Sopenharmony_ci	&paddd		($D0,$T1);			# ... and accumulate
7331cb0ef41Sopenharmony_ci
7341cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
7351cb0ef41Sopenharmony_ci	&psrlq		($T0,26);
7361cb0ef41Sopenharmony_ci	&psrldq		($T1,6);
7371cb0ef41Sopenharmony_ci	&pand		($T0,$MASK);
7381cb0ef41Sopenharmony_ci	&paddd		($D1,$T0);
7391cb0ef41Sopenharmony_ci
7401cb0ef41Sopenharmony_ci	&movdqa		($T0,$T1);
7411cb0ef41Sopenharmony_ci	&psrlq		($T1,4);
7421cb0ef41Sopenharmony_ci	&pand		($T1,$MASK);
7431cb0ef41Sopenharmony_ci	&paddd		($D2,$T1);
7441cb0ef41Sopenharmony_ci
7451cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
7461cb0ef41Sopenharmony_ci	&psrlq		($T0,30);
7471cb0ef41Sopenharmony_ci	&pand		($T0,$MASK);
7481cb0ef41Sopenharmony_ci	&psrldq		($T1,7);
7491cb0ef41Sopenharmony_ci	&paddd		($D3,$T0);
7501cb0ef41Sopenharmony_ci
	# eax holds padbit<<24 here, i.e. already positioned within limb 4.
7511cb0ef41Sopenharmony_ci	&movd		($T0,"eax");			# padbit
7521cb0ef41Sopenharmony_ci	&paddd		($D4,$T1);
7531cb0ef41Sopenharmony_ci	 &movd		($T1,&DWP(16*0+12,"edi"));	# r0
7541cb0ef41Sopenharmony_ci	&paddd		($D4,$T0);
7551cb0ef41Sopenharmony_ci
	# Keep the un-multiplied h0..h4 on the stack; the r0 products below
	# overwrite $D0..$D4 in place.
7561cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$D0);
7571cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$D1);
7581cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$D2);
7591cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$D3);
7601cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$D4);
7611cb0ef41Sopenharmony_ci
7621cb0ef41Sopenharmony_ci	################################################################
7631cb0ef41Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
7641cb0ef41Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
7651cb0ef41Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
7661cb0ef41Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
7671cb0ef41Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
7681cb0ef41Sopenharmony_ci
7691cb0ef41Sopenharmony_ci	&pmuludq	($D0,$T1);			# h0*r0
7701cb0ef41Sopenharmony_ci	&pmuludq	($D1,$T1);			# h1*r0
7711cb0ef41Sopenharmony_ci	&pmuludq	($D2,$T1);			# h2*r0
7721cb0ef41Sopenharmony_ci	 &movd		($T0,&DWP(16*1+12,"edi"));	# r1
7731cb0ef41Sopenharmony_ci	&pmuludq	($D3,$T1);			# h3*r0
7741cb0ef41Sopenharmony_ci	&pmuludq	($D4,$T1);			# h4*r0
7751cb0ef41Sopenharmony_ci
	# pmuladd is defined outside this chunk.  The callback tells it how to
	# fetch table row $i into a register: here a single dword at byte 12 of
	# the row, read straight from the table reachable through edi.
7761cb0ef41Sopenharmony_ci	&pmuladd	(sub {	my ($reg,$i)=@_;
7771cb0ef41Sopenharmony_ci				&movd ($reg,&DWP(16*$i+12,"edi"));
7781cb0ef41Sopenharmony_ci			     });
7791cb0ef41Sopenharmony_ci
7801cb0ef41Sopenharmony_ci	&lazy_reduction	();
7811cb0ef41Sopenharmony_ci
	# One block consumed; if that was all the input, go store the hash.
7821cb0ef41Sopenharmony_ci	&sub		("ecx",16);
7831cb0ef41Sopenharmony_ci	&jz		(&label("done"));
7841cb0ef41Sopenharmony_ci
7851cb0ef41Sopenharmony_ci&set_label("even");
	# edx addresses the upper of two 9-row tables in the stack frame: rows
	# 16*i(edx) get the low qword of source row i duplicated, rows
	# 16*(i-9)(edx) get the high qword duplicated.
	#
	# "sub ecx,64" sets the flags that cmovb below and the jbe following the
	# first load_input call consume; every instruction in between (SSE2
	# moves/shuffles, lea) leaves EFLAGS alone.  If fewer than 64 bytes
	# remained, cmovb backs esi up by 32 so that the first load_input, which
	# reads at esi+32 and esi+48, picks up the first two blocks instead.
7861cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
7871cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(-16*2,"esi"));
7881cb0ef41Sopenharmony_ci	&sub		("ecx",64);
7891cb0ef41Sopenharmony_ci
7901cb0ef41Sopenharmony_ci	################################################################
7911cb0ef41Sopenharmony_ci	# expand and copy pre-calculated table to stack
	#
	# pshufd 0b01000100 replicates source dwords 0,1 into both qwords;
	# 0b11101110 replicates dwords 2,3.  $T0/$T1 alternate so the load of
	# the next row overlaps the stores of the current one.
7921cb0ef41Sopenharmony_ci
7931cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
7941cb0ef41Sopenharmony_ci	&pshufd		($T1,$T0,0b01000100);		# duplicate r^3:r^4
7951cb0ef41Sopenharmony_ci	&cmovb		("esi","eax");
7961cb0ef41Sopenharmony_ci	&pshufd		($T0,$T0,0b11101110);		# duplicate r^1:r^2
7971cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"edx"),$T1);
	# eax is free again: point it at the 5-row scratch area at 16*10(esp).
7981cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(16*10,"esp"));
7991cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP(16*1,"edi"));
8001cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(0-9),"edx"),$T0);
8011cb0ef41Sopenharmony_ci	&pshufd		($T0,$T1,0b01000100);
8021cb0ef41Sopenharmony_ci	&pshufd		($T1,$T1,0b11101110);
8031cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"edx"),$T0);
8041cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP(16*2,"edi"));
8051cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(1-9),"edx"),$T1);
8061cb0ef41Sopenharmony_ci	&pshufd		($T1,$T0,0b01000100);
8071cb0ef41Sopenharmony_ci	&pshufd		($T0,$T0,0b11101110);
8081cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"edx"),$T1);
8091cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP(16*3,"edi"));
8101cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(2-9),"edx"),$T0);
8111cb0ef41Sopenharmony_ci	&pshufd		($T0,$T1,0b01000100);
8121cb0ef41Sopenharmony_ci	&pshufd		($T1,$T1,0b11101110);
8131cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"edx"),$T0);
8141cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP(16*4,"edi"));
8151cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(3-9),"edx"),$T1);
8161cb0ef41Sopenharmony_ci	&pshufd		($T1,$T0,0b01000100);
8171cb0ef41Sopenharmony_ci	&pshufd		($T0,$T0,0b11101110);
8181cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"edx"),$T1);
8191cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP(16*5,"edi"));
8201cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(4-9),"edx"),$T0);
8211cb0ef41Sopenharmony_ci	&pshufd		($T0,$T1,0b01000100);
8221cb0ef41Sopenharmony_ci	&pshufd		($T1,$T1,0b11101110);
8231cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*5,"edx"),$T0);
8241cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP(16*6,"edi"));
8251cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(5-9),"edx"),$T1);
8261cb0ef41Sopenharmony_ci	&pshufd		($T1,$T0,0b01000100);
8271cb0ef41Sopenharmony_ci	&pshufd		($T0,$T0,0b11101110);
8281cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*6,"edx"),$T1);
8291cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP(16*7,"edi"));
8301cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(6-9),"edx"),$T0);
8311cb0ef41Sopenharmony_ci	&pshufd		($T0,$T1,0b01000100);
8321cb0ef41Sopenharmony_ci	&pshufd		($T1,$T1,0b11101110);
8331cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*7,"edx"),$T0);
8341cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP(16*8,"edi"));
8351cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(7-9),"edx"),$T1);
8361cb0ef41Sopenharmony_ci	&pshufd		($T1,$T0,0b01000100);
8371cb0ef41Sopenharmony_ci	&pshufd		($T0,$T0,0b11101110);
8381cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*8,"edx"),$T1);
8391cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*(8-9),"edx"),$T0);
8401cb0ef41Sopenharmony_ci
# load_input($inpbase,$offbase): emit code that loads two 16-byte blocks and
# splits them, side by side, into five 26-bit limbs.
#   $inpbase  byte displacement from esi of the first block (the second is
#             read at $inpbase+16); esi is advanced by 32 after the loads.
#   $offbase  byte displacement from esp of a 5-row spill area.  The previous
#             $D2..$D4 are always saved to rows 2..4 of it; $D0/$D1 are saved
#             to rows 0..1 only when $offbase is non-zero.
# Result: $T0 = limb 0, $T1 = limb 1, $D2 = limb 2, $D3 = limb 3, $D4 = limb 4
# with the constant at 0(ebx) ORed in (the pad bit, per the comment below).
# Only SSE2 instructions and lea are emitted, so EFLAGS set by the caller
# before the call are still intact afterwards; call sites branch on them.
8411cb0ef41Sopenharmony_cisub load_input {
8421cb0ef41Sopenharmony_cimy ($inpbase,$offbase)=@_;
8431cb0ef41Sopenharmony_ci
8441cb0ef41Sopenharmony_ci	&movdqu		($T0,&QWP($inpbase+0,"esi"));	# load input
8451cb0ef41Sopenharmony_ci	&movdqu		($T1,&QWP($inpbase+16,"esi"));
8461cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*2,"esi"));
8471cb0ef41Sopenharmony_ci
	# $D2..$D4 are about to be reused for the new limbs: spill them first.
8481cb0ef41Sopenharmony_ci	&movdqa		(&QWP($offbase+16*2,"esp"),$D2);
8491cb0ef41Sopenharmony_ci	&movdqa		(&QWP($offbase+16*3,"esp"),$D3);
8501cb0ef41Sopenharmony_ci	&movdqa		(&QWP($offbase+16*4,"esp"),$D4);
8511cb0ef41Sopenharmony_ci
	# psrldq 6 drops 48 bits, so the low qword of each copy starts at input
	# bit 48; the punpck* instructions then pair up the two blocks so each
	# 64-bit lane carries the same bit range of one block.
8521cb0ef41Sopenharmony_ci	&movdqa		($D2,$T0);			# splat input
8531cb0ef41Sopenharmony_ci	&movdqa		($D3,$T1);
8541cb0ef41Sopenharmony_ci	&psrldq		($D2,6);
8551cb0ef41Sopenharmony_ci	&psrldq		($D3,6);
8561cb0ef41Sopenharmony_ci	&movdqa		($D4,$T0);
8571cb0ef41Sopenharmony_ci	&punpcklqdq	($D2,$D3);			# 2:3
8581cb0ef41Sopenharmony_ci	&punpckhqdq	($D4,$T1);			# 4
8591cb0ef41Sopenharmony_ci	&punpcklqdq	($T0,$T1);			# 0:1
8601cb0ef41Sopenharmony_ci
	# Per-lane shifts select the limb start: 48+4 = bit 52 (limb 2),
	# 48+30 = bit 78 (limb 3), 64+40 = bit 104 (limb 4), 26 (limb 1).
8611cb0ef41Sopenharmony_ci	&movdqa		($D3,$D2);
8621cb0ef41Sopenharmony_ci	&psrlq		($D2,4);
8631cb0ef41Sopenharmony_ci	&psrlq		($D3,30);
8641cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
8651cb0ef41Sopenharmony_ci	&psrlq		($D4,40);			# 4
8661cb0ef41Sopenharmony_ci	&psrlq		($T1,26);
8671cb0ef41Sopenharmony_ci	&pand		($T0,$MASK);			# 0
8681cb0ef41Sopenharmony_ci	&pand		($T1,$MASK);			# 1
8691cb0ef41Sopenharmony_ci	&pand		($D2,$MASK);			# 2
8701cb0ef41Sopenharmony_ci	&pand		($D3,$MASK);			# 3
8711cb0ef41Sopenharmony_ci	&por		($D4,&QWP(0,"ebx"));		# padbit, yes, always
8721cb0ef41Sopenharmony_ci
	# With $offbase == 0 the caller is responsible for $D0/$D1 itself.
8731cb0ef41Sopenharmony_ci	&movdqa		(&QWP($offbase+16*0,"esp"),$D0)	if ($offbase);
8741cb0ef41Sopenharmony_ci	&movdqa		(&QWP($offbase+16*1,"esp"),$D1)	if ($offbase);
8751cb0ef41Sopenharmony_ci}
	# Load the first pair of blocks, spilling the current hash $D0..$D4 to
	# 16*5..16*9(esp).
8761cb0ef41Sopenharmony_ci	&load_input	(16*2,16*5);
8771cb0ef41Sopenharmony_ci
	# Flags still come from the "sub ecx,64" at label "even": 64 bytes or
	# fewer were left, so there is no full loop iteration to run.
8781cb0ef41Sopenharmony_ci	&jbe		(&label("skip_loop"));
	# NOTE(review): "loop" is 32-byte aligned; this jmp presumably just
	# skips the alignment padding -- confirm.
8791cb0ef41Sopenharmony_ci	&jmp		(&label("loop"));
8801cb0ef41Sopenharmony_ci
8811cb0ef41Sopenharmony_ci&set_label("loop",32);
8821cb0ef41Sopenharmony_ci	################################################################
8831cb0ef41Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
8841cb0ef41Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
8851cb0ef41Sopenharmony_ci	#   \___________________/
8861cb0ef41Sopenharmony_ci	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
8871cb0ef41Sopenharmony_ci	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
8881cb0ef41Sopenharmony_ci	#   \___________________/ \____________________/
8891cb0ef41Sopenharmony_ci	################################################################
8901cb0ef41Sopenharmony_ci
	# Park limbs 1..4 of the freshly loaded pair in the scratch area at eax;
	# pmuladd_alt pulls them back one at a time.  Limb 0 stays in $T0.
8911cb0ef41Sopenharmony_ci	&movdqa		($T2,&QWP(16*(0-9),"edx"));	# r0^2
8921cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"eax"),$T1);
8931cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"eax"),$D2);
8941cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"eax"),$D3);
8951cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"eax"),$D4);
8961cb0ef41Sopenharmony_ci
8971cb0ef41Sopenharmony_ci	################################################################
8981cb0ef41Sopenharmony_ci	# d4 = h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
8991cb0ef41Sopenharmony_ci	# d3 = h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
9001cb0ef41Sopenharmony_ci	# d2 = h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
9011cb0ef41Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
9021cb0ef41Sopenharmony_ci	# d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
9031cb0ef41Sopenharmony_ci
	# Set up the register state pmuladd_alt starts from: $D1 = copy of h0,
	# $D0 = copy of h1, $T0/$T1/$D2..$D4 = h0..h4 times r0.
9041cb0ef41Sopenharmony_ci	&movdqa		($D1,$T0);
9051cb0ef41Sopenharmony_ci	&pmuludq	($T0,$T2);			# h0*r0
9061cb0ef41Sopenharmony_ci	&movdqa		($D0,$T1);
9071cb0ef41Sopenharmony_ci	&pmuludq	($T1,$T2);			# h1*r0
9081cb0ef41Sopenharmony_ci	&pmuludq	($D2,$T2);			# h2*r0
9091cb0ef41Sopenharmony_ci	&pmuludq	($D3,$T2);			# h3*r0
9101cb0ef41Sopenharmony_ci	&pmuludq	($D4,$T2);			# h4*r0
9111cb0ef41Sopenharmony_ci
# pmuladd_alt($addr): emit the 20 multiply-accumulate steps that complete the
# 5x5 limb product once the five h*r0 terms have been formed.
#   $addr  code ref mapping a table row index to a memory operand.  Rows 1..4
#          supply r1..r4 and rows 5..8 supply s1..s4 (the "5*r" factors in
#          the d0..d4 formulas).
# On entry: $T0 = h0*r0, $T1 = h1*r0, $D2..$D4 hold the h2..h4 products (plus
# anything the caller already accumulated there), $D1 = h0, $D0 = h1, and
# h1..h4 are stored at 16*1..16*4(eax).
# On exit: $D0..$D4 = d0..d4.  $T0..$T2 are clobbered and $MASK is reloaded
# from 64(ebx).  $T0..$T2 are rotated so each paddq is issued after the next
# multiplication has started.
9121cb0ef41Sopenharmony_cisub pmuladd_alt {
9131cb0ef41Sopenharmony_cimy $addr = shift;
9141cb0ef41Sopenharmony_ci
9151cb0ef41Sopenharmony_ci	&pmuludq	($D0,&$addr(8));		# h1*s4
9161cb0ef41Sopenharmony_ci	&movdqa		($T2,$D1);
9171cb0ef41Sopenharmony_ci	&pmuludq	($D1,&$addr(1));		# h0*r1
9181cb0ef41Sopenharmony_ci	&paddq		($D0,$T0);
9191cb0ef41Sopenharmony_ci	&movdqa		($T0,$T2);
9201cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(2));		# h0*r2
9211cb0ef41Sopenharmony_ci	&paddq		($D1,$T1);
9221cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
9231cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(3));		# h0*r3
9241cb0ef41Sopenharmony_ci	&paddq		($D2,$T2);
9251cb0ef41Sopenharmony_ci	 &movdqa	($T2,&QWP(16*1,"eax"));		# pull h1
9261cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(4));		# h0*r4
9271cb0ef41Sopenharmony_ci	&paddq		($D3,$T0);
9281cb0ef41Sopenharmony_ci
9291cb0ef41Sopenharmony_ci	&movdqa		($T0,$T2);
9301cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(1));		# h1*r1
9311cb0ef41Sopenharmony_ci	 &paddq		($D4,$T1);
9321cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
9331cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(2));		# h1*r2
9341cb0ef41Sopenharmony_ci	&paddq		($D2,$T2);
9351cb0ef41Sopenharmony_ci	&movdqa		($T2,&QWP(16*2,"eax"));		# pull h2
9361cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(3));		# h1*r3
9371cb0ef41Sopenharmony_ci	&paddq		($D3,$T0);
9381cb0ef41Sopenharmony_ci	&movdqa		($T0,$T2);
9391cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(7));		# h2*s3
9401cb0ef41Sopenharmony_ci	&paddq		($D4,$T1);
9411cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
9421cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(8));		# h2*s4
9431cb0ef41Sopenharmony_ci	&paddq		($D0,$T2);
9441cb0ef41Sopenharmony_ci
9451cb0ef41Sopenharmony_ci	&movdqa		($T2,$T1);
9461cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(1));		# h2*r1
9471cb0ef41Sopenharmony_ci	 &paddq		($D1,$T0);
9481cb0ef41Sopenharmony_ci	&movdqa		($T0,&QWP(16*3,"eax"));		# pull h3
9491cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(2));		# h2*r2
9501cb0ef41Sopenharmony_ci	&paddq		($D3,$T1);
9511cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
9521cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(6));		# h3*s2
9531cb0ef41Sopenharmony_ci	&paddq		($D4,$T2);
9541cb0ef41Sopenharmony_ci	&movdqa		($T2,$T1);
9551cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(7));		# h3*s3
9561cb0ef41Sopenharmony_ci	&paddq		($D0,$T0);
9571cb0ef41Sopenharmony_ci	&movdqa		($T0,$T2);
9581cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(8));		# h3*s4
9591cb0ef41Sopenharmony_ci	&paddq		($D1,$T1);
9601cb0ef41Sopenharmony_ci
9611cb0ef41Sopenharmony_ci	&movdqa		($T1,&QWP(16*4,"eax"));		# pull h4
9621cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(1));		# h3*r1
9631cb0ef41Sopenharmony_ci	 &paddq		($D2,$T2);
9641cb0ef41Sopenharmony_ci	&movdqa		($T2,$T1);
9651cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(8));		# h4*s4
9661cb0ef41Sopenharmony_ci	&paddq		($D4,$T0);
9671cb0ef41Sopenharmony_ci	&movdqa		($T0,$T2);
9681cb0ef41Sopenharmony_ci	&pmuludq	($T2,&$addr(5));		# h4*s1
9691cb0ef41Sopenharmony_ci	&paddq		($D3,$T1);
9701cb0ef41Sopenharmony_ci	&movdqa		($T1,$T0);
9711cb0ef41Sopenharmony_ci	&pmuludq	($T0,&$addr(6));		# h4*s2
9721cb0ef41Sopenharmony_ci	&paddq		($D0,$T2);
	# Reload the 26-bit mask: the register may have been reused as scratch
	# (NOTE(review): $MASK appears to alias one of $T0..$T2 -- confirm
	# against the register assignments at the top of the file).
9731cb0ef41Sopenharmony_ci	 &movdqa	($MASK,&QWP(64,"ebx"));
9741cb0ef41Sopenharmony_ci	&pmuludq	($T1,&$addr(7));		# h4*s3
9751cb0ef41Sopenharmony_ci	&paddq		($D1,$T0);
9761cb0ef41Sopenharmony_ci	&paddq		($D2,$T1);
9771cb0ef41Sopenharmony_ci}
	# First product of the iteration: multipliers come from the lower table,
	# rows 16*(i-9)(edx) (the "r0^2" table per the loop-head comment).
9781cb0ef41Sopenharmony_ci	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*($i-9),"edx");	});
9791cb0ef41Sopenharmony_ci
	# Fetch the pair of blocks preceding the pair just multiplied (esi was
	# advanced by 32, hence the -16*2 displacement).  With $offbase == 0,
	# load_input spills the products d2..d4 to 16*2..16*4(esp) and leaves
	# d0/d1 in $D0/$D1.
9801cb0ef41Sopenharmony_ci	&load_input	(-16*2,0);
	# Same look-ahead as at "even": the flags from this "sub ecx,64" drive
	# cmovb below and the "ja loop" at the bottom of the iteration.
9811cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(-16*2,"esi"));
9821cb0ef41Sopenharmony_ci	&sub		("ecx",64);
9831cb0ef41Sopenharmony_ci
	# The running hash was spilled to 16*5..16*9(esp) by the earlier
	# load_input(16*2,16*5) call.
9841cb0ef41Sopenharmony_ci	&paddd		($T0,&QWP(16*(5+0),"esp"));	# add hash value
9851cb0ef41Sopenharmony_ci	&paddd		($T1,&QWP(16*(5+1),"esp"));
9861cb0ef41Sopenharmony_ci	&paddd		($D2,&QWP(16*(5+2),"esp"));
9871cb0ef41Sopenharmony_ci	&paddd		($D3,&QWP(16*(5+3),"esp"));
9881cb0ef41Sopenharmony_ci	&paddd		($D4,&QWP(16*(5+4),"esp"));
9891cb0ef41Sopenharmony_ci
9901cb0ef41Sopenharmony_ci	&cmovb		("esi","eax");
9911cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(16*10,"esp"));
9921cb0ef41Sopenharmony_ci
	# d1 joins d2..d4 on the stack because $D1 is needed for the h0 copy;
	# d0 stays in $D0 and is folded in directly below.
9931cb0ef41Sopenharmony_ci	&movdqa		($T2,&QWP(16*0,"edx"));		# r0^4
9941cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$D1);
9951cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"eax"),$T1);
9961cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"eax"),$D2);
9971cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"eax"),$D3);
9981cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"eax"),$D4);
9991cb0ef41Sopenharmony_ci
10001cb0ef41Sopenharmony_ci	################################################################
10011cb0ef41Sopenharmony_ci	# d4 += h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
10021cb0ef41Sopenharmony_ci	# d3 += h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
10031cb0ef41Sopenharmony_ci	# d2 += h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
10041cb0ef41Sopenharmony_ci	# d1 += h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
10051cb0ef41Sopenharmony_ci	# d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
10061cb0ef41Sopenharmony_ci
10071cb0ef41Sopenharmony_ci	&movdqa		($D1,$T0);
10081cb0ef41Sopenharmony_ci	&pmuludq	($T0,$T2);			# h0*r0
10091cb0ef41Sopenharmony_ci	&paddq		($T0,$D0);
10101cb0ef41Sopenharmony_ci	&movdqa		($D0,$T1);
10111cb0ef41Sopenharmony_ci	&pmuludq	($T1,$T2);			# h1*r0
10121cb0ef41Sopenharmony_ci	&pmuludq	($D2,$T2);			# h2*r0
10131cb0ef41Sopenharmony_ci	&pmuludq	($D3,$T2);			# h3*r0
10141cb0ef41Sopenharmony_ci	&pmuludq	($D4,$T2);			# h4*r0
10151cb0ef41Sopenharmony_ci
	# Fold in the first product's d1..d4 that were parked on the stack.
10161cb0ef41Sopenharmony_ci	&paddq		($T1,&QWP(16*1,"esp"));
10171cb0ef41Sopenharmony_ci	&paddq		($D2,&QWP(16*2,"esp"));
10181cb0ef41Sopenharmony_ci	&paddq		($D3,&QWP(16*3,"esp"));
10191cb0ef41Sopenharmony_ci	&paddq		($D4,&QWP(16*4,"esp"));
10201cb0ef41Sopenharmony_ci
	# Second product uses the upper table, rows 16*i(edx) ("r0^4" above).
10211cb0ef41Sopenharmony_ci	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*$i,"edx");	});
10221cb0ef41Sopenharmony_ci
10231cb0ef41Sopenharmony_ci	&lazy_reduction	();
10241cb0ef41Sopenharmony_ci
	# Pre-load the pair for the next iteration (or for the tail), spilling
	# the reduced hash to 16*5..16*9(esp).
10251cb0ef41Sopenharmony_ci	&load_input	(16*2,16*5);
10261cb0ef41Sopenharmony_ci
	# Branches on the "sub ecx,64" issued earlier in this iteration; nothing
	# emitted since then is expected to touch EFLAGS (NOTE(review): only the
	# tail of lazy_reduction is visible here -- confirm for its start).
10271cb0ef41Sopenharmony_ci	&ja		(&label("loop"));
10281cb0ef41Sopenharmony_ci
10291cb0ef41Sopenharmony_ci&set_label("skip_loop");
10301cb0ef41Sopenharmony_ci	################################################################
10311cb0ef41Sopenharmony_ci	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
	#
	# ecx is (remaining - 64) here, i.e. -32 or 0.  After "add ecx,32":
	#   zero     -> only the already-loaded pair is left; add the hash to it
	#               and this is the last multiplication.
	#   non-zero -> two pairs are left; the loaded pair is the later one and
	#               is multiplied without the hash ("long_tail").
	# pshufd ...,0x10 puts source dwords 0 and 1 into the two dword slots
	# that pmuludq reads, so the two lanes get different multipliers.
10321cb0ef41Sopenharmony_ci
10331cb0ef41Sopenharmony_ci	 &pshufd	($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
10341cb0ef41Sopenharmony_ci	&add		("ecx",32);
10351cb0ef41Sopenharmony_ci	&jnz		(&label("long_tail"));
10361cb0ef41Sopenharmony_ci
	# h0/h1 are still live in $D0/$D1; h2..h4 were spilled by load_input.
10371cb0ef41Sopenharmony_ci	&paddd		($T0,$D0);			# add hash value
10381cb0ef41Sopenharmony_ci	&paddd		($T1,$D1);
10391cb0ef41Sopenharmony_ci	&paddd		($D2,&QWP(16*7,"esp"));
10401cb0ef41Sopenharmony_ci	&paddd		($D3,&QWP(16*8,"esp"));
10411cb0ef41Sopenharmony_ci	&paddd		($D4,&QWP(16*9,"esp"));
10421cb0ef41Sopenharmony_ci
10431cb0ef41Sopenharmony_ci&set_label("long_tail");
10441cb0ef41Sopenharmony_ci
	# Save all five limbs to the scratch area at eax for pmuladd.
10451cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"eax"),$T0);
10461cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"eax"),$T1);
10471cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"eax"),$D2);
10481cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"eax"),$D3);
10491cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"eax"),$D4);
10501cb0ef41Sopenharmony_ci
10511cb0ef41Sopenharmony_ci	################################################################
10521cb0ef41Sopenharmony_ci	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
10531cb0ef41Sopenharmony_ci	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
10541cb0ef41Sopenharmony_ci	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
10551cb0ef41Sopenharmony_ci	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
10561cb0ef41Sopenharmony_ci	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
10571cb0ef41Sopenharmony_ci
10581cb0ef41Sopenharmony_ci	&pmuludq	($T0,$T2);			# h0*r0
10591cb0ef41Sopenharmony_ci	&pmuludq	($T1,$T2);			# h1*r0
10601cb0ef41Sopenharmony_ci	&pmuludq	($D2,$T2);			# h2*r0
10611cb0ef41Sopenharmony_ci	&movdqa		($D0,$T0);
10621cb0ef41Sopenharmony_ci	 &pshufd	($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
10631cb0ef41Sopenharmony_ci	&pmuludq	($D3,$T2);			# h3*r0
10641cb0ef41Sopenharmony_ci	&movdqa		($D1,$T1);
10651cb0ef41Sopenharmony_ci	&pmuludq	($D4,$T2);			# h4*r0
10661cb0ef41Sopenharmony_ci
	# pmuladd is defined outside this chunk.  The callback loads multiplier
	# row $i from the lower table; "eax" is passed as its second argument
	# (NOTE(review): presumably the base register of the saved limbs --
	# confirm against pmuladd).
10671cb0ef41Sopenharmony_ci	&pmuladd	(sub {	my ($reg,$i)=@_;
10681cb0ef41Sopenharmony_ci				&pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
10691cb0ef41Sopenharmony_ci			     },"eax");
10701cb0ef41Sopenharmony_ci
	# Still the flags of "add ecx,32" above (assuming pmuladd emits nothing
	# that modifies EFLAGS): zero means there is no earlier pair to add.
10711cb0ef41Sopenharmony_ci	&jz		(&label("short_tail"));
10721cb0ef41Sopenharmony_ci
	# Load the earlier pair; with $offbase == 0 the products d2..d4 go to
	# 16*2..16*4(esp) while d0/d1 remain in $D0/$D1.
10731cb0ef41Sopenharmony_ci	&load_input	(-16*2,0);
10741cb0ef41Sopenharmony_ci
10751cb0ef41Sopenharmony_ci	 &pshufd	($T2,&QWP(16*0,"edx"),0x10);	# r0^n
10761cb0ef41Sopenharmony_ci	&paddd		($T0,&QWP(16*5,"esp"));		# add hash value
10771cb0ef41Sopenharmony_ci	&paddd		($T1,&QWP(16*6,"esp"));
10781cb0ef41Sopenharmony_ci	&paddd		($D2,&QWP(16*7,"esp"));
10791cb0ef41Sopenharmony_ci	&paddd		($D3,&QWP(16*8,"esp"));
10801cb0ef41Sopenharmony_ci	&paddd		($D4,&QWP(16*9,"esp"));
10811cb0ef41Sopenharmony_ci
10821cb0ef41Sopenharmony_ci	################################################################
10831cb0ef41Sopenharmony_ci	# multiply inp[0:1] by r^4:r^3 and accumulate
	#
	# Each stack slot 16*k(esp) is swapped in place: the parked product d_k
	# is added into the new h_k*r0 term, then the raw limb h_k is stored
	# there for the pmuladd call that follows.
10841cb0ef41Sopenharmony_ci
10851cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$T0);
10861cb0ef41Sopenharmony_ci	&pmuludq	($T0,$T2);			# h0*r0
10871cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$T1);
10881cb0ef41Sopenharmony_ci	&pmuludq	($T1,$T2);			# h1*r0
10891cb0ef41Sopenharmony_ci	&paddq		($D0,$T0);
10901cb0ef41Sopenharmony_ci	&movdqa		($T0,$D2);
10911cb0ef41Sopenharmony_ci	&pmuludq	($D2,$T2);			# h2*r0
10921cb0ef41Sopenharmony_ci	&paddq		($D1,$T1);
10931cb0ef41Sopenharmony_ci	&movdqa		($T1,$D3);
10941cb0ef41Sopenharmony_ci	&pmuludq	($D3,$T2);			# h3*r0
10951cb0ef41Sopenharmony_ci	&paddq		($D2,&QWP(16*2,"esp"));
10961cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$T0);
10971cb0ef41Sopenharmony_ci	 &pshufd	($T0,&QWP(16*1,"edx"),0x10);	# r1^n
10981cb0ef41Sopenharmony_ci	&paddq		($D3,&QWP(16*3,"esp"));
10991cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$T1);
11001cb0ef41Sopenharmony_ci	&movdqa		($T1,$D4);
11011cb0ef41Sopenharmony_ci	&pmuludq	($D4,$T2);			# h4*r0
11021cb0ef41Sopenharmony_ci	&paddq		($D4,&QWP(16*4,"esp"));
11031cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$T1);
11041cb0ef41Sopenharmony_ci
	# Multipliers now come from the upper table, rows 16*i(edx).
11051cb0ef41Sopenharmony_ci	&pmuladd	(sub {	my ($reg,$i)=@_;
11061cb0ef41Sopenharmony_ci				&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
11071cb0ef41Sopenharmony_ci			     });
11081cb0ef41Sopenharmony_ci
11091cb0ef41Sopenharmony_ci&set_label("short_tail");
11101cb0ef41Sopenharmony_ci
11111cb0ef41Sopenharmony_ci	################################################################
11121cb0ef41Sopenharmony_ci	# horizontal addition
	#
	# pshufd 0b01001110 swaps the two qwords of a register, so the paddq
	# that follows leaves the sum of both lanes in each lane.
11131cb0ef41Sopenharmony_ci
11141cb0ef41Sopenharmony_ci	&pshufd		($T1,$D4,0b01001110);
11151cb0ef41Sopenharmony_ci	&pshufd		($T0,$D3,0b01001110);
11161cb0ef41Sopenharmony_ci	&paddq		($D4,$T1);
11171cb0ef41Sopenharmony_ci	&paddq		($D3,$T0);
11181cb0ef41Sopenharmony_ci	&pshufd		($T1,$D0,0b01001110);
11191cb0ef41Sopenharmony_ci	&pshufd		($T0,$D1,0b01001110);
11201cb0ef41Sopenharmony_ci	&paddq		($D0,$T1);
11211cb0ef41Sopenharmony_ci	&paddq		($D1,$T0);
11221cb0ef41Sopenharmony_ci	&pshufd		($T1,$D2,0b01001110);
	# The addition for $D2 is not emitted here; it is handed to
	# lazy_reduction as its optional callback, which that routine invokes
	# partway through its own instruction sequence.
11231cb0ef41Sopenharmony_ci	#&paddq		($D2,$T1);
11241cb0ef41Sopenharmony_ci
11251cb0ef41Sopenharmony_ci	&lazy_reduction	(sub { &paddq ($D2,$T1) });
11261cb0ef41Sopenharmony_ci
11271cb0ef41Sopenharmony_ci&set_label("done");
	# edi is still biased by +16*3, hence the -16*3 displacements: the low
	# dword of each of $D0..$D4 goes back to ctx+0 .. ctx+16.
11281cb0ef41Sopenharmony_ci	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
11291cb0ef41Sopenharmony_ci	&movd		(&DWP(-16*3+4*1,"edi"),$D1);
11301cb0ef41Sopenharmony_ci	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
11311cb0ef41Sopenharmony_ci	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
11321cb0ef41Sopenharmony_ci	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
	# Drop the aligned scratch frame.
11331cb0ef41Sopenharmony_ci	&mov	("esp","ebp");
	# "nodata" is placed after the esp restore: the early exit for len < 16
	# jumps here before ebp has been loaded with the caller's esp.
11341cb0ef41Sopenharmony_ci&set_label("nodata");
11351cb0ef41Sopenharmony_ci&function_end("_poly1305_blocks_sse2");
11361cb0ef41Sopenharmony_ci
# _poly1305_emit_sse2(ctx, out, key): produce the 16-byte result.
# When the is_base2_26 flag at ctx+4*5 is set, the hash is stored as five
# 26-bit limbs; they are repacked into base 2^32, fully reduced modulo
# 2^130-5, added to the four 32-bit words at key (carry out of the top word
# is dropped) and written to out.  When the flag is clear, control goes to
# "enter_emit", a label not defined in this function (NOTE(review):
# presumably the base 2^32 emit routine -- confirm).
11371cb0ef41Sopenharmony_ci&align	(32);
11381cb0ef41Sopenharmony_ci&function_begin("_poly1305_emit_sse2");
11391cb0ef41Sopenharmony_ci	&mov	("ebp",&wparam(0));		# context
11401cb0ef41Sopenharmony_ci
11411cb0ef41Sopenharmony_ci	&cmp	(&DWP(4*5,"ebp"),0);		# is_base2_26?
11421cb0ef41Sopenharmony_ci	&je	(&label("enter_emit"));
11431cb0ef41Sopenharmony_ci
11441cb0ef41Sopenharmony_ci	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
11451cb0ef41Sopenharmony_ci	&mov	("edi",&DWP(4*1,"ebp"));
11461cb0ef41Sopenharmony_ci	&mov	("ecx",&DWP(4*2,"ebp"));
11471cb0ef41Sopenharmony_ci	&mov	("edx",&DWP(4*3,"ebp"));
11481cb0ef41Sopenharmony_ci	&mov	("esi",&DWP(4*4,"ebp"));
11491cb0ef41Sopenharmony_ci
	# Limb k sits at bit 26*k, so it straddles two 32-bit words: its low
	# part is shifted left into the current word and its high part becomes
	# the start of the next one (26/6, 20/12, 14/18, 8/24 bits).  Each adc
	# carries the overflow of the preceding add into the next word.
11501cb0ef41Sopenharmony_ci	&mov	("ebx","edi");			# base 2^26 -> base 2^32
11511cb0ef41Sopenharmony_ci	&shl	("edi",26);
11521cb0ef41Sopenharmony_ci	&shr	("ebx",6);
11531cb0ef41Sopenharmony_ci	&add	("eax","edi");
11541cb0ef41Sopenharmony_ci	&mov	("edi","ecx");
11551cb0ef41Sopenharmony_ci	&adc	("ebx",0);
11561cb0ef41Sopenharmony_ci
11571cb0ef41Sopenharmony_ci	&shl	("edi",20);
11581cb0ef41Sopenharmony_ci	&shr	("ecx",12);
11591cb0ef41Sopenharmony_ci	&add	("ebx","edi");
11601cb0ef41Sopenharmony_ci	&mov	("edi","edx");
11611cb0ef41Sopenharmony_ci	&adc	("ecx",0);
11621cb0ef41Sopenharmony_ci
11631cb0ef41Sopenharmony_ci	&shl	("edi",14);
11641cb0ef41Sopenharmony_ci	&shr	("edx",18);
11651cb0ef41Sopenharmony_ci	&add	("ecx","edi");
11661cb0ef41Sopenharmony_ci	&mov	("edi","esi");
11671cb0ef41Sopenharmony_ci	&adc	("edx",0);
11681cb0ef41Sopenharmony_ci
11691cb0ef41Sopenharmony_ci	&shl	("edi",8);
11701cb0ef41Sopenharmony_ci	&shr	("esi",24);
11711cb0ef41Sopenharmony_ci	&add	("edx","edi");
11721cb0ef41Sopenharmony_ci	&adc	("esi",0);			# can be partially reduced
11731cb0ef41Sopenharmony_ci
	# Hash is now eax:ebx:ecx:edx (low to high) plus esi for bits 128 and
	# up.  Anything at bit 130 or above is folded back as *5, because
	# 2^130 is congruent to 5 modulo 2^130-5; esi keeps bits 128..129.
	# The loads of the output and key pointers are interleaved here since
	# edi and ebp become free.
11741cb0ef41Sopenharmony_ci	&mov	("edi","esi");			# final reduction
11751cb0ef41Sopenharmony_ci	&and	("esi",3);
11761cb0ef41Sopenharmony_ci	&shr	("edi",2);
11771cb0ef41Sopenharmony_ci	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
11781cb0ef41Sopenharmony_ci	 &mov	("edi",&wparam(1));		# output
11791cb0ef41Sopenharmony_ci	&add	("eax","ebp");
11801cb0ef41Sopenharmony_ci	 &mov	("ebp",&wparam(2));		# key
11811cb0ef41Sopenharmony_ci	&adc	("ebx",0);
11821cb0ef41Sopenharmony_ci	&adc	("ecx",0);
11831cb0ef41Sopenharmony_ci	&adc	("edx",0);
11841cb0ef41Sopenharmony_ci	&adc	("esi",0);
11851cb0ef41Sopenharmony_ci
	# Compute h+5 in the integer registers while keeping h in $D0..$D3.
	# If h+5 reaches bit 130 (bit 2 of esi), then h >= 2^130-5 and the low
	# 128 bits of h+5 are exactly h minus the modulus.
11861cb0ef41Sopenharmony_ci	&movd	($D0,"eax");			# offload original hash value
11871cb0ef41Sopenharmony_ci	&add	("eax",5);			# compare to modulus
11881cb0ef41Sopenharmony_ci	&movd	($D1,"ebx");
11891cb0ef41Sopenharmony_ci	&adc	("ebx",0);
11901cb0ef41Sopenharmony_ci	&movd	($D2,"ecx");
11911cb0ef41Sopenharmony_ci	&adc	("ecx",0);
11921cb0ef41Sopenharmony_ci	&movd	($D3,"edx");
11931cb0ef41Sopenharmony_ci	&adc	("edx",0);
11941cb0ef41Sopenharmony_ci	&adc	("esi",0);
11951cb0ef41Sopenharmony_ci	&shr	("esi",2);			# did it carry/borrow?
11961cb0ef41Sopenharmony_ci
	# Branch-free select: neg turns the 0/1 in esi into a 0/all-ones mask.
	# The masked (h - modulus) words are parked in the output buffer while
	# the original words are pulled back from $D0..$D3.
11971cb0ef41Sopenharmony_ci	&neg	("esi");			# do we choose (hash-modulus) ...
11981cb0ef41Sopenharmony_ci	&and	("eax","esi");
11991cb0ef41Sopenharmony_ci	&and	("ebx","esi");
12001cb0ef41Sopenharmony_ci	&and	("ecx","esi");
12011cb0ef41Sopenharmony_ci	&and	("edx","esi");
12021cb0ef41Sopenharmony_ci	&mov	(&DWP(4*0,"edi"),"eax");
12031cb0ef41Sopenharmony_ci	&movd	("eax",$D0);
12041cb0ef41Sopenharmony_ci	&mov	(&DWP(4*1,"edi"),"ebx");
12051cb0ef41Sopenharmony_ci	&movd	("ebx",$D1);
12061cb0ef41Sopenharmony_ci	&mov	(&DWP(4*2,"edi"),"ecx");
12071cb0ef41Sopenharmony_ci	&movd	("ecx",$D2);
12081cb0ef41Sopenharmony_ci	&mov	(&DWP(4*3,"edi"),"edx");
12091cb0ef41Sopenharmony_ci	&movd	("edx",$D3);
12101cb0ef41Sopenharmony_ci
	# Inverted mask keeps the original words when no reduction was needed;
	# OR-ing with the parked words merges the two alternatives.
12111cb0ef41Sopenharmony_ci	&not	("esi");			# ... or original hash value?
12121cb0ef41Sopenharmony_ci	&and	("eax","esi");
12131cb0ef41Sopenharmony_ci	&and	("ebx","esi");
12141cb0ef41Sopenharmony_ci	&or	("eax",&DWP(4*0,"edi"));
12151cb0ef41Sopenharmony_ci	&and	("ecx","esi");
12161cb0ef41Sopenharmony_ci	&or	("ebx",&DWP(4*1,"edi"));
12171cb0ef41Sopenharmony_ci	&and	("edx","esi");
12181cb0ef41Sopenharmony_ci	&or	("ecx",&DWP(4*2,"edi"));
12191cb0ef41Sopenharmony_ci	&or	("edx",&DWP(4*3,"edi"));
12201cb0ef41Sopenharmony_ci
	# 128-bit add of the four words at ebp; the mov stores are interleaved
	# with the adc chain, which is safe because mov does not alter CF.
12211cb0ef41Sopenharmony_ci	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
12221cb0ef41Sopenharmony_ci	&adc	("ebx",&DWP(4*1,"ebp"));
12231cb0ef41Sopenharmony_ci	&mov	(&DWP(4*0,"edi"),"eax");
12241cb0ef41Sopenharmony_ci	&adc	("ecx",&DWP(4*2,"ebp"));
12251cb0ef41Sopenharmony_ci	&mov	(&DWP(4*1,"edi"),"ebx");
12261cb0ef41Sopenharmony_ci	&adc	("edx",&DWP(4*3,"ebp"));
12271cb0ef41Sopenharmony_ci	&mov	(&DWP(4*2,"edi"),"ecx");
12281cb0ef41Sopenharmony_ci	&mov	(&DWP(4*3,"edi"),"edx");
12291cb0ef41Sopenharmony_ci&function_end("_poly1305_emit_sse2");
12301cb0ef41Sopenharmony_ci
12311cb0ef41Sopenharmony_ciif ($avx>1) {
12321cb0ef41Sopenharmony_ci########################################################################
12331cb0ef41Sopenharmony_ci# Note that poly1305_init_avx2 operates on %xmm, I could have used
12341cb0ef41Sopenharmony_ci# poly1305_init_sse2...
12351cb0ef41Sopenharmony_ci
########################################################################
# _poly1305_init_avx2 - pre-compute the table of key powers.
#
# Expected on entry, as far as the code below shows:
#	edi	context pointer; the key r is read as a base 2^32 value
#		from offset 4*6;
#	ebx	constants pointer; the 26-bit limb mask sits at offset 64.
#
# The key is converted to five 26-bit limbs and run through the
# "square" loop twice, which yields r^2 on the first pass and r^3:r^4
# on the second. The powers are then interleaved as r^1:r^2:r^3:r^4
# and stored, limb by limb, at offsets 16*3..16*7 of the context,
# followed by limbs 1..4 multiplied by 5 at offsets 16*8..16*11.
#
# ecx, edx and ebp are used as scratch and are not restored. esp is
# lowered and aligned to 16 bytes for a temporary frame and is put
# back from ebp before returning; edi is biased by 16*3 for the
# duration and un-biased just before the ret.
#
# Temporary frame:
#	esp+16*0..4	multiplier limbs r0..r4 for the current pass
#	esp+16*5..8	s1..s4, i.e. 5*r1..5*r4
#	edx+16*0..4	multiplicand limbs h0..h4 (edx = esp+16*9)

&align	(32);
&function_begin_B("_poly1305_init_avx2");
	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");			# remember caller's esp
	&sub		("esp",16*(9+5));
	&and		("esp",-16);			# vmovdqa needs alignment

	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
	&vmovdqa	($MASK,&QWP(64,"ebx"));

	&vpand		($D0,$D4,$MASK);		# -> base 2^26
	&vpsrlq		($D1,$D4,26);
	&vpsrldq	($D3,$D4,6);
	&vpand		($D1,$D1,$MASK);
	&vpsrlq		($D2,$D3,4);
	&vpsrlq		($D3,$D3,30);
	&vpand		($D2,$D2,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpsrldq	($D4,$D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);			# two passes
&set_label("square");
	# save the multiplier limbs for this pass
	&vmovdqa	(&QWP(16*0,"esp"),$D0);
	&vmovdqa	(&QWP(16*1,"esp"),$D1);
	&vmovdqa	(&QWP(16*2,"esp"),$D2);
	&vmovdqa	(&QWP(16*3,"esp"),$D3);
	&vmovdqa	(&QWP(16*4,"esp"),$D4);

	# ... along with limbs 1..4 times 5 (shift left by 2, add once)
	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqa	(&QWP(16*5,"esp"),$T1);
	&vmovdqa	(&QWP(16*6,"esp"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqa	(&QWP(16*7,"esp"),$T1);
	&vmovdqa	(&QWP(16*8,"esp"),$T0);

	# the multiplicand is the low 64-bit lane replicated into both
	# lanes; $T1 keeps the unshuffled r1, $D0 stays unshuffled as r0
	&vpshufd	($T0,$D0,0b01000100);
	&vmovdqa	($T1,$D1);
	&vpshufd	($D1,$D1,0b01000100);
	&vpshufd	($D2,$D2,0b01000100);
	&vpshufd	($D3,$D3,0b01000100);
	&vpshufd	($D4,$D4,0b01000100);
	&vmovdqa	(&QWP(16*0,"edx"),$T0);
	&vmovdqa	(&QWP(16*1,"edx"),$D1);
	&vmovdqa	(&QWP(16*2,"edx"),$D2);
	&vmovdqa	(&QWP(16*3,"edx"),$D3);
	&vmovdqa	(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&vpmuludq	($D4,$D4,$D0);			# h4*r0
	&vpmuludq	($D3,$D3,$D0);			# h3*r0
	&vpmuludq	($D2,$D2,$D0);			# h2*r0
	&vpmuludq	($D1,$D1,$D0);			# h1*r0
	&vpmuludq	($D0,$T0,$D0);			# h0*r0

	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
	&vpaddq		($D4,$D4,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*2,"edx"));	# r1*h2
	&vpaddq		($D3,$D3,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*1,"edx"));	# r1*h1
	&vpaddq		($D2,$D2,$T0);
	&vmovdqa	($T2,&QWP(16*5,"esp"));		# s1
	&vpmuludq	($T1,$T1,&QWP(16*0,"edx"));	# r1*h0
	&vpaddq		($D1,$D1,$T1);
	 &vmovdqa	($T0,&QWP(16*2,"esp"));		# r2
	&vpmuludq	($T2,$T2,&QWP(16*4,"edx"));	# s1*h4
	&vpaddq		($D0,$D0,$T2);

	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# r2*h2
	&vpaddq		($D4,$D4,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r2*h1
	&vpaddq		($D3,$D3,$T2);
	&vmovdqa	($T1,&QWP(16*6,"esp"));		# s2
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r2*h0
	&vpaddq		($D2,$D2,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s2*h4
	&vpaddq		($D1,$D1,$T2);
	 &vmovdqa	($T0,&QWP(16*3,"esp"));		# r3
	&vpmuludq	($T1,$T1,&QWP(16*3,"edx"));	# s2*h3
	&vpaddq		($D0,$D0,$T1);

	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r3*h1
	&vpaddq		($D4,$D4,$T2);
	&vmovdqa	($T1,&QWP(16*7,"esp"));		# s3
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r3*h0
	&vpaddq		($D3,$D3,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s3*h4
	&vpaddq		($D2,$D2,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# s3*h3
	&vpaddq		($D1,$D1,$T0);
	 &vmovdqa	($T2,&QWP(16*4,"esp"));		# r4
	&vpmuludq	($T1,$T1,&QWP(16*2,"edx"));	# s3*h2
	&vpaddq		($D0,$D0,$T1);

	&vmovdqa	($T0,&QWP(16*8,"esp"));		# s4
	&vpmuludq	($T2,$T2,&QWP(16*0,"edx"));	# r4*h0
	&vpaddq		($D4,$D4,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*4,"edx"));	# s4*h4
	&vpaddq		($D3,$D3,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# s4*h1
	&vpaddq		($D0,$D0,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# s4*h2
	&vpaddq		($D1,$D1,$T1);
	 &vmovdqa	($MASK,&QWP(64,"ebx"));		# reload the limb mask
	&vpmuludq	($T0,$T0,&QWP(16*3,"edx"));	# s4*h3
	&vpaddq		($D2,$D2,$T0);

	################################################################
	# lazy reduction
	 &vpsrlq	($T0,$D3,26);
	 &vpand		($D3,$D3,$MASK);
	&vpsrlq		($T1,$D0,26);
	&vpand		($D0,$D0,$MASK);
	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
	 &vpsrlq	($T0,$D4,26);
	 &vpand		($D4,$D4,$MASK);
	&vpsrlq		($T1,$D1,26);
	&vpand		($D1,$D1,$MASK);
	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	 &vpaddd	($D0,$D0,$T0);
	 &vpsllq	($T0,$T0,2);
	&vpsrlq		($T1,$D2,26);
	&vpand		($D2,$D2,$MASK);
	 &vpaddd	($D0,$D0,$T0);			# h4 -> h0
	&vpaddd		($D3,$D3,$T1);			# h2 -> h3
	&vpsrlq		($T1,$D3,26);
	 &vpsrlq	($T0,$D0,26);
	 &vpand		($D0,$D0,$MASK);
	&vpand		($D3,$D3,$MASK);
	 &vpaddd	($D1,$D1,$T0);			# h0 -> h1
	&vpaddd		($D4,$D4,$T1);			# h3 -> h4

	&dec		("ecx");
	&jz		(&label("square_break"));

	# first pass done: pair the fresh power with the saved one and
	# go round once more
	&vpunpcklqdq	($D0,$D0,&QWP(16*0,"esp"));	# 0:r^1:0:r^2
	&vpunpcklqdq	($D1,$D1,&QWP(16*1,"esp"));
	&vpunpcklqdq	($D2,$D2,&QWP(16*2,"esp"));
	&vpunpcklqdq	($D3,$D3,&QWP(16*3,"esp"));
	&vpunpcklqdq	($D4,$D4,&QWP(16*4,"esp"));
	&jmp		(&label("square"));

&set_label("square_break");
	&vpsllq		($D0,$D0,32);			# -> r^3:0:r^4:0
	&vpsllq		($D1,$D1,32);
	&vpsllq		($D2,$D2,32);
	&vpsllq		($D3,$D3,32);
	&vpsllq		($D4,$D4,32);
	&vpor		($D0,$D0,&QWP(16*0,"esp"));	# r^3:r^1:r^4:r^2
	&vpor		($D1,$D1,&QWP(16*1,"esp"));
	&vpor		($D2,$D2,&QWP(16*2,"esp"));
	&vpor		($D3,$D3,&QWP(16*3,"esp"));
	&vpor		($D4,$D4,&QWP(16*4,"esp"));

	&vpshufd	($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
	&vpshufd	($D1,$D1,0b10001101);
	&vpshufd	($D2,$D2,0b10001101);
	&vpshufd	($D3,$D3,0b10001101);
	&vpshufd	($D4,$D4,0b10001101);

	&vmovdqu	(&QWP(16*0,"edi"),$D0);		# save the table
	&vmovdqu	(&QWP(16*1,"edi"),$D1);
	&vmovdqu	(&QWP(16*2,"edi"),$D2);
	&vmovdqu	(&QWP(16*3,"edi"),$D3);
	&vmovdqu	(&QWP(16*4,"edi"),$D4);

	# limbs 1..4 times 5 go right after the table proper
	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqu	(&QWP(16*5,"edi"),$T1);
	&vmovdqu	(&QWP(16*6,"edi"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqu	(&QWP(16*7,"edi"),$T1);
	&vmovdqu	(&QWP(16*8,"edi"),$T0);

	&mov		("esp","ebp");			# drop the frame
	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
	&ret		();
&function_end_B("_poly1305_init_avx2");
14331cb0ef41Sopenharmony_ci
14341cb0ef41Sopenharmony_ci########################################################################
14351cb0ef41Sopenharmony_ci# now it's time to switch to %ymm
14361cb0ef41Sopenharmony_ci
# From here on the working registers are the 256-bit %ymm0-7; $MASK
# shares its register with $T2.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2) = map { "ymm$_" } 0..7;
my $MASK = $T2;

# X(reg) - give the %xmm name of a %ymm register: a leading "ymm" is
# replaced with "xmm", any other string comes back unchanged.
sub X {
	my ($name) = @_;

	$name =~ s/^ymm/xmm/;
	return $name;
}
14411cb0ef41Sopenharmony_ci
14421cb0ef41Sopenharmony_ci&align	(32);
14431cb0ef41Sopenharmony_ci&function_begin("_poly1305_blocks_avx2");
14441cb0ef41Sopenharmony_ci	&mov	("edi",&wparam(0));			# ctx
14451cb0ef41Sopenharmony_ci	&mov	("esi",&wparam(1));			# inp
14461cb0ef41Sopenharmony_ci	&mov	("ecx",&wparam(2));			# len
14471cb0ef41Sopenharmony_ci
14481cb0ef41Sopenharmony_ci	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
14491cb0ef41Sopenharmony_ci	&and	("ecx",-16);
14501cb0ef41Sopenharmony_ci	&jz	(&label("nodata"));
14511cb0ef41Sopenharmony_ci	&cmp	("ecx",64);
14521cb0ef41Sopenharmony_ci	&jae	(&label("enter_avx2"));
14531cb0ef41Sopenharmony_ci	&test	("eax","eax");				# is_base2_26?
14541cb0ef41Sopenharmony_ci	&jz	(&label("enter_blocks"));
14551cb0ef41Sopenharmony_ci
14561cb0ef41Sopenharmony_ci&set_label("enter_avx2");
14571cb0ef41Sopenharmony_ci	&vzeroupper	();
14581cb0ef41Sopenharmony_ci
14591cb0ef41Sopenharmony_ci	&call	(&label("pic_point"));
14601cb0ef41Sopenharmony_ci&set_label("pic_point");
14611cb0ef41Sopenharmony_ci	&blindpop("ebx");
14621cb0ef41Sopenharmony_ci	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
14631cb0ef41Sopenharmony_ci
14641cb0ef41Sopenharmony_ci	&test	("eax","eax");				# is_base2_26?
14651cb0ef41Sopenharmony_ci	&jnz	(&label("base2_26"));
14661cb0ef41Sopenharmony_ci
14671cb0ef41Sopenharmony_ci	&call	("_poly1305_init_avx2");
14681cb0ef41Sopenharmony_ci
14691cb0ef41Sopenharmony_ci	################################################# base 2^32 -> base 2^26
14701cb0ef41Sopenharmony_ci	&mov	("eax",&DWP(0,"edi"));
14711cb0ef41Sopenharmony_ci	&mov	("ecx",&DWP(3,"edi"));
14721cb0ef41Sopenharmony_ci	&mov	("edx",&DWP(6,"edi"));
14731cb0ef41Sopenharmony_ci	&mov	("esi",&DWP(9,"edi"));
14741cb0ef41Sopenharmony_ci	&mov	("ebp",&DWP(13,"edi"));
14751cb0ef41Sopenharmony_ci
14761cb0ef41Sopenharmony_ci	&shr	("ecx",2);
14771cb0ef41Sopenharmony_ci	&and	("eax",0x3ffffff);
14781cb0ef41Sopenharmony_ci	&shr	("edx",4);
14791cb0ef41Sopenharmony_ci	&and	("ecx",0x3ffffff);
14801cb0ef41Sopenharmony_ci	&shr	("esi",6);
14811cb0ef41Sopenharmony_ci	&and	("edx",0x3ffffff);
14821cb0ef41Sopenharmony_ci
14831cb0ef41Sopenharmony_ci	&mov	(&DWP(4*0,"edi"),"eax");
14841cb0ef41Sopenharmony_ci	&mov	(&DWP(4*1,"edi"),"ecx");
14851cb0ef41Sopenharmony_ci	&mov	(&DWP(4*2,"edi"),"edx");
14861cb0ef41Sopenharmony_ci	&mov	(&DWP(4*3,"edi"),"esi");
14871cb0ef41Sopenharmony_ci	&mov	(&DWP(4*4,"edi"),"ebp");
14881cb0ef41Sopenharmony_ci	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
14891cb0ef41Sopenharmony_ci
14901cb0ef41Sopenharmony_ci	&mov	("esi",&wparam(1));			# [reload] inp
14911cb0ef41Sopenharmony_ci	&mov	("ecx",&wparam(2));			# [reload] len
14921cb0ef41Sopenharmony_ci
14931cb0ef41Sopenharmony_ci&set_label("base2_26");
14941cb0ef41Sopenharmony_ci	&mov	("eax",&wparam(3));			# padbit
14951cb0ef41Sopenharmony_ci	&mov	("ebp","esp");
14961cb0ef41Sopenharmony_ci
14971cb0ef41Sopenharmony_ci	&sub	("esp",32*(5+9));
14981cb0ef41Sopenharmony_ci	&and	("esp",-512);				# ensure that frame
14991cb0ef41Sopenharmony_ci							# doesn't cross page
15001cb0ef41Sopenharmony_ci							# boundary, which is
15011cb0ef41Sopenharmony_ci							# essential for
15021cb0ef41Sopenharmony_ci							# misaligned 32-byte
15031cb0ef41Sopenharmony_ci							# loads
15041cb0ef41Sopenharmony_ci
15051cb0ef41Sopenharmony_ci	################################################################
15061cb0ef41Sopenharmony_ci        # expand and copy pre-calculated table to stack
15071cb0ef41Sopenharmony_ci
15081cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D0),&QWP(16*(3+0),"edi"));
15091cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(32*5+128,"esp"));	# +128 size optimization
15101cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D1),&QWP(16*(3+1),"edi"));
15111cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D2),&QWP(16*(3+2),"edi"));
15121cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D3),&QWP(16*(3+3),"edi"));
15131cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D4),&QWP(16*(3+4),"edi"));
15141cb0ef41Sopenharmony_ci	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
15151cb0ef41Sopenharmony_ci	&vpermq		($D0,$D0,0b01000000);		# 00001234 -> 12343434
15161cb0ef41Sopenharmony_ci	&vpermq		($D1,$D1,0b01000000);
15171cb0ef41Sopenharmony_ci	&vpermq		($D2,$D2,0b01000000);
15181cb0ef41Sopenharmony_ci	&vpermq		($D3,$D3,0b01000000);
15191cb0ef41Sopenharmony_ci	&vpermq		($D4,$D4,0b01000000);
15201cb0ef41Sopenharmony_ci	&vpshufd	($D0,$D0,0b11001000);		# 12343434 -> 14243444
15211cb0ef41Sopenharmony_ci	&vpshufd	($D1,$D1,0b11001000);
15221cb0ef41Sopenharmony_ci	&vpshufd	($D2,$D2,0b11001000);
15231cb0ef41Sopenharmony_ci	&vpshufd	($D3,$D3,0b11001000);
15241cb0ef41Sopenharmony_ci	&vpshufd	($D4,$D4,0b11001000);
15251cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*0-128,"edx"),$D0);
15261cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D0),&QWP(16*5,"edi"));
15271cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*1-128,"edx"),$D1);
15281cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D1),&QWP(16*6,"edi"));
15291cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*2-128,"edx"),$D2);
15301cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D2),&QWP(16*7,"edi"));
15311cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*3-128,"edx"),$D3);
15321cb0ef41Sopenharmony_ci	&vmovdqu	(&X($D3),&QWP(16*8,"edi"));
15331cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*4-128,"edx"),$D4);
15341cb0ef41Sopenharmony_ci	&vpermq		($D0,$D0,0b01000000);
15351cb0ef41Sopenharmony_ci	&vpermq		($D1,$D1,0b01000000);
15361cb0ef41Sopenharmony_ci	&vpermq		($D2,$D2,0b01000000);
15371cb0ef41Sopenharmony_ci	&vpermq		($D3,$D3,0b01000000);
15381cb0ef41Sopenharmony_ci	&vpshufd	($D0,$D0,0b11001000);
15391cb0ef41Sopenharmony_ci	&vpshufd	($D1,$D1,0b11001000);
15401cb0ef41Sopenharmony_ci	&vpshufd	($D2,$D2,0b11001000);
15411cb0ef41Sopenharmony_ci	&vpshufd	($D3,$D3,0b11001000);
15421cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*5-128,"edx"),$D0);
15431cb0ef41Sopenharmony_ci	&vmovd		(&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
15441cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*6-128,"edx"),$D1);
15451cb0ef41Sopenharmony_ci	&vmovd		(&X($D1),&DWP(-16*3+4*1,"edi"));
15461cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*7-128,"edx"),$D2);
15471cb0ef41Sopenharmony_ci	&vmovd		(&X($D2),&DWP(-16*3+4*2,"edi"));
15481cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*8-128,"edx"),$D3);
15491cb0ef41Sopenharmony_ci	&vmovd		(&X($D3),&DWP(-16*3+4*3,"edi"));
15501cb0ef41Sopenharmony_ci	&vmovd		(&X($D4),&DWP(-16*3+4*4,"edi"));
15511cb0ef41Sopenharmony_ci	&vmovdqa	($MASK,&QWP(64,"ebx"));
15521cb0ef41Sopenharmony_ci	&neg		("eax");			# padbit
15531cb0ef41Sopenharmony_ci
15541cb0ef41Sopenharmony_ci	&test		("ecx",63);
15551cb0ef41Sopenharmony_ci	&jz		(&label("even"));
15561cb0ef41Sopenharmony_ci
15571cb0ef41Sopenharmony_ci	&mov		("edx","ecx");
15581cb0ef41Sopenharmony_ci	&and		("ecx",-64);
15591cb0ef41Sopenharmony_ci	&and		("edx",63);
15601cb0ef41Sopenharmony_ci
15611cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));
15621cb0ef41Sopenharmony_ci	&cmp		("edx",32);
15631cb0ef41Sopenharmony_ci	&jb		(&label("one"));
15641cb0ef41Sopenharmony_ci
15651cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
15661cb0ef41Sopenharmony_ci	&je		(&label("two"));
15671cb0ef41Sopenharmony_ci
15681cb0ef41Sopenharmony_ci	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
15691cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*3,"esi"));
15701cb0ef41Sopenharmony_ci	&lea		("ebx",&DWP(8,"ebx"));		# three padbits
15711cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(32*5+128+8,"esp"));	# --:r^1:r^2:r^3 (*)
15721cb0ef41Sopenharmony_ci	&jmp		(&label("tail"));
15731cb0ef41Sopenharmony_ci
15741cb0ef41Sopenharmony_ci&set_label("two");
15751cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*2,"esi"));
15761cb0ef41Sopenharmony_ci	&lea		("ebx",&DWP(16,"ebx"));		# two padbits
15771cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
15781cb0ef41Sopenharmony_ci	&jmp		(&label("tail"));
15791cb0ef41Sopenharmony_ci
15801cb0ef41Sopenharmony_ci&set_label("one");
15811cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*1,"esi"));
15821cb0ef41Sopenharmony_ci	&vpxor		($T1,$T1,$T1);
15831cb0ef41Sopenharmony_ci	&lea		("ebx",&DWP(32,"ebx","eax",8));	# one or no padbits
15841cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
15851cb0ef41Sopenharmony_ci	&jmp		(&label("tail"));
15861cb0ef41Sopenharmony_ci
15871cb0ef41Sopenharmony_ci# (*)	spots marked with '--' are data from next table entry, but they
15881cb0ef41Sopenharmony_ci#	are multiplied by 0 and therefore rendered insignificant
15891cb0ef41Sopenharmony_ci
15901cb0ef41Sopenharmony_ci&set_label("even",32);
15911cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
15921cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
15931cb0ef41Sopenharmony_ci	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
15941cb0ef41Sopenharmony_ci	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
15951cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*4,"esi"));
15961cb0ef41Sopenharmony_ci	&sub		("ecx",64);
15971cb0ef41Sopenharmony_ci	&jz		(&label("tail"));
15981cb0ef41Sopenharmony_ci
15991cb0ef41Sopenharmony_ci&set_label("loop");
16001cb0ef41Sopenharmony_ci	################################################################
16011cb0ef41Sopenharmony_ci	# ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
16021cb0ef41Sopenharmony_ci	# ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
16031cb0ef41Sopenharmony_ci	# ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
16041cb0ef41Sopenharmony_ci	# ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
16051cb0ef41Sopenharmony_ci	#   \________/ \_______/
16061cb0ef41Sopenharmony_ci	################################################################
16071cb0ef41Sopenharmony_ci
# vsplat_input - emit the code that parks the hash limbs held in $D0-$D2
# on the stack (32*0..32*2 off esp) and splits the raw input loaded in
# $T0/$T1 into 26-bit limbs. On return limb 0 is in $T0, limb 1 in $T1,
# limb 2 in $D2, limb 3 in $D0 and limb 4 in $D1, the last one ORed with
# whatever pad bits ebx points at. $MASK supplies the limb mask.
# Takes no arguments.
16081cb0ef41Sopenharmony_cisub vsplat_input {
16091cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*2,"esp"),$D2);
16101cb0ef41Sopenharmony_ci	&vpsrldq	($D2,$T0,6);			# splat input
16111cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*0,"esp"),$D0);
16121cb0ef41Sopenharmony_ci	&vpsrldq	($D0,$T1,6);
16131cb0ef41Sopenharmony_ci	&vmovdqa	(&QWP(32*1,"esp"),$D1);
16141cb0ef41Sopenharmony_ci	&vpunpckhqdq	($D1,$T0,$T1);			# 4
16151cb0ef41Sopenharmony_ci	&vpunpcklqdq	($T0,$T0,$T1);			# 0:1
16161cb0ef41Sopenharmony_ci	&vpunpcklqdq	($D2,$D2,$D0);			# 2:3
16171cb0ef41Sopenharmony_ci
	# shift every limb down to bit 0, then mask to 26 bits; limb 4 is
	# the top of the block and gets the pad bit instead of a mask
16181cb0ef41Sopenharmony_ci	&vpsrlq		($D0,$D2,30);
16191cb0ef41Sopenharmony_ci	&vpsrlq		($D2,$D2,4);
16201cb0ef41Sopenharmony_ci	&vpsrlq		($T1,$T0,26);
16211cb0ef41Sopenharmony_ci	&vpsrlq		($D1,$D1,40);			# 4
16221cb0ef41Sopenharmony_ci	&vpand		($D2,$D2,$MASK);		# 2
16231cb0ef41Sopenharmony_ci	&vpand		($T0,$T0,$MASK);		# 0
16241cb0ef41Sopenharmony_ci	&vpand		($T1,$T1,$MASK);		# 1
16251cb0ef41Sopenharmony_ci	&vpand		($D0,$D0,$MASK);		# 3 (*)
16261cb0ef41Sopenharmony_ci	&vpor		($D1,$D1,&QWP(0,"ebx"));	# padbit, yes, always
16271cb0ef41Sopenharmony_ci
16281cb0ef41Sopenharmony_ci	# (*)	note that output is counterintuitive, limbs 3 and 4 of the
16291cb0ef41Sopenharmony_ci	#	input are returned in $D0 and $D1, while $D3-4 are preserved;
16301cb0ef41Sopenharmony_ci}
16311cb0ef41Sopenharmony_ci	&vsplat_input	();
16321cb0ef41Sopenharmony_ci
# vpmuladd(addr) - emit one "add the input to the hash and multiply by
# the key powers" step.
#
# addr is a code reference taking a table index and returning the memory
# operand for that entry; as used below, indices 0..4 stand for r0..r4
# and 5..8 for s1..s4 (the r limbs multiplied by 5).
#
# Input limbs are taken from $T0, $T1, $D2, $D0, $D1 (limbs 0..4); hash
# limbs 0..2 are read back from 32*0..32*2 off esp and limbs 3..4 from
# $D3/$D4. The unreduced products d0..d4 are left in $D0..$D4. $T0-$T2
# are scratch, 32*1, 32*3 and 32*4 off esp are overwritten, and $MASK is
# reloaded from 64 off ebx along the way.
sub vpmuladd {
	my ($addr) = @_;

	# h += input
	&vpaddq		($D2,$D2,&QWP(32*2,"esp"));	# h2
	&vpaddq		($T0,$T0,&QWP(32*0,"esp"));	# h0
	&vpaddq		($T1,$T1,&QWP(32*1,"esp"));	# h1
	&vpaddq		($D0,$D0,$D3);			# h3
	&vpaddq		($D1,$D1,$D4);			# h4

	################################################################
	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
	# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
	# d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
	# d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3

	# h2 column; h1, h3 and h4 are spilled while their registers
	# are taken over by the accumulators
	&vpmuludq	($D3,$D2,$addr->(1));		# d3  = h2*r1
	 &vmovdqa	(&QWP(32*1,"esp"),$T1);		# spill h1
	&vpmuludq	($D4,$D2,$addr->(2));		# d4  = h2*r2
	 &vmovdqa	(&QWP(32*3,"esp"),$D0);		# spill h3
	&vpmuludq	($D0,$D2,$addr->(7));		# d0  = h2*s3
	 &vmovdqa	(&QWP(32*4,"esp"),$D1);		# spill h4
	&vpmuludq	($D1,$D2,$addr->(8));		# d1  = h2*s4
	&vpmuludq	($D2,$D2,$addr->(0));		# d2  = h2*r0

	# h0 column
	&vpmuludq	($T2,$T0,$addr->(3));
	&vpaddq		($D3,$D3,$T2);			# d3 += h0*r3
	&vpmuludq	($T1,$T0,$addr->(4));
	&vpaddq		($D4,$D4,$T1);			# d4 += h0*r4
	&vpmuludq	($T2,$T0,$addr->(0));
	&vpaddq		($D0,$D0,$T2);			# d0 += h0*r0
	 &vmovdqa	($T2,&QWP(32*1,"esp"));		# fetch h1
	&vpmuludq	($T1,$T0,$addr->(1));
	&vpaddq		($D1,$D1,$T1);			# d1 += h0*r1
	&vpmuludq	($T0,$T0,$addr->(2));
	&vpaddq		($D2,$D2,$T0);			# d2 += h0*r2

	# h1 column
	&vpmuludq	($T1,$T2,$addr->(2));
	&vpaddq		($D3,$D3,$T1);			# d3 += h1*r2
	&vpmuludq	($T0,$T2,$addr->(3));
	&vpaddq		($D4,$D4,$T0);			# d4 += h1*r3
	&vpmuludq	($T1,$T2,$addr->(8));
	&vpaddq		($D0,$D0,$T1);			# d0 += h1*s4
	 &vmovdqa	($T1,&QWP(32*3,"esp"));		# fetch h3
	&vpmuludq	($T0,$T2,$addr->(0));
	&vpaddq		($D1,$D1,$T0);			# d1 += h1*r0
	&vpmuludq	($T2,$T2,$addr->(1));
	&vpaddq		($D2,$D2,$T2);			# d2 += h1*r1

	# h3 column
	&vpmuludq	($T0,$T1,$addr->(0));
	&vpaddq		($D3,$D3,$T0);			# d3 += h3*r0
	&vpmuludq	($T2,$T1,$addr->(1));
	&vpaddq		($D4,$D4,$T2);			# d4 += h3*r1
	&vpmuludq	($T0,$T1,$addr->(6));
	&vpaddq		($D0,$D0,$T0);			# d0 += h3*s2
	 &vmovdqa	($T0,&QWP(32*4,"esp"));		# fetch h4
	&vpmuludq	($T2,$T1,$addr->(7));
	&vpaddq		($D1,$D1,$T2);			# d1 += h3*s3
	&vpmuludq	($T1,$T1,$addr->(8));
	&vpaddq		($D2,$D2,$T1);			# d2 += h3*s4

	# h4 column
	&vpmuludq	($T2,$T0,$addr->(8));
	&vpaddq		($D3,$D3,$T2);			# d3 += h4*s4
	&vpmuludq	($T1,$T0,$addr->(5));
	&vpaddq		($D0,$D0,$T1);			# d0 += h4*s1
	&vpmuludq	($T2,$T0,$addr->(0));
	&vpaddq		($D4,$D4,$T2);			# d4 += h4*r0
	 &vmovdqa	($MASK,&QWP(64,"ebx"));		# $T2 is free, reload mask
	&vpmuludq	($T1,$T0,$addr->(6));
	&vpaddq		($D1,$D1,$T1);			# d1 += h4*s2
	&vpmuludq	($T0,$T0,$addr->(7));
	&vpaddq		($D2,$D2,$T0);			# d2 += h4*s3
}
17061cb0ef41Sopenharmony_ci	&vpmuladd	(sub {	my $i=shift; &QWP(32*$i-128,"edx");	});
17071cb0ef41Sopenharmony_ci
# vlazy_reduction - emit the carry propagation over the five accumulators
# $D0-$D4. A carry is the limb shifted right by 26 bits; the limb itself
# is then masked with $MASK. Two chains are interleaved, the one set off
# by an extra leading space starting at limb 3 and the other at limb 0.
# The last two additions (into limbs 1 and 4) are not followed by another
# mask. $T0 and $T1 are scratch. Takes no arguments.
17081cb0ef41Sopenharmony_cisub vlazy_reduction {
17091cb0ef41Sopenharmony_ci	################################################################
17101cb0ef41Sopenharmony_ci	# lazy reduction
17111cb0ef41Sopenharmony_ci
17121cb0ef41Sopenharmony_ci	 &vpsrlq	($T0,$D3,26);
17131cb0ef41Sopenharmony_ci	 &vpand		($D3,$D3,$MASK);
17141cb0ef41Sopenharmony_ci	&vpsrlq		($T1,$D0,26);
17151cb0ef41Sopenharmony_ci	&vpand		($D0,$D0,$MASK);
17161cb0ef41Sopenharmony_ci	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
17171cb0ef41Sopenharmony_ci	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
17181cb0ef41Sopenharmony_ci	 &vpsrlq	($T0,$D4,26);
17191cb0ef41Sopenharmony_ci	 &vpand		($D4,$D4,$MASK);
17201cb0ef41Sopenharmony_ci	&vpsrlq		($T1,$D1,26);
17211cb0ef41Sopenharmony_ci	&vpand		($D1,$D1,$MASK);
17221cb0ef41Sopenharmony_ci	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	# the carry out of limb 4 goes into limb 0 five times over:
	# once as is, and once more after being shifted left by 2
17231cb0ef41Sopenharmony_ci	 &vpaddq	($D0,$D0,$T0);
17241cb0ef41Sopenharmony_ci	 &vpsllq	($T0,$T0,2);
17251cb0ef41Sopenharmony_ci	&vpsrlq		($T1,$D2,26);
17261cb0ef41Sopenharmony_ci	&vpand		($D2,$D2,$MASK);
17271cb0ef41Sopenharmony_ci	 &vpaddq	($D0,$D0,$T0);			# h4 -> h0
17281cb0ef41Sopenharmony_ci	&vpaddq		($D3,$D3,$T1);			# h2 -> h3
17291cb0ef41Sopenharmony_ci	&vpsrlq		($T1,$D3,26);
17301cb0ef41Sopenharmony_ci	 &vpsrlq	($T0,$D0,26);
17311cb0ef41Sopenharmony_ci	 &vpand		($D0,$D0,$MASK);
17321cb0ef41Sopenharmony_ci	&vpand		($D3,$D3,$MASK);
17331cb0ef41Sopenharmony_ci	 &vpaddq	($D1,$D1,$T0);			# h0 -> h1
17341cb0ef41Sopenharmony_ci	&vpaddq		($D4,$D4,$T1);			# h3 -> h4
17351cb0ef41Sopenharmony_ci}
17361cb0ef41Sopenharmony_ci	&vlazy_reduction();
17371cb0ef41Sopenharmony_ci
17381cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
17391cb0ef41Sopenharmony_ci	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
17401cb0ef41Sopenharmony_ci	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
17411cb0ef41Sopenharmony_ci	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
17421cb0ef41Sopenharmony_ci	&lea		("esi",&DWP(16*4,"esi"));
17431cb0ef41Sopenharmony_ci	&sub		("ecx",64);
17441cb0ef41Sopenharmony_ci	&jnz		(&label("loop"));
17451cb0ef41Sopenharmony_ci
# "tail": reached by fall-through when the loop counter hits zero (and
# possibly by jumps from code outside this chunk).  Performs one more
# multiply-accumulate pass, folds the four 64-bit lanes of every accumulator
# into one, reduces, and then decides whether any input is left.
17461cb0ef41Sopenharmony_ci&set_label("tail");
# NOTE(review): vsplat_input and vpmuladd are helpers defined outside this
# chunk; their exact effect cannot be confirmed from here.
17471cb0ef41Sopenharmony_ci	&vsplat_input	();
# Clears the low six bits of ebx, i.e. rounds it down to a multiple of 64.
17481cb0ef41Sopenharmony_ci	&and		("ebx",-64);			# restore pointer
17491cb0ef41Sopenharmony_ci
# The callback maps index $i to the memory operand at byte offset
# 4+32*$i-128 from edx (presumably selecting entries of a table addressed
# through edx -- confirm against the table layout set up earlier).
17501cb0ef41Sopenharmony_ci	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});
17511cb0ef41Sopenharmony_ci
17521cb0ef41Sopenharmony_ci	################################################################
17531cb0ef41Sopenharmony_ci	# horizontal addition
# Step 1: vpsrldq shifts each 128-bit lane right by 8 bytes, so the following
# vpaddq adds qword 1 into qword 0 and qword 3 into qword 2 of each $Dn.
# The shifts and adds for different accumulators are interleaved, alternating
# between the two temporaries $T0 and $T1.
17541cb0ef41Sopenharmony_ci
17551cb0ef41Sopenharmony_ci	&vpsrldq	($T0,$D4,8);
17561cb0ef41Sopenharmony_ci	&vpsrldq	($T1,$D3,8);
17571cb0ef41Sopenharmony_ci	&vpaddq		($D4,$D4,$T0);
17581cb0ef41Sopenharmony_ci	&vpsrldq	($T0,$D0,8);
17591cb0ef41Sopenharmony_ci	&vpaddq		($D3,$D3,$T1);
17601cb0ef41Sopenharmony_ci	&vpsrldq	($T1,$D1,8);
17611cb0ef41Sopenharmony_ci	&vpaddq		($D0,$D0,$T0);
17621cb0ef41Sopenharmony_ci	&vpsrldq	($T0,$D2,8);
17631cb0ef41Sopenharmony_ci	&vpaddq		($D1,$D1,$T1);
# Step 2: vpermq with selector 2 places source qword 2 into destination
# qword 0, so after the matching vpaddq qword 0 of each $Dn holds the sum of
# all four original qwords.  Only qword 0 is a meaningful total afterwards.
17641cb0ef41Sopenharmony_ci	&vpermq		($T1,$D4,2);			# keep folding
17651cb0ef41Sopenharmony_ci	&vpaddq		($D2,$D2,$T0);
17661cb0ef41Sopenharmony_ci	&vpermq		($T0,$D3,2);
17671cb0ef41Sopenharmony_ci	&vpaddq		($D4,$D4,$T1);
17681cb0ef41Sopenharmony_ci	&vpermq		($T1,$D0,2);
17691cb0ef41Sopenharmony_ci	&vpaddq		($D3,$D3,$T0);
17701cb0ef41Sopenharmony_ci	&vpermq		($T0,$D1,2);
17711cb0ef41Sopenharmony_ci	&vpaddq		($D0,$D0,$T1);
17721cb0ef41Sopenharmony_ci	&vpermq		($T1,$D2,2);
17731cb0ef41Sopenharmony_ci	&vpaddq		($D1,$D1,$T0);
17741cb0ef41Sopenharmony_ci	&vpaddq		($D2,$D2,$T1);
17751cb0ef41Sopenharmony_ci
# Carry-propagate the folded limbs once more.
17761cb0ef41Sopenharmony_ci	&vlazy_reduction();
17771cb0ef41Sopenharmony_ci
# ecx == 0: go store the result at "done"; otherwise fall through to the
# code that follows.
17781cb0ef41Sopenharmony_ci	&cmp		("ecx",0);
17791cb0ef41Sopenharmony_ci	&je		(&label("done"));
17801cb0ef41Sopenharmony_ci
17811cb0ef41Sopenharmony_ci	################################################################
17821cb0ef41Sopenharmony_ci	# clear all but single word
# Executed only when ecx is non-zero after the tail reduction.
# vpshufd with selector 0b11111100 keeps dword 0 in place and copies source
# dword 3 into dwords 1..3.  For this to "clear" them, dword 3 must already
# be zero -- presumably guaranteed by the masking in vlazy_reduction; confirm
# against the definition of $MASK.  The shuffles operate on the &X() view of
# each accumulator (presumably the 128-bit xmm alias).
17831cb0ef41Sopenharmony_ci
17841cb0ef41Sopenharmony_ci	&vpshufd	(&X($D0),&X($D0),0b11111100);
# edx is re-pointed at esp+32*5+128; scheduled between the shuffles.
17851cb0ef41Sopenharmony_ci	&lea		("edx",&DWP(32*5+128,"esp"));	# restore pointer
17861cb0ef41Sopenharmony_ci	&vpshufd	(&X($D1),&X($D1),0b11111100);
17871cb0ef41Sopenharmony_ci	&vpshufd	(&X($D2),&X($D2),0b11111100);
17881cb0ef41Sopenharmony_ci	&vpshufd	(&X($D3),&X($D3),0b11111100);
17891cb0ef41Sopenharmony_ci	&vpshufd	(&X($D4),&X($D4),0b11111100);
# Continue at the "even" label, which is defined outside this chunk.
17901cb0ef41Sopenharmony_ci	&jmp		(&label("even"));
17911cb0ef41Sopenharmony_ci
# "done": write the result back and leave the function.  The second argument
# of set_label is presumably the label's alignment (16 here).
17921cb0ef41Sopenharmony_ci&set_label("done",16);
# The low 32 bits of each accumulator $D0..$D4 are stored to five consecutive
# dwords starting at edi-16*3.
17931cb0ef41Sopenharmony_ci	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
17941cb0ef41Sopenharmony_ci	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
17951cb0ef41Sopenharmony_ci	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
17961cb0ef41Sopenharmony_ci	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
17971cb0ef41Sopenharmony_ci	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
# Zero the upper halves of the ymm registers before returning to the caller.
17981cb0ef41Sopenharmony_ci	&vzeroupper	();
# Reload esp from ebp (ebp presumably holds the stack pointer saved at
# function entry -- the prologue is outside this chunk).
17991cb0ef41Sopenharmony_ci	&mov	("esp","ebp");
# "nodata" sits directly before function_end, so a jump to it skips the
# store, vzeroupper and esp reload above.
18001cb0ef41Sopenharmony_ci&set_label("nodata");
18011cb0ef41Sopenharmony_ci&function_end("_poly1305_blocks_avx2");
18021cb0ef41Sopenharmony_ci}
# Constant table shared by the vector code paths, emitted under the label
# "const_sse2" (second set_label argument is presumably the alignment, 64).
# Rows, in order:
#   1. 1<<24 in the low dword of each of four qwords (presumably the 2^128
#      padding bit expressed in the top 26-bit limb: 128 - 4*26 = 24);
#   2. 32 bytes of zeros;
#   3. 0x03ffffff in the low dword of each of four qwords, i.e. a 26-bit mask
#      per 64-bit element;
#   4. 0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc -- these values match the
#      Poly1305 clamping mask for the key half r (usage is outside this chunk).
18031cb0ef41Sopenharmony_ci&set_label("const_sse2",64);
18041cb0ef41Sopenharmony_ci	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
18051cb0ef41Sopenharmony_ci	&data_word(0,0,		0,0,		0,0,		0,0);
18061cb0ef41Sopenharmony_ci	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
18071cb0ef41Sopenharmony_ci	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
18081cb0ef41Sopenharmony_ci}
# File trailer: emit the identification string, pad to a 4-byte boundary and
# let the perlasm backend finish the output.
18091cb0ef41Sopenharmony_ci&asciz	("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
18101cb0ef41Sopenharmony_ci&align	(4);
18111cb0ef41Sopenharmony_ci
18121cb0ef41Sopenharmony_ci&asm_finish();
18131cb0ef41Sopenharmony_ci
# Buffered write errors only surface at close, so a failed close must abort
# the script instead of leaving a truncated assembly file behind.
18141cb0ef41Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1815