#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86.
#
# April 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-3.4(*)  SSE2(**)  AVX2
# Pentium       15.7/+80%        -
# PIII          6.21/+90%        -
# P4            19.8/+40%        3.24
# Core 2        4.85/+90%        1.80
# Westmere      4.58/+100%       1.43
# Sandy Bridge  3.90/+100%       1.36
# Haswell       3.88/+70%        1.18      0.72
# Skylake       3.10/+60%        1.14      0.62
# Silvermont    11.0/+40%        4.80
# Goldmont      4.10/+200%       2.10
# VIA Nano      6.71/+90%        2.47
# Sledgehammer  3.51/+180%       4.27
# Bulldozer     4.53/+140%       1.31
#
# (*)   gcc 4.8 for some reason generated worse code;
# (**)  besides SSE2 there are floating-point and AVX options; FP
#       is deemed unnecessary, because pre-SSE2 processors are too
#       old to care about, while it's not the fastest option on
#       SSE2-capable ones; AVX is omitted, because it doesn't give
#       a lot of improvement, 5-10% depending on processor;

# Directory part of the script path (including the trailing separator),
# used to locate the perlasm support code.  When $0 carries no directory
# part the match fails and $dir is explicitly left empty instead of
# picking up whatever $1 happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  It is always
# removed from @ARGV; if it is a true value STDOUT is redirected to it.
# Three-argument open keeps the file name from being interpreted as a
# mode, and a failed open now aborts instead of silently writing the
# generated assembly to the original STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# First remaining argument selects the assembler flavour; a trailing
# "386" argument is passed on as the second asm_init parameter.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $sse2: emit the SSE2 code path (requested with -DOPENSSL_IA32_SSE2).
# $avx:  0, 1 or 2 depending on what the assembler is found to support;
#        only values greater than 1 enable the AVX2 selection in
#        poly1305_init.
$sse2=$avx=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

if ($sse2) {
    &static_label("const_sse2");
    &static_label("enter_blocks");
    &static_label("enter_emit");
    &external_label("OPENSSL_ia32cap_P");

    # NOTE(review): the captured versions below are compared as decimal
    # numbers, not as dotted version strings (e.g. "2.9" compares
    # greater than "2.19").

    # GNU assembler, probed through the compiler driver named by $ENV{CC}.
    if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
    }

    # NASM, only consulted for the win32n flavour.
    if (!$avx && $ARGV[0] eq "win32n" &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
    }

    # clang/LLVM integrated assembler.
    if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
    }
}

########################################################################
# Layout of opaque area is following.
#
#   unsigned __int32 h[5];      # current hash value base 2^32
#   unsigned __int32 pad;       # is_base2_26 in vector context
#   unsigned __int32 r[4];      # key value base 2^32

########################################################################
# poly1305_init(ctx, key, func_table)
#
#   wparam(0)  context (opaque area laid out as above)
#   wparam(1)  key, may be NULL
#   wparam(2)  function table, two pointer-sized slots
#
# Zeroes h[0..4] and the pad/is_base2_26 word.  With a NULL key it
# leaves through "nokey" with eax still zero.  Otherwise the first 16
# key bytes are masked (0x0fffffff for word 0, 0x0ffffffc for words
# 1..3) and stored as r[0..3]; eax is set to the generation-time $sse2
# flag.  When $sse2 is set, the function table is filled with the
# blocks/emit entry points selected from OPENSSL_ia32cap_P.
&align(64);
&function_begin("poly1305_init");
    &mov    ("edi",&wparam(0));         # context
    &mov    ("esi",&wparam(1));         # key
    &mov    ("ebp",&wparam(2));         # function table

    &xor    ("eax","eax");
    &mov    (&DWP(4*0,"edi"),"eax");    # zero hash value
    &mov    (&DWP(4*1,"edi"),"eax");
    &mov    (&DWP(4*2,"edi"),"eax");
    &mov    (&DWP(4*3,"edi"),"eax");
    &mov    (&DWP(4*4,"edi"),"eax");
    &mov    (&DWP(4*5,"edi"),"eax");    # is_base2_26

    &cmp    ("esi",0);
    &je     (&label("nokey"));

    if ($sse2) {
        # Position-independent address computation: ebx = address of
        # "pic_point", every target is expressed relative to it.
        &call   (&label("pic_point"));
    &set_label("pic_point");
        &blindpop("ebx");

        # Default to the integer-only implementations.
        &lea    ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
        &lea    ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));

        # picmeup clobbers edi; the context pointer is reloaded below.
        &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
        &mov    ("ecx",&DWP(0,"edi"));
        &and    ("ecx",1<<26|1<<24);
        &cmp    ("ecx",1<<26|1<<24);    # SSE2 and XMM?
        &jne    (&label("no_sse2"));

        &lea    ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
        &lea    ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));

        if ($avx>1) {
            &mov    ("ecx",&DWP(8,"edi"));
            &test   ("ecx",1<<5);       # AVX2?
            &jz     (&label("no_sse2"));

            &lea    ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
        }
    &set_label("no_sse2");
        &mov    ("edi",&wparam(0));     # reload context
        &mov    (&DWP(0,"ebp"),"eax");  # fill function table
        &mov    (&DWP(4,"ebp"),"edx");
    }

    &mov    ("eax",&DWP(4*0,"esi"));    # load input key
    &mov    ("ebx",&DWP(4*1,"esi"));
    &mov    ("ecx",&DWP(4*2,"esi"));
    &mov    ("edx",&DWP(4*3,"esi"));
    &and    ("eax",0x0fffffff);         # clamp the key
    &and    ("ebx",0x0ffffffc);
    &and    ("ecx",0x0ffffffc);
    &and    ("edx",0x0ffffffc);
    &mov    (&DWP(4*6,"edi"),"eax");    # r[0..3] live at ctx+24..ctx+39
    &mov    (&DWP(4*7,"edi"),"ebx");
    &mov    (&DWP(4*8,"edi"),"ecx");
    &mov    (&DWP(4*9,"edi"),"edx");

    &mov    ("eax",$sse2);              # return value: generation-time flag
&set_label("nokey");
&function_end("poly1305_init");

# Byte offsets of the 16 dword scratch slots that poly1305_blocks keeps
# on the stack (allocated with stack_push(16)): hash words put aside,
# partial products d0..d3, key words r0..r3 and s1..s3 = r + (r>>2).
($h0,$h1,$h2,$h3,$h4,
 $d0,$d1,$d2,$d3,
 $r0,$r1,$r2,$r3,
     $s1,$s2,$s3)=map(4*$_,(0..15));

########################################################################
# poly1305_blocks(ctx, inp, len, padbit)
#
#   wparam(0)  ctx
#   wparam(1)  inp
#   wparam(2)  len; only whole 16-byte blocks are processed
#   wparam(3)  padbit, added into the top hash word for every block
#
# For every 16-byte block: h += block (+ padbit in h4), then h is
# multiplied by r using 32x32->64 multiplications and partially reduced.
# The slot of wparam(2) is reused to hold the end-of-input pointer.
# "enter_blocks" is also a jump target for _poly1305_blocks_sse2.
&function_begin("poly1305_blocks");
    &mov    ("edi",&wparam(0));         # ctx
    &mov    ("esi",&wparam(1));         # inp
    &mov    ("ecx",&wparam(2));         # len
&set_label("enter_blocks");
    # Round len down to a multiple of 16.  The mask used to be -15
    # (0xfffffff1), which keeps bit 0: with an odd length the end pointer
    # computed below is never hit by the 16-byte stride and the
    # "cmp/jne" loop exit cannot trigger.  Whole-block lengths give the
    # same result with either mask.
    &and    ("ecx",-16);
    &jz     (&label("nodata"));

    &stack_push(16);
    &mov    ("eax",&DWP(4*6,"edi"));    # r0
    &mov    ("ebx",&DWP(4*7,"edi"));    # r1
    &lea    ("ebp",&DWP(0,"esi","ecx"));# end of input
    &mov    ("ecx",&DWP(4*8,"edi"));    # r2
    &mov    ("edx",&DWP(4*9,"edi"));    # r3

    &mov    (&wparam(2),"ebp");         # remember end of input
    &mov    ("ebp","esi");              # ebp is the running input pointer

    &mov    (&DWP($r0,"esp"),"eax");    # r0
    &mov    ("eax","ebx");
    &shr    ("eax",2);
    &mov    (&DWP($r1,"esp"),"ebx");    # r1
    &add    ("eax","ebx");              # s1
    &mov    ("ebx","ecx");
    &shr    ("ebx",2);
    &mov    (&DWP($r2,"esp"),"ecx");    # r2
    &add    ("ebx","ecx");              # s2
    &mov    ("ecx","edx");
    &shr    ("ecx",2);
    &mov    (&DWP($r3,"esp"),"edx");    # r3
    &add    ("ecx","edx");              # s3
    &mov    (&DWP($s1,"esp"),"eax");    # s1
    &mov    (&DWP($s2,"esp"),"ebx");    # s2
    &mov    (&DWP($s3,"esp"),"ecx");    # s3

    &mov    ("eax",&DWP(4*0,"edi"));    # load hash value
    &mov    ("ebx",&DWP(4*1,"edi"));
    &mov    ("ecx",&DWP(4*2,"edi"));
    &mov    ("esi",&DWP(4*3,"edi"));
    &mov    ("edi",&DWP(4*4,"edi"));
    &jmp    (&label("loop"));

&set_label("loop",32);
    &add    ("eax",&DWP(4*0,"ebp"));    # accumulate input
    &adc    ("ebx",&DWP(4*1,"ebp"));
    &adc    ("ecx",&DWP(4*2,"ebp"));
    &adc    ("esi",&DWP(4*3,"ebp"));
    &lea    ("ebp",&DWP(4*4,"ebp"));    # advance input, flags untouched
    &adc    ("edi",&wparam(3));         # padbit

    &mov    (&DWP($h0,"esp"),"eax");    # put aside hash[+inp]
    &mov    (&DWP($h3,"esp"),"esi");

    # d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in esi:edi
    &mul    (&DWP($r0,"esp"));          # h0*r0
    &mov    (&DWP($h4,"esp"),"edi");
    &mov    ("edi","eax");
    &mov    ("eax","ebx");              # h1
    &mov    ("esi","edx");
    &mul    (&DWP($s3,"esp"));          # h1*s3
    &add    ("edi","eax");
    &mov    ("eax","ecx");              # h2
    &adc    ("esi","edx");
    &mul    (&DWP($s2,"esp"));          # h2*s2
    &add    ("edi","eax");
    &mov    ("eax",&DWP($h3,"esp"));
    &adc    ("esi","edx");
    &mul    (&DWP($s1,"esp"));          # h3*s1
    &add    ("edi","eax");
    &mov    ("eax",&DWP($h0,"esp"));
    &adc    ("esi","edx");

    # d1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, accumulated in edi:esi
    &mul    (&DWP($r1,"esp"));          # h0*r1
    &mov    (&DWP($d0,"esp"),"edi");
    &xor    ("edi","edi");
    &add    ("esi","eax");
    &mov    ("eax","ebx");              # h1
    &adc    ("edi","edx");
    &mul    (&DWP($r0,"esp"));          # h1*r0
    &add    ("esi","eax");
    &mov    ("eax","ecx");              # h2
    &adc    ("edi","edx");
    &mul    (&DWP($s3,"esp"));          # h2*s3
    &add    ("esi","eax");
    &mov    ("eax",&DWP($h3,"esp"));
    &adc    ("edi","edx");
    &mul    (&DWP($s2,"esp"));          # h3*s2
    &add    ("esi","eax");
    &mov    ("eax",&DWP($h4,"esp"));
    &adc    ("edi","edx");
    &imul   ("eax",&DWP($s1,"esp"));    # h4*s1
    &add    ("esi","eax");
    &mov    ("eax",&DWP($h0,"esp"));
    &adc    ("edi",0);

    # d2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, accumulated in esi:edi
    &mul    (&DWP($r2,"esp"));          # h0*r2
    &mov    (&DWP($d1,"esp"),"esi");
    &xor    ("esi","esi");
    &add    ("edi","eax");
    &mov    ("eax","ebx");              # h1
    &adc    ("esi","edx");
    &mul    (&DWP($r1,"esp"));          # h1*r1
    &add    ("edi","eax");
    &mov    ("eax","ecx");              # h2
    &adc    ("esi","edx");
    &mul    (&DWP($r0,"esp"));          # h2*r0
    &add    ("edi","eax");
    &mov    ("eax",&DWP($h3,"esp"));
    &adc    ("esi","edx");
    &mul    (&DWP($s3,"esp"));          # h3*s3
    &add    ("edi","eax");
    &mov    ("eax",&DWP($h4,"esp"));
    &adc    ("esi","edx");
    &imul   ("eax",&DWP($s2,"esp"));    # h4*s2
    &add    ("edi","eax");
    &mov    ("eax",&DWP($h0,"esp"));
    &adc    ("esi",0);

    # d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, accumulated in edi:esi
    &mul    (&DWP($r3,"esp"));          # h0*r3
    &mov    (&DWP($d2,"esp"),"edi");
    &xor    ("edi","edi");
    &add    ("esi","eax");
    &mov    ("eax","ebx");              # h1
    &adc    ("edi","edx");
    &mul    (&DWP($r2,"esp"));          # h1*r2
    &add    ("esi","eax");
    &mov    ("eax","ecx");              # h2
    &adc    ("edi","edx");
    &mul    (&DWP($r1,"esp"));          # h2*r1
    &add    ("esi","eax");
    &mov    ("eax",&DWP($h3,"esp"));
    &adc    ("edi","edx");
    &mul    (&DWP($r0,"esp"));          # h3*r0
    &add    ("esi","eax");
    &mov    ("ecx",&DWP($h4,"esp"));
    &adc    ("edi","edx");

    &mov    ("edx","ecx");
    &imul   ("ecx",&DWP($s3,"esp"));    # h4*s3
    &add    ("esi","ecx");
    &mov    ("eax",&DWP($d0,"esp"));
    &adc    ("edi",0);

    &imul   ("edx",&DWP($r0,"esp"));    # h4*r0
    &add    ("edx","edi");              # d4

    &mov    ("ebx",&DWP($d1,"esp"));
    &mov    ("ecx",&DWP($d2,"esp"));

    # Keep the low two bits of d4 as the new h4 and fold the rest,
    # multiplied by 5, back into the low words.
    &mov    ("edi","edx");              # last reduction step
    &shr    ("edx",2);
    &and    ("edi",3);
    &lea    ("edx",&DWP(0,"edx","edx",4));  # *5
    &add    ("eax","edx");
    &adc    ("ebx",0);
    &adc    ("ecx",0);
    &adc    ("esi",0);
    &adc    ("edi",0);

    &cmp    ("ebp",&wparam(2));         # done yet?
    &jne    (&label("loop"));

    &mov    ("edx",&wparam(0));         # ctx
    &stack_pop(16);
    &mov    (&DWP(4*0,"edx"),"eax");    # store hash value
    &mov    (&DWP(4*1,"edx"),"ebx");
    &mov    (&DWP(4*2,"edx"),"ecx");
    &mov    (&DWP(4*3,"edx"),"esi");
    &mov    (&DWP(4*4,"edx"),"edi");
&set_label("nodata");
&function_end("poly1305_blocks");

########################################################################
# poly1305_emit(ctx, out, nonce)
#
#   wparam(0)  context
#   wparam(1)  output, receives four 32-bit words
#   wparam(2)  pointer to four 32-bit words that are added to the
#              result before it is stored ("accumulate key" below)
#
# Computes h+5 and uses the carry into bit 2 of the fifth word to
# select, without branching, between h+5 (h was not below the modulus)
# and h itself; the selected low 128 bits plus the words at wparam(2)
# are written to the output.  The output buffer doubles as scratch for
# the h+5 candidate.
&function_begin("poly1305_emit");
    &mov    ("ebp",&wparam(0));         # context
&set_label("enter_emit");
    &mov    ("edi",&wparam(1));         # output
    &mov    ("eax",&DWP(4*0,"ebp"));    # load hash value
    &mov    ("ebx",&DWP(4*1,"ebp"));
    &mov    ("ecx",&DWP(4*2,"ebp"));
    &mov    ("edx",&DWP(4*3,"ebp"));
    &mov    ("esi",&DWP(4*4,"ebp"));

    &add    ("eax",5);                  # compare to modulus
    &adc    ("ebx",0);
    &adc    ("ecx",0);
    &adc    ("edx",0);
    &adc    ("esi",0);
    &shr    ("esi",2);                  # did it carry/borrow?
    &neg    ("esi");                    # do we choose hash-modulus?

    &and    ("eax","esi");
    &and    ("ebx","esi");
    &and    ("ecx","esi");
    &and    ("edx","esi");
    &mov    (&DWP(4*0,"edi"),"eax");
    &mov    (&DWP(4*1,"edi"),"ebx");
    &mov    (&DWP(4*2,"edi"),"ecx");
    &mov    (&DWP(4*3,"edi"),"edx");

    # The scraped text had the mis-encoded character U+00AC here in
    # place of the "&not" call; this is the mask inversion.
    &not    ("esi");                    # or original hash value?
    &mov    ("eax",&DWP(4*0,"ebp"));
    &mov    ("ebx",&DWP(4*1,"ebp"));
    &mov    ("ecx",&DWP(4*2,"ebp"));
    &mov    ("edx",&DWP(4*3,"ebp"));
    &mov    ("ebp",&wparam(2));
    &and    ("eax","esi");
    &and    ("ebx","esi");
    &and    ("ecx","esi");
    &and    ("edx","esi");
    &or     ("eax",&DWP(4*0,"edi"));
    &or     ("ebx",&DWP(4*1,"edi"));
    &or     ("ecx",&DWP(4*2,"edi"));
    &or     ("edx",&DWP(4*3,"edi"));

    &add    ("eax",&DWP(4*0,"ebp"));    # accumulate key
    &adc    ("ebx",&DWP(4*1,"ebp"));
    &adc    ("ecx",&DWP(4*2,"ebp"));
    &adc    ("edx",&DWP(4*3,"ebp"));

    &mov    (&DWP(4*0,"edi"),"eax");
    &mov    (&DWP(4*1,"edi"),"ebx");
    &mov    (&DWP(4*2,"edi"),"ecx");
    &mov    (&DWP(4*3,"edi"),"edx");
&function_end("poly1305_emit");

if ($sse2) {
########################################################################
3801cb0ef41Sopenharmony_ci# Layout of opaque area is following. 3811cb0ef41Sopenharmony_ci# 3821cb0ef41Sopenharmony_ci# unsigned __int32 h[5]; # current hash value base 2^26 3831cb0ef41Sopenharmony_ci# unsigned __int32 is_base2_26; 3841cb0ef41Sopenharmony_ci# unsigned __int32 r[4]; # key value base 2^32 3851cb0ef41Sopenharmony_ci# unsigned __int32 pad[2]; 3861cb0ef41Sopenharmony_ci# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9]; 3871cb0ef41Sopenharmony_ci# 3881cb0ef41Sopenharmony_ci# where r^n are base 2^26 digits of degrees of multiplier key. There are 3891cb0ef41Sopenharmony_ci# 5 digits, but last four are interleaved with multiples of 5, totalling 3901cb0ef41Sopenharmony_ci# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 3911cb0ef41Sopenharmony_ci 3921cb0ef41Sopenharmony_cimy ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7)); 3931cb0ef41Sopenharmony_cimy $MASK=$T2; # borrow and keep in mind 3941cb0ef41Sopenharmony_ci 3951cb0ef41Sopenharmony_ci&align (32); 3961cb0ef41Sopenharmony_ci&function_begin_B("_poly1305_init_sse2"); 3971cb0ef41Sopenharmony_ci &movdqu ($D4,&QWP(4*6,"edi")); # key base 2^32 3981cb0ef41Sopenharmony_ci &lea ("edi",&DWP(16*3,"edi")); # size optimization 3991cb0ef41Sopenharmony_ci &mov ("ebp","esp"); 4001cb0ef41Sopenharmony_ci &sub ("esp",16*(9+5)); 4011cb0ef41Sopenharmony_ci &and ("esp",-16); 4021cb0ef41Sopenharmony_ci 4031cb0ef41Sopenharmony_ci #&pand ($D4,&QWP(96,"ebx")); # magic mask 4041cb0ef41Sopenharmony_ci &movq ($MASK,&QWP(64,"ebx")); 4051cb0ef41Sopenharmony_ci 4061cb0ef41Sopenharmony_ci &movdqa ($D0,$D4); 4071cb0ef41Sopenharmony_ci &movdqa ($D1,$D4); 4081cb0ef41Sopenharmony_ci &movdqa ($D2,$D4); 4091cb0ef41Sopenharmony_ci 4101cb0ef41Sopenharmony_ci &pand ($D0,$MASK); # -> base 2^26 4111cb0ef41Sopenharmony_ci &psrlq ($D1,26); 4121cb0ef41Sopenharmony_ci &psrldq ($D2,6); 4131cb0ef41Sopenharmony_ci &pand ($D1,$MASK); 4141cb0ef41Sopenharmony_ci &movdqa ($D3,$D2); 4151cb0ef41Sopenharmony_ci &psrlq ($D2,4) 
4161cb0ef41Sopenharmony_ci &psrlq ($D3,30); 4171cb0ef41Sopenharmony_ci &pand ($D2,$MASK); 4181cb0ef41Sopenharmony_ci &pand ($D3,$MASK); 4191cb0ef41Sopenharmony_ci &psrldq ($D4,13); 4201cb0ef41Sopenharmony_ci 4211cb0ef41Sopenharmony_ci &lea ("edx",&DWP(16*9,"esp")); # size optimization 4221cb0ef41Sopenharmony_ci &mov ("ecx",2); 4231cb0ef41Sopenharmony_ci&set_label("square"); 4241cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"esp"),$D0); 4251cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"esp"),$D1); 4261cb0ef41Sopenharmony_ci &movdqa (&QWP(16*2,"esp"),$D2); 4271cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"esp"),$D3); 4281cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"esp"),$D4); 4291cb0ef41Sopenharmony_ci 4301cb0ef41Sopenharmony_ci &movdqa ($T1,$D1); 4311cb0ef41Sopenharmony_ci &movdqa ($T0,$D2); 4321cb0ef41Sopenharmony_ci &pslld ($T1,2); 4331cb0ef41Sopenharmony_ci &pslld ($T0,2); 4341cb0ef41Sopenharmony_ci &paddd ($T1,$D1); # *5 4351cb0ef41Sopenharmony_ci &paddd ($T0,$D2); # *5 4361cb0ef41Sopenharmony_ci &movdqa (&QWP(16*5,"esp"),$T1); 4371cb0ef41Sopenharmony_ci &movdqa (&QWP(16*6,"esp"),$T0); 4381cb0ef41Sopenharmony_ci &movdqa ($T1,$D3); 4391cb0ef41Sopenharmony_ci &movdqa ($T0,$D4); 4401cb0ef41Sopenharmony_ci &pslld ($T1,2); 4411cb0ef41Sopenharmony_ci &pslld ($T0,2); 4421cb0ef41Sopenharmony_ci &paddd ($T1,$D3); # *5 4431cb0ef41Sopenharmony_ci &paddd ($T0,$D4); # *5 4441cb0ef41Sopenharmony_ci &movdqa (&QWP(16*7,"esp"),$T1); 4451cb0ef41Sopenharmony_ci &movdqa (&QWP(16*8,"esp"),$T0); 4461cb0ef41Sopenharmony_ci 4471cb0ef41Sopenharmony_ci &pshufd ($T1,$D0,0b01000100); 4481cb0ef41Sopenharmony_ci &movdqa ($T0,$D1); 4491cb0ef41Sopenharmony_ci &pshufd ($D1,$D1,0b01000100); 4501cb0ef41Sopenharmony_ci &pshufd ($D2,$D2,0b01000100); 4511cb0ef41Sopenharmony_ci &pshufd ($D3,$D3,0b01000100); 4521cb0ef41Sopenharmony_ci &pshufd ($D4,$D4,0b01000100); 4531cb0ef41Sopenharmony_ci &movdqa (&QWP(16*0,"edx"),$T1); 4541cb0ef41Sopenharmony_ci &movdqa (&QWP(16*1,"edx"),$D1); 4551cb0ef41Sopenharmony_ci 
&movdqa (&QWP(16*2,"edx"),$D2); 4561cb0ef41Sopenharmony_ci &movdqa (&QWP(16*3,"edx"),$D3); 4571cb0ef41Sopenharmony_ci &movdqa (&QWP(16*4,"edx"),$D4); 4581cb0ef41Sopenharmony_ci 4591cb0ef41Sopenharmony_ci ################################################################ 4601cb0ef41Sopenharmony_ci # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 4611cb0ef41Sopenharmony_ci # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 4621cb0ef41Sopenharmony_ci # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 4631cb0ef41Sopenharmony_ci # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 4641cb0ef41Sopenharmony_ci # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 4651cb0ef41Sopenharmony_ci 4661cb0ef41Sopenharmony_ci &pmuludq ($D4,$D0); # h4*r0 4671cb0ef41Sopenharmony_ci &pmuludq ($D3,$D0); # h3*r0 4681cb0ef41Sopenharmony_ci &pmuludq ($D2,$D0); # h2*r0 4691cb0ef41Sopenharmony_ci &pmuludq ($D1,$D0); # h1*r0 4701cb0ef41Sopenharmony_ci &pmuludq ($D0,$T1); # h0*r0 4711cb0ef41Sopenharmony_ci 4721cb0ef41Sopenharmony_cisub pmuladd { 4731cb0ef41Sopenharmony_cimy $load = shift; 4741cb0ef41Sopenharmony_cimy $base = shift; $base = "esp" if (!defined($base)); 4751cb0ef41Sopenharmony_ci 4761cb0ef41Sopenharmony_ci ################################################################ 4771cb0ef41Sopenharmony_ci # As for choice to "rotate" $T0-$T2 in order to move paddq 4781cb0ef41Sopenharmony_ci # past next multiplication. While it makes code harder to read 4791cb0ef41Sopenharmony_ci # and doesn't have significant effect on most processors, it 4801cb0ef41Sopenharmony_ci # makes a lot of difference on Atom, up to 30% improvement. 
4811cb0ef41Sopenharmony_ci 4821cb0ef41Sopenharmony_ci &movdqa ($T1,$T0); 4831cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 4841cb0ef41Sopenharmony_ci &movdqa ($T2,$T1); 4851cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 4861cb0ef41Sopenharmony_ci &paddq ($D4,$T0); 4871cb0ef41Sopenharmony_ci &movdqa ($T0,$T2); 4881cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 4891cb0ef41Sopenharmony_ci &paddq ($D3,$T1); 4901cb0ef41Sopenharmony_ci &$load ($T1,5); # s1 4911cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 4921cb0ef41Sopenharmony_ci &paddq ($D2,$T2); 4931cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 4941cb0ef41Sopenharmony_ci &$load ($T2,2); # r2^n 4951cb0ef41Sopenharmony_ci &paddq ($D1,$T0); 4961cb0ef41Sopenharmony_ci 4971cb0ef41Sopenharmony_ci &movdqa ($T0,$T2); 4981cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 4991cb0ef41Sopenharmony_ci &paddq ($D0,$T1); 5001cb0ef41Sopenharmony_ci &movdqa ($T1,$T0); 5011cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 5021cb0ef41Sopenharmony_ci &paddq ($D4,$T2); 5031cb0ef41Sopenharmony_ci &$load ($T2,6); # s2^n 5041cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 5051cb0ef41Sopenharmony_ci &paddq ($D3,$T0); 5061cb0ef41Sopenharmony_ci &movdqa ($T0,$T2); 5071cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 5081cb0ef41Sopenharmony_ci &paddq ($D2,$T1); 5091cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 5101cb0ef41Sopenharmony_ci &$load ($T1,3); # r3^n 5111cb0ef41Sopenharmony_ci &paddq ($D1,$T2); 5121cb0ef41Sopenharmony_ci 5131cb0ef41Sopenharmony_ci &movdqa ($T2,$T1); 5141cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 5151cb0ef41Sopenharmony_ci &paddq ($D0,$T0); 5161cb0ef41Sopenharmony_ci &$load ($T0,7); # s3^n 5171cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 5181cb0ef41Sopenharmony_ci &paddq ($D4,$T1); 5191cb0ef41Sopenharmony_ci &movdqa 
($T1,$T0); 5201cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 5211cb0ef41Sopenharmony_ci &paddq ($D3,$T2); 5221cb0ef41Sopenharmony_ci &movdqa ($T2,$T1); 5231cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 5241cb0ef41Sopenharmony_ci &paddq ($D2,$T0); 5251cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 5261cb0ef41Sopenharmony_ci &$load ($T0,4); # r4^n 5271cb0ef41Sopenharmony_ci &paddq ($D1,$T1); 5281cb0ef41Sopenharmony_ci 5291cb0ef41Sopenharmony_ci &$load ($T1,8); # s4^n 5301cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 5311cb0ef41Sopenharmony_ci &paddq ($D0,$T2); 5321cb0ef41Sopenharmony_ci &movdqa ($T2,$T1); 5331cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 5341cb0ef41Sopenharmony_ci &paddq ($D4,$T0); 5351cb0ef41Sopenharmony_ci &movdqa ($T0,$T2); 5361cb0ef41Sopenharmony_ci &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 5371cb0ef41Sopenharmony_ci &paddq ($D3,$T1); 5381cb0ef41Sopenharmony_ci &movdqa ($T1,$T0); 5391cb0ef41Sopenharmony_ci &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 5401cb0ef41Sopenharmony_ci &paddq ($D0,$T2); 5411cb0ef41Sopenharmony_ci &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 5421cb0ef41Sopenharmony_ci &movdqa ($MASK,&QWP(64,"ebx")); 5431cb0ef41Sopenharmony_ci &paddq ($D1,$T0); 5441cb0ef41Sopenharmony_ci &paddq ($D2,$T1); 5451cb0ef41Sopenharmony_ci} 5461cb0ef41Sopenharmony_ci &pmuladd (sub { my ($reg,$i)=@_; 5471cb0ef41Sopenharmony_ci &movdqa ($reg,&QWP(16*$i,"esp")); 5481cb0ef41Sopenharmony_ci },"edx"); 5491cb0ef41Sopenharmony_ci 5501cb0ef41Sopenharmony_cisub lazy_reduction { 5511cb0ef41Sopenharmony_cimy $extra = shift; 5521cb0ef41Sopenharmony_ci 5531cb0ef41Sopenharmony_ci ################################################################ 5541cb0ef41Sopenharmony_ci # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 5551cb0ef41Sopenharmony_ci # and P. 
Schwabe 5561cb0ef41Sopenharmony_ci # 5571cb0ef41Sopenharmony_ci # [(*) see discussion in poly1305-armv4 module] 5581cb0ef41Sopenharmony_ci 5591cb0ef41Sopenharmony_ci &movdqa ($T0,$D3); 5601cb0ef41Sopenharmony_ci &pand ($D3,$MASK); 5611cb0ef41Sopenharmony_ci &psrlq ($T0,26); 5621cb0ef41Sopenharmony_ci &$extra () if (defined($extra)); 5631cb0ef41Sopenharmony_ci &paddq ($T0,$D4); # h3 -> h4 5641cb0ef41Sopenharmony_ci &movdqa ($T1,$D0); 5651cb0ef41Sopenharmony_ci &pand ($D0,$MASK); 5661cb0ef41Sopenharmony_ci &psrlq ($T1,26); 5671cb0ef41Sopenharmony_ci &movdqa ($D4,$T0); 5681cb0ef41Sopenharmony_ci &paddq ($T1,$D1); # h0 -> h1 5691cb0ef41Sopenharmony_ci &psrlq ($T0,26); 5701cb0ef41Sopenharmony_ci &pand ($D4,$MASK); 5711cb0ef41Sopenharmony_ci &movdqa ($D1,$T1); 5721cb0ef41Sopenharmony_ci &psrlq ($T1,26); 5731cb0ef41Sopenharmony_ci &paddd ($D0,$T0); # favour paddd when 5741cb0ef41Sopenharmony_ci # possible, because 5751cb0ef41Sopenharmony_ci # paddq is "broken" 5761cb0ef41Sopenharmony_ci # on Atom 5771cb0ef41Sopenharmony_ci &psllq ($T0,2); 5781cb0ef41Sopenharmony_ci &paddq ($T1,$D2); # h1 -> h2 5791cb0ef41Sopenharmony_ci &paddq ($T0,$D0); # h4 -> h0 (*) 5801cb0ef41Sopenharmony_ci &pand ($D1,$MASK); 5811cb0ef41Sopenharmony_ci &movdqa ($D2,$T1); 5821cb0ef41Sopenharmony_ci &psrlq ($T1,26); 5831cb0ef41Sopenharmony_ci &pand ($D2,$MASK); 5841cb0ef41Sopenharmony_ci &paddd ($T1,$D3); # h2 -> h3 5851cb0ef41Sopenharmony_ci &movdqa ($D0,$T0); 5861cb0ef41Sopenharmony_ci &psrlq ($T0,26); 5871cb0ef41Sopenharmony_ci &movdqa ($D3,$T1); 5881cb0ef41Sopenharmony_ci &psrlq ($T1,26); 5891cb0ef41Sopenharmony_ci &pand ($D0,$MASK); 5901cb0ef41Sopenharmony_ci &paddd ($D1,$T0); # h0 -> h1 5911cb0ef41Sopenharmony_ci &pand ($D3,$MASK); 5921cb0ef41Sopenharmony_ci &paddd ($D4,$T1); # h3 -> h4 5931cb0ef41Sopenharmony_ci} 5941cb0ef41Sopenharmony_ci &lazy_reduction (); 5951cb0ef41Sopenharmony_ci 5961cb0ef41Sopenharmony_ci &dec ("ecx"); 5971cb0ef41Sopenharmony_ci &jz (&label("square_break")); 
5981cb0ef41Sopenharmony_ci 5991cb0ef41Sopenharmony_ci &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 6001cb0ef41Sopenharmony_ci &punpcklqdq ($D1,&QWP(16*1,"esp")); 6011cb0ef41Sopenharmony_ci &punpcklqdq ($D2,&QWP(16*2,"esp")); 6021cb0ef41Sopenharmony_ci &punpcklqdq ($D3,&QWP(16*3,"esp")); 6031cb0ef41Sopenharmony_ci &punpcklqdq ($D4,&QWP(16*4,"esp")); 6041cb0ef41Sopenharmony_ci &jmp (&label("square")); 6051cb0ef41Sopenharmony_ci 6061cb0ef41Sopenharmony_ci&set_label("square_break"); 6071cb0ef41Sopenharmony_ci &psllq ($D0,32); # -> r^3:0:r^4:0 6081cb0ef41Sopenharmony_ci &psllq ($D1,32); 6091cb0ef41Sopenharmony_ci &psllq ($D2,32); 6101cb0ef41Sopenharmony_ci &psllq ($D3,32); 6111cb0ef41Sopenharmony_ci &psllq ($D4,32); 6121cb0ef41Sopenharmony_ci &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 6131cb0ef41Sopenharmony_ci &por ($D1,&QWP(16*1,"esp")); 6141cb0ef41Sopenharmony_ci &por ($D2,&QWP(16*2,"esp")); 6151cb0ef41Sopenharmony_ci &por ($D3,&QWP(16*3,"esp")); 6161cb0ef41Sopenharmony_ci &por ($D4,&QWP(16*4,"esp")); 6171cb0ef41Sopenharmony_ci 6181cb0ef41Sopenharmony_ci &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 6191cb0ef41Sopenharmony_ci &pshufd ($D1,$D1,0b10001101); 6201cb0ef41Sopenharmony_ci &pshufd ($D2,$D2,0b10001101); 6211cb0ef41Sopenharmony_ci &pshufd ($D3,$D3,0b10001101); 6221cb0ef41Sopenharmony_ci &pshufd ($D4,$D4,0b10001101); 6231cb0ef41Sopenharmony_ci 6241cb0ef41Sopenharmony_ci &movdqu (&QWP(16*0,"edi"),$D0); # save the table 6251cb0ef41Sopenharmony_ci &movdqu (&QWP(16*1,"edi"),$D1); 6261cb0ef41Sopenharmony_ci &movdqu (&QWP(16*2,"edi"),$D2); 6271cb0ef41Sopenharmony_ci &movdqu (&QWP(16*3,"edi"),$D3); 6281cb0ef41Sopenharmony_ci &movdqu (&QWP(16*4,"edi"),$D4); 6291cb0ef41Sopenharmony_ci 6301cb0ef41Sopenharmony_ci &movdqa ($T1,$D1); 6311cb0ef41Sopenharmony_ci &movdqa ($T0,$D2); 6321cb0ef41Sopenharmony_ci &pslld ($T1,2); 6331cb0ef41Sopenharmony_ci &pslld ($T0,2); 6341cb0ef41Sopenharmony_ci &paddd ($T1,$D1); # *5 6351cb0ef41Sopenharmony_ci &paddd ($T0,$D2); # 
							# *5
	&movdqu (&QWP(16*5,"edi"),$T1);
	&movdqu (&QWP(16*6,"edi"),$T0);
	&movdqa ($T1,$D3);
	&movdqa ($T0,$D4);
	&pslld ($T1,2);
	&pslld ($T0,2);
	&paddd ($T1,$D3); # *5
	&paddd ($T0,$D4); # *5
	&movdqu (&QWP(16*7,"edi"),$T1);
	&movdqu (&QWP(16*8,"edi"),$T0);

	&mov ("esp","ebp");
	&lea ("edi",&DWP(-16*3,"edi")); # size de-optimization
	&ret ();
&function_end_B("_poly1305_init_sse2");

########################################################################
# _poly1305_blocks_sse2(ctx, inp, len, padbit)
#
# Emits the SSE2 block routine. The four arguments are fetched with
# &wparam(0..3) and are labelled ctx, inp, len and padbit below.
#
# Control flow, as emitted:
#   * len is rounded down to a multiple of 16; zero goes to "nodata".
#   * len < 64 while the is_base2_26 flag (dword at ctx+4*5) is clear
#     branches to "enter_blocks", a label defined outside this function
#     (presumably the scalar code path - confirm against its owner).
#   * otherwise ebx is pointed at const_sse2 (call/pop PIC idiom) and,
#     if the flag is clear, _poly1305_init_sse2 is called, the hash is
#     converted from base 2^32 to five 26-bit limbs and the flag is set.
#   * an odd number of 16-byte blocks is fixed up by processing a single
#     block first; the rest is consumed two blocks per vector, four
#     blocks per "loop" iteration, followed by the tail code.
#   * at "done" the five hash limbs are stored back to ctx.
#
# Stack frame (esp aligned down to 16, original esp kept in ebp),
# in 16-byte slots, sized 16*(5+5+5+9+9):
#    0.. 4  esp+16*i        scratch for d0..d4 / h0..h4
#    5.. 9  esp+16*(5+i)    hash limbs saved by load_input(..,16*5)
#   10..14  eax+16*i        current input limbs (eax = esp+16*10)
#   15..23  edx+16*(i-9)    table rows with r^1:r^2 duplicated
#   24..32  edx+16*i        table rows with r^3:r^4 duplicated
#                           (edx = esp+16*(5+5+5+9))
#
# NOTE: several branches (jbe/ja/jz/cmovb) consume EFLAGS produced by an
# earlier sub/add on ecx. Only lea and SSE2 instructions, none of which
# write EFLAGS, are emitted in between, so the instruction order here
# must not be disturbed.
########################################################################
&align (32);
&function_begin("_poly1305_blocks_sse2");
	&mov ("edi",&wparam(0)); # ctx
	&mov ("esi",&wparam(1)); # inp
	&mov ("ecx",&wparam(2)); # len

	&mov ("eax",&DWP(4*5,"edi")); # is_base2_26
	&and ("ecx",-16); # whole 16-byte blocks only
	&jz (&label("nodata"));
	&cmp ("ecx",64);
	&jae (&label("enter_sse2"));
	&test ("eax","eax"); # is_base2_26?
	&jz (&label("enter_blocks")); # short input, hash still base 2^32

&set_label("enter_sse2",16);
	&call (&label("pic_point")); # position-independent address of const_sse2 -> ebx
&set_label("pic_point");
	&blindpop("ebx");
	&lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));

	&test ("eax","eax"); # is_base2_26?
	&jnz (&label("base2_26"));

	&call ("_poly1305_init_sse2");

	################################################# base 2^32 -> base 2^26
	# Overlapping unaligned dword loads at byte offsets 0,3,6,9,13 followed
	# by shifts of 0,2,4,6 and a 26-bit mask carve out the five limbs.
	&mov ("eax",&DWP(0,"edi"));
	&mov ("ecx",&DWP(3,"edi"));
	&mov ("edx",&DWP(6,"edi"));
	&mov ("esi",&DWP(9,"edi"));
	&mov ("ebp",&DWP(13,"edi"));
	&mov (&DWP(4*5,"edi"),1); # is_base2_26

	&shr ("ecx",2);
	&and ("eax",0x3ffffff);
	&shr ("edx",4);
	&and ("ecx",0x3ffffff);
	&shr ("esi",6); # 32-6 = 26 bits remain, no mask needed
	&and ("edx",0x3ffffff);

	&movd ($D0,"eax");
	&movd ($D1,"ecx");
	&movd ($D2,"edx");
	&movd ($D3,"esi");
	&movd ($D4,"ebp");

	&mov ("esi",&wparam(1)); # [reload] inp
	&mov ("ecx",&wparam(2)); # [reload] len
	&jmp (&label("base2_32"));

&set_label("base2_26",16);
	&movd ($D0,&DWP(4*0,"edi")); # load hash value
	&movd ($D1,&DWP(4*1,"edi"));
	&movd ($D2,&DWP(4*2,"edi"));
	&movd ($D3,&DWP(4*3,"edi"));
	&movd ($D4,&DWP(4*4,"edi"));
	&movdqa ($MASK,&QWP(64,"ebx"));

&set_label("base2_32");
	&mov ("eax",&wparam(3)); # padbit
	&mov ("ebp","esp"); # remember caller's esp, restored at "done"

	&sub ("esp",16*(5+5+5+9+9)); # see frame map in the header
	&and ("esp",-16); # movdqa needs 16-byte alignment

	&lea ("edi",&DWP(16*3,"edi")); # size optimization
	&shl ("eax",24); # padbit

	&test ("ecx",31); # len is a multiple of 16, so this tests for an odd block count
	&jz (&label("even"));

	################################################################
	# process single block, with SSE2, because it's still faster
	# even though half of result is discarded

	&movdqu ($T1,&QWP(0,"esi")); # input
	&lea ("esi",&DWP(16,"esi"));

	&movdqa ($T0,$T1); # -> base 2^26 ...
	&pand ($T1,$MASK);
	&paddd ($D0,$T1); # ... and accumulate

	&movdqa ($T1,$T0);
	&psrlq ($T0,26);
	&psrldq ($T1,6);
	&pand ($T0,$MASK);
	&paddd ($D1,$T0);

	&movdqa ($T0,$T1);
	&psrlq ($T1,4);
	&pand ($T1,$MASK);
	&paddd ($D2,$T1);

	&movdqa ($T1,$T0);
	&psrlq ($T0,30);
	&pand ($T0,$MASK);
	&psrldq ($T1,7);
	&paddd ($D3,$T0);

	&movd ($T0,"eax"); # padbit
	&paddd ($D4,$T1);
	&movd ($T1,&DWP(16*0+12,"edi")); # r0
	&paddd ($D4,$T0);

	&movdqa (&QWP(16*0,"esp"),$D0); # park h0..h4 in frame slots 0..4
	&movdqa (&QWP(16*1,"esp"),$D1);
	&movdqa (&QWP(16*2,"esp"),$D2);
	&movdqa (&QWP(16*3,"esp"),$D3);
	&movdqa (&QWP(16*4,"esp"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq ($D0,$T1); # h0*r0
	&pmuludq ($D1,$T1); # h1*r0
	&pmuludq ($D2,$T1); # h2*r0
	&movd ($T0,&DWP(16*1+12,"edi")); # r1
	&pmuludq ($D3,$T1); # h3*r0
	&pmuludq ($D4,$T1); # h4*r0

	# remaining products; the callback loads table entry $i into $reg
	&pmuladd (sub { my ($reg,$i)=@_;
	&movd ($reg,&DWP(16*$i+12,"edi"));
	});

	&lazy_reduction ();

	&sub ("ecx",16);
	&jz (&label("done")); # that single block was all there was

&set_label("even");
	&lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
	&lea ("eax",&DWP(-16*2,"esi"));
	&sub ("ecx",64); # flags are consumed by cmovb below and by jbe after load_input

	################################################################
	# expand and copy pre-calculated table to stack

	&movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4
	&pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
	&cmovb ("esi","eax"); # fewer than 64 bytes left: step inp back by 32
	&pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2
	&movdqa (&QWP(16*0,"edx"),$T1);
	&lea ("eax",&DWP(16*10,"esp")); # eax -> input-limb slots
	&movdqu ($T1,&QWP(16*1,"edi"));
	&movdqa (&QWP(16*(0-9),"edx"),$T0);
	&pshufd ($T0,$T1,0b01000100);
	&pshufd ($T1,$T1,0b11101110);
	&movdqa (&QWP(16*1,"edx"),$T0);
	&movdqu ($T0,&QWP(16*2,"edi"));
	&movdqa (&QWP(16*(1-9),"edx"),$T1);
	&pshufd ($T1,$T0,0b01000100);
	&pshufd ($T0,$T0,0b11101110);
	&movdqa (&QWP(16*2,"edx"),$T1);
	&movdqu ($T1,&QWP(16*3,"edi"));
	&movdqa (&QWP(16*(2-9),"edx"),$T0);
	&pshufd ($T0,$T1,0b01000100);
	&pshufd ($T1,$T1,0b11101110);
	&movdqa (&QWP(16*3,"edx"),$T0);
	&movdqu ($T0,&QWP(16*4,"edi"));
	&movdqa (&QWP(16*(3-9),"edx"),$T1);
	&pshufd ($T1,$T0,0b01000100);
	&pshufd ($T0,$T0,0b11101110);
	&movdqa (&QWP(16*4,"edx"),$T1);
	&movdqu ($T1,&QWP(16*5,"edi"));
	&movdqa (&QWP(16*(4-9),"edx"),$T0);
	&pshufd ($T0,$T1,0b01000100);
	&pshufd ($T1,$T1,0b11101110);
	&movdqa (&QWP(16*5,"edx"),$T0);
	&movdqu ($T0,&QWP(16*6,"edi"));
	&movdqa (&QWP(16*(5-9),"edx"),$T1);
	&pshufd ($T1,$T0,0b01000100);
	&pshufd ($T0,$T0,0b11101110);
	&movdqa (&QWP(16*6,"edx"),$T1);
	&movdqu ($T1,&QWP(16*7,"edi"));
	&movdqa (&QWP(16*(6-9),"edx"),$T0);
	&pshufd ($T0,$T1,0b01000100);
	&pshufd ($T1,$T1,0b11101110);
	&movdqa (&QWP(16*7,"edx"),$T0);
	&movdqu ($T0,&QWP(16*8,"edi"));
	&movdqa (&QWP(16*(7-9),"edx"),$T1);
	&pshufd ($T1,$T0,0b01000100);
	&pshufd ($T0,$T0,0b11101110);
	&movdqa (&QWP(16*8,"edx"),$T1);
	&movdqa (&QWP(16*(8-9),"edx"),$T0);

# load_input($inpbase,$offbase)
#
# Emits code that loads two consecutive 16-byte blocks from
# esi+$inpbase, advances esi by 32 and splits them into 26-bit limbs,
# one block per 64-bit lane: limbs 0,1 end up in $T0,$T1 and limbs
# 2,3,4 in $D2,$D3,$D4. The constant at (ebx) is OR-ed into limb 4 as
# the pad bit. Before they are overwritten, $D2..$D4 are saved to
# esp+$offbase+16*(2..4); $D0 and $D1 are saved to slots 0 and 1 as well,
# but only when $offbase is non-zero. No EFLAGS-modifying instruction
# is emitted, so a conditional jump may follow the call directly.
sub load_input {
my ($inpbase,$offbase)=@_;

	&movdqu ($T0,&QWP($inpbase+0,"esi")); # load input
	&movdqu ($T1,&QWP($inpbase+16,"esi"));
	&lea ("esi",&DWP(16*2,"esi"));

	&movdqa (&QWP($offbase+16*2,"esp"),$D2);
	&movdqa (&QWP($offbase+16*3,"esp"),$D3);
	&movdqa (&QWP($offbase+16*4,"esp"),$D4);

	&movdqa ($D2,$T0); # splat input
	&movdqa ($D3,$T1);
	&psrldq ($D2,6);
	&psrldq ($D3,6);
	&movdqa ($D4,$T0);
	&punpcklqdq ($D2,$D3); # 2:3
	&punpckhqdq ($D4,$T1); # 4
	&punpcklqdq ($T0,$T1); # 0:1

	&movdqa ($D3,$D2);
	&psrlq ($D2,4);
	&psrlq ($D3,30);
	&movdqa ($T1,$T0);
	&psrlq ($D4,40); # 4
	&psrlq ($T1,26);
	&pand ($T0,$MASK); # 0
	&pand ($T1,$MASK); # 1
	&pand ($D2,$MASK); # 2
	&pand ($D3,$MASK); # 3
	&por ($D4,&QWP(0,"ebx")); # padbit, yes, always

	&movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase);
	&movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase);
}
	&load_input (16*2,16*5);

	&jbe (&label("skip_loop")); # flags still from "sub ecx,64" above
	&jmp (&label("loop"));

&set_label("loop",32);
	################################################################
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	#   \___________________/
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	#   \___________________/ \____________________/
	################################################################

	&movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2
	&movdqa (&QWP(16*1,"eax"),$T1); # stash limbs 1..4; pmuladd_alt pulls them back
	&movdqa (&QWP(16*2,"eax"),$D2);
	&movdqa (&QWP(16*3,"eax"),$D3);
	&movdqa (&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
	# d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
	# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
	# d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
	# d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	&movdqa ($D1,$T0);
	&pmuludq ($T0,$T2); # h0*r0
	&movdqa ($D0,$T1);
	&pmuludq ($T1,$T2); # h1*r0
	&pmuludq ($D2,$T2); # h2*r0
	&pmuludq ($D3,$T2); # h3*r0
	&pmuludq ($D4,$T2); # h4*r0

# pmuladd_alt($addr)
#
# Emits the remaining twenty multiply-accumulate steps of the product
# whose five h*r0 terms the caller has already formed. $addr is a code
# reference mapping a table index to a memory operand; indices 1..4
# are used for r1..r4 and 5..8 for s1..s4, the entries pre-multiplied
# by 5. On entry $D0 and $D1 hold copies of h1 and h0, $T0/$T1 hold
# h0*r0 and h1*r0, and $D2..$D4 hold h2*r0..h4*r0; h1..h4 are re-read
# from eax+16*(1..4). Results accumulate into $D0..$D4, and $MASK is
# reloaded from ebx+64 near the end.
sub pmuladd_alt {
my $addr = shift;

	&pmuludq ($D0,&$addr(8)); # h1*s4
	&movdqa ($T2,$D1);
	&pmuludq ($D1,&$addr(1)); # h0*r1
	&paddq ($D0,$T0);
	&movdqa ($T0,$T2);
	&pmuludq ($T2,&$addr(2)); # h0*r2
	&paddq ($D1,$T1);
	&movdqa ($T1,$T0);
	&pmuludq ($T0,&$addr(3)); # h0*r3
	&paddq ($D2,$T2);
	&movdqa ($T2,&QWP(16*1,"eax")); # pull h1
	&pmuludq ($T1,&$addr(4)); # h0*r4
	&paddq ($D3,$T0);

	&movdqa ($T0,$T2);
	&pmuludq ($T2,&$addr(1)); # h1*r1
	&paddq ($D4,$T1);
	&movdqa ($T1,$T0);
	&pmuludq ($T0,&$addr(2)); # h1*r2
	&paddq ($D2,$T2);
	&movdqa ($T2,&QWP(16*2,"eax")); # pull h2
	&pmuludq ($T1,&$addr(3)); # h1*r3
	&paddq ($D3,$T0);
	&movdqa ($T0,$T2);
	&pmuludq ($T2,&$addr(7)); # h2*s3
	&paddq ($D4,$T1);
	&movdqa ($T1,$T0);
	&pmuludq ($T0,&$addr(8)); # h2*s4
	&paddq ($D0,$T2);

	&movdqa ($T2,$T1);
	&pmuludq ($T1,&$addr(1)); # h2*r1
	&paddq ($D1,$T0);
	&movdqa ($T0,&QWP(16*3,"eax")); # pull h3
	&pmuludq ($T2,&$addr(2)); # h2*r2
	&paddq ($D3,$T1);
	&movdqa ($T1,$T0);
	&pmuludq ($T0,&$addr(6)); # h3*s2
	&paddq ($D4,$T2);
	&movdqa ($T2,$T1);
	&pmuludq ($T1,&$addr(7)); # h3*s3
	&paddq ($D0,$T0);
	&movdqa ($T0,$T2);
	&pmuludq ($T2,&$addr(8)); # h3*s4
	&paddq ($D1,$T1);

	&movdqa ($T1,&QWP(16*4,"eax")); # pull h4
	&pmuludq ($T0,&$addr(1)); # h3*r1
	&paddq ($D2,$T2);
	&movdqa ($T2,$T1);
	&pmuludq ($T1,&$addr(8)); # h4*s4
	&paddq ($D4,$T0);
	&movdqa ($T0,$T2);
	&pmuludq ($T2,&$addr(5)); # h4*s1
	&paddq ($D3,$T1);
	&movdqa ($T1,$T0);
	&pmuludq ($T0,&$addr(6)); # h4*s2
	&paddq ($D0,$T2);
	&movdqa ($MASK,&QWP(64,"ebx"));
	&pmuludq ($T1,&$addr(7)); # h4*s3
	&paddq ($D1,$T0);
	&paddq ($D2,$T1);
}
	# first half of the iteration: table rows below edx (r^1:r^2 copies)
	&pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); });

	&load_input (-16*2,0);
	&lea ("eax",&DWP(-16*2,"esi"));
	&sub ("ecx",64); # flags are consumed by cmovb below and by ja at the loop end

	&paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value
	&paddd ($T1,&QWP(16*(5+1),"esp"));
	&paddd ($D2,&QWP(16*(5+2),"esp"));
	&paddd ($D3,&QWP(16*(5+3),"esp"));
	&paddd ($D4,&QWP(16*(5+4),"esp"));

	&cmovb ("esi","eax");
	&lea ("eax",&DWP(16*10,"esp"));

	&movdqa ($T2,&QWP(16*0,"edx")); # r0^4
	&movdqa (&QWP(16*1,"esp"),$D1);
	&movdqa (&QWP(16*1,"eax"),$T1);
	&movdqa (&QWP(16*2,"eax"),$D2);
	&movdqa (&QWP(16*3,"eax"),$D3);
	&movdqa (&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
	# d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
	# d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
	# d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
	# d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	&movdqa ($D1,$T0);
	&pmuludq ($T0,$T2); # h0*r0
	&paddq ($T0,$D0);
	&movdqa ($D0,$T1);
	&pmuludq ($T1,$T2); # h1*r0
	&pmuludq ($D2,$T2); # h2*r0
	&pmuludq ($D3,$T2); # h3*r0
	&pmuludq ($D4,$T2); # h4*r0

	&paddq ($T1,&QWP(16*1,"esp"));
	&paddq ($D2,&QWP(16*2,"esp"));
	&paddq ($D3,&QWP(16*3,"esp"));
	&paddq ($D4,&QWP(16*4,"esp"));

	# second half: table rows at and above edx (r^3:r^4 copies)
	&pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); });

	&lazy_reduction ();

	&load_input (16*2,16*5);

	&ja (&label("loop")); # flags still from "sub ecx,64" in this iteration

&set_label("skip_loop");
	################################################################
	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	&pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
	&add ("ecx",32); # flags are consumed by jnz here and by "jz short_tail" below
	&jnz (&label("long_tail"));

	&paddd ($T0,$D0); # add hash value
	&paddd ($T1,$D1);
	&paddd ($D2,&QWP(16*7,"esp"));
	&paddd ($D3,&QWP(16*8,"esp"));
	&paddd ($D4,&QWP(16*9,"esp"));

&set_label("long_tail");

	&movdqa (&QWP(16*0,"eax"),$T0);
	&movdqa (&QWP(16*1,"eax"),$T1);
	&movdqa (&QWP(16*2,"eax"),$D2);
	&movdqa (&QWP(16*3,"eax"),$D3);
	&movdqa (&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq ($T0,$T2); # h0*r0
	&pmuludq ($T1,$T2); # h1*r0
	&pmuludq ($D2,$T2); # h2*r0
	&movdqa ($D0,$T0);
	&pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
	&pmuludq ($D3,$T2); # h3*r0
	&movdqa ($D1,$T1);
	&pmuludq ($D4,$T2); # h4*r0

	# remaining products; h limbs are read back from the slots at eax
	&pmuladd (sub { my ($reg,$i)=@_;
	&pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
	},"eax");

	&jz (&label("short_tail")); # flags still from "add ecx,32" above

	&load_input (-16*2,0);

	&pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n
	&paddd ($T0,&QWP(16*5,"esp")); # add hash value
	&paddd ($T1,&QWP(16*6,"esp"));
	&paddd ($D2,&QWP(16*7,"esp"));
	&paddd ($D3,&QWP(16*8,"esp"));
	&paddd ($D4,&QWP(16*9,"esp"));

	################################################################
	# multiply inp[0:1] by r^4:r^3 and accumulate

	&movdqa (&QWP(16*0,"esp"),$T0);
	&pmuludq ($T0,$T2); # h0*r0
	&movdqa (&QWP(16*1,"esp"),$T1);
	&pmuludq ($T1,$T2); # h1*r0
	&paddq ($D0,$T0);
	&movdqa ($T0,$D2);
	&pmuludq ($D2,$T2); # h2*r0
	&paddq ($D1,$T1);
	&movdqa ($T1,$D3);
	&pmuludq ($D3,$T2); # h3*r0
	&paddq ($D2,&QWP(16*2,"esp"));
	&movdqa (&QWP(16*2,"esp"),$T0);
	&pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n
	&paddq ($D3,&QWP(16*3,"esp"));
	&movdqa (&QWP(16*3,"esp"),$T1);
	&movdqa ($T1,$D4);
	&pmuludq ($D4,$T2); # h4*r0
	&paddq ($D4,&QWP(16*4,"esp"));
	&movdqa (&QWP(16*4,"esp"),$T1);

	&pmuladd (sub { my ($reg,$i)=@_;
	&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
	});

&set_label("short_tail");

	################################################################
	# horizontal addition

	# 0b01001110 swaps the two 64-bit halves, so each paddq folds the
	# upper lane into the lower one
	&pshufd ($T1,$D4,0b01001110);
	&pshufd ($T0,$D3,0b01001110);
	&paddq ($D4,$T1);
	&paddq ($D3,$T0);
	&pshufd ($T1,$D0,0b01001110);
	&pshufd ($T0,$D1,0b01001110);
	&paddq ($D0,$T1);
	&paddq ($D1,$T0);
	&pshufd ($T1,$D2,0b01001110);
	#&paddq ($D2,$T1);

	# the pending $D2 fold is passed in as lazy_reduction's $extra step
	&lazy_reduction (sub { &paddq ($D2,$T1) });

&set_label("done");
	# edi was advanced by 16*3 after "base2_32", hence the -16*3 bias
	&movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
	&movd (&DWP(-16*3+4*1,"edi"),$D1);
	&movd (&DWP(-16*3+4*2,"edi"),$D2);
	&movd (&DWP(-16*3+4*3,"edi"),$D3);
	&movd (&DWP(-16*3+4*4,"edi"),$D4);
	&mov ("esp","ebp"); # drop the aligned frame
&set_label("nodata");
&function_end("_poly1305_blocks_sse2");

&align (32);
########################################################################
# _poly1305_emit_sse2(context, output, key)
#
# Emits the finalisation routine for a hash kept in base 2^26. The
# three arguments are fetched with &wparam(0..2).
#
#   1. If the is_base2_26 flag (dword at context+4*5) is zero, control
#      goes to "enter_emit", a label defined outside this function
#      (presumably the base 2^32 emit path - confirm against its owner).
#   2. The five 26-bit limbs are repacked into eax:ebx:ecx:edx plus the
#      top bits in esi, propagating carries with add/adc.
#   3. Bits above 2^130 are folded back as *5 (final reduction).
#   4. hash+5 is computed; a carry into bit 130 means hash >= modulus.
#      The result is chosen without branching: esi becomes an all-ones
#      or all-zero mask, (hash+5)&mask is parked in the output buffer,
#      the original hash is restored from $D0..$D3 and masked with the
#      inverted mask, and the two are OR-ed together.
#   5. The four dwords at key are added with carry and the 16-byte
#      result is written to output.
########################################################################
&function_begin("_poly1305_emit_sse2");
	&mov ("ebp",&wparam(0)); # context

	&cmp (&DWP(4*5,"ebp"),0); # is_base2_26?
	&je (&label("enter_emit"));

	&mov ("eax",&DWP(4*0,"ebp")); # load hash value
	&mov ("edi",&DWP(4*1,"ebp"));
	&mov ("ecx",&DWP(4*2,"ebp"));
	&mov ("edx",&DWP(4*3,"ebp"));
	&mov ("esi",&DWP(4*4,"ebp"));

	&mov ("ebx","edi"); # base 2^26 -> base 2^32
	&shl ("edi",26);
	&shr ("ebx",6);
	&add ("eax","edi");
	&mov ("edi","ecx"); # mov leaves CF intact for the adc below
	&adc ("ebx",0);

	&shl ("edi",20);
	&shr ("ecx",12);
	&add ("ebx","edi");
	&mov ("edi","edx");
	&adc ("ecx",0);

	&shl ("edi",14);
	&shr ("edx",18);
	&add ("ecx","edi");
	&mov ("edi","esi");
	&adc ("edx",0);

	&shl ("edi",8);
	&shr ("esi",24);
	&add ("edx","edi");
	&adc ("esi",0); # can be partially reduced

	&mov ("edi","esi"); # final reduction
	&and ("esi",3); # keep bits 128..129
	&shr ("edi",2); # everything from bit 130 up ...
	&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
	&mov ("edi",&wparam(1)); # output
	&add ("eax","ebp"); # ... is folded back in
	&mov ("ebp",&wparam(2)); # key
	&adc ("ebx",0);
	&adc ("ecx",0);
	&adc ("edx",0);
	&adc ("esi",0);

	&movd ($D0,"eax"); # offload original hash value
	&add ("eax",5); # compare to modulus
	&movd ($D1,"ebx"); # movd leaves CF intact between the adc steps
	&adc ("ebx",0);
	&movd ($D2,"ecx");
	&adc ("ecx",0);
	&movd ($D3,"edx");
	&adc ("edx",0);
	&adc ("esi",0);
	&shr ("esi",2); # did it carry/borrow?

	&neg ("esi"); # do we choose (hash-modulus) ...
	&and ("eax","esi");
	&and ("ebx","esi");
	&and ("ecx","esi");
	&and ("edx","esi");
	&mov (&DWP(4*0,"edi"),"eax"); # park masked candidate in the output buffer
	&movd ("eax",$D0);
	&mov (&DWP(4*1,"edi"),"ebx");
	&movd ("ebx",$D1);
	&mov (&DWP(4*2,"edi"),"ecx");
	&movd ("ecx",$D2);
	&mov (&DWP(4*3,"edi"),"edx");
	&movd ("edx",$D3);

	# The call below had been mangled into the single character U+00AC
	# (an "&not" HTML entity swallowed by extraction). Without it the
	# mask is never inverted and both candidates are selected by the
	# same mask.
	&not ("esi"); # ... or original hash value?
	&and ("eax","esi");
	&and ("ebx","esi");
	&or ("eax",&DWP(4*0,"edi"));
	&and ("ecx","esi");
	&or ("ebx",&DWP(4*1,"edi"));
	&and ("edx","esi");
	&or ("ecx",&DWP(4*2,"edi"));
	&or ("edx",&DWP(4*3,"edi"));

	&add ("eax",&DWP(4*0,"ebp")); # accumulate key
	&adc ("ebx",&DWP(4*1,"ebp"));
	&mov (&DWP(4*0,"edi"),"eax"); # stores are interleaved; mov keeps CF alive
	&adc ("ecx",&DWP(4*2,"ebp"));
	&mov (&DWP(4*1,"edi"),"ebx");
	&adc ("edx",&DWP(4*3,"ebp"));
	&mov (&DWP(4*2,"edi"),"ecx");
	&mov (&DWP(4*3,"edi"),"edx");
&function_end("_poly1305_emit_sse2");

if ($avx>1) {
########################################################################
# Note that poly1305_init_avx2 operates on %xmm, I could have used
# poly1305_init_sse2...
# _poly1305_init_avx2 -- compute the table of key powers in base 2^26.
#
# In:	edi	context pointer; the key is loaded from 4*6(edi)
#	ebx	pointer to const_sse2 (set up by the caller before the call);
#		the 26-bit limb mask is read from 64(ebx)
#
# The key is split into five 26-bit limbs and then run through the
# "square" loop twice (ecx=2).  Per the lane annotations below, the first
# pass yields r^2 next to r^1 and the second pass yields r^3 and r^4, so
# that every register finally holds r^1:r^2:r^3:r^4 for one limb.
#
# Scratch frame (esp is aligned down to 16 bytes, the old esp is kept in ebp):
#	16*0..16*4(esp)	r0..r4, the multiplier limbs for this pass
#	16*5..16*8(esp)	s1..s4, i.e. 5*r1..5*r4
#	16*0..16*4(edx)	h0..h4, the multiplicand limbs (edx = esp+16*9)
#
# This part covers the prologue through the final lane shuffle; the table
# store and the return follow immediately after it.

&align	(32);
&function_begin_B("_poly1305_init_avx2");
	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");			# remember caller's esp
	&sub		("esp",16*(9+5));		# 9 r/s slots + 5 h slots
	&and		("esp",-16);			# vmovdqa needs 16-byte alignment

	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
	&vmovdqa	($MASK,&QWP(64,"ebx"));

	&vpand		($D0,$D4,$MASK);		# -> base 2^26
	&vpsrlq		($D1,$D4,26);
	&vpsrldq	($D3,$D4,6);
	&vpand		($D1,$D1,$MASK);
	&vpsrlq		($D2,$D3,4);
	&vpsrlq		($D3,$D3,30);
	&vpand		($D2,$D2,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpsrldq	($D4,$D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);			# two passes through "square"
&set_label("square");
	# save the current limbs as the multiplier r0..r4
	&vmovdqa	(&QWP(16*0,"esp"),$D0);
	&vmovdqa	(&QWP(16*1,"esp"),$D1);
	&vmovdqa	(&QWP(16*2,"esp"),$D2);
	&vmovdqa	(&QWP(16*3,"esp"),$D3);
	&vmovdqa	(&QWP(16*4,"esp"),$D4);

	# s1..s4 = 5*r1..5*r4, computed as (x<<2)+x
	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqa	(&QWP(16*5,"esp"),$T1);
	&vmovdqa	(&QWP(16*6,"esp"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqa	(&QWP(16*7,"esp"),$T1);
	&vmovdqa	(&QWP(16*8,"esp"),$T0);

	# 0b01000100 copies the low qword into both halves; the result is
	# stored as the multiplicand h0..h4
	&vpshufd	($T0,$D0,0b01000100);
	&vmovdqa	($T1,$D1);			# keep unshuffled r1
	&vpshufd	($D1,$D1,0b01000100);
	&vpshufd	($D2,$D2,0b01000100);
	&vpshufd	($D3,$D3,0b01000100);
	&vpshufd	($D4,$D4,0b01000100);
	&vmovdqa	(&QWP(16*0,"edx"),$T0);
	&vmovdqa	(&QWP(16*1,"edx"),$D1);
	&vmovdqa	(&QWP(16*2,"edx"),$D2);
	&vmovdqa	(&QWP(16*3,"edx"),$D3);
	&vmovdqa	(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&vpmuludq	($D4,$D4,$D0);			# h4*r0
	&vpmuludq	($D3,$D3,$D0);			# h3*r0
	&vpmuludq	($D2,$D2,$D0);			# h2*r0
	&vpmuludq	($D1,$D1,$D0);			# h1*r0
	&vpmuludq	($D0,$T0,$D0);			# h0*r0

	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
	&vpaddq		($D4,$D4,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*2,"edx"));	# r1*h2
	&vpaddq		($D3,$D3,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*1,"edx"));	# r1*h1
	&vpaddq		($D2,$D2,$T0);
	&vmovdqa	($T2,&QWP(16*5,"esp"));		# s1
	&vpmuludq	($T1,$T1,&QWP(16*0,"edx"));	# r1*h0
	&vpaddq		($D1,$D1,$T1);
	&vmovdqa	($T0,&QWP(16*2,"esp"));		# r2
	&vpmuludq	($T2,$T2,&QWP(16*4,"edx"));	# s1*h4
	&vpaddq		($D0,$D0,$T2);

	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# r2*h2
	&vpaddq		($D4,$D4,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r2*h1
	&vpaddq		($D3,$D3,$T2);
	&vmovdqa	($T1,&QWP(16*6,"esp"));		# s2
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r2*h0
	&vpaddq		($D2,$D2,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s2*h4
	&vpaddq		($D1,$D1,$T2);
	&vmovdqa	($T0,&QWP(16*3,"esp"));		# r3
	&vpmuludq	($T1,$T1,&QWP(16*3,"edx"));	# s2*h3
	&vpaddq		($D0,$D0,$T1);

	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r3*h1
	&vpaddq		($D4,$D4,$T2);
	&vmovdqa	($T1,&QWP(16*7,"esp"));		# s3
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r3*h0
	&vpaddq		($D3,$D3,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s3*h4
	&vpaddq		($D2,$D2,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# s3*h3
	&vpaddq		($D1,$D1,$T0);
	&vmovdqa	($T2,&QWP(16*4,"esp"));		# r4
	&vpmuludq	($T1,$T1,&QWP(16*2,"edx"));	# s3*h2
	&vpaddq		($D0,$D0,$T1);

	&vmovdqa	($T0,&QWP(16*8,"esp"));		# s4
	&vpmuludq	($T2,$T2,&QWP(16*0,"edx"));	# r4*h0
	&vpaddq		($D4,$D4,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*4,"edx"));	# s4*h4
	&vpaddq		($D3,$D3,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# s4*h1
	&vpaddq		($D0,$D0,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# s4*h2
	&vpaddq		($D1,$D1,$T1);
	&vmovdqa	($MASK,&QWP(64,"ebx"));		# reload mask; same register as $T2
							# when $MASK aliases $T2 (as in the
							# %ymm naming) -- TODO confirm for %xmm
	&vpmuludq	($T0,$T0,&QWP(16*3,"edx"));	# s4*h3
	&vpaddq		($D2,$D2,$T0);

	################################################################
	# lazy reduction
	&vpsrlq		($T0,$D3,26);
	&vpand		($D3,$D3,$MASK);
	&vpsrlq		($T1,$D0,26);
	&vpand		($D0,$D0,$MASK);
	&vpaddq		($D4,$D4,$T0);			# h3 -> h4
	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
	&vpsrlq		($T0,$D4,26);
	&vpand		($D4,$D4,$MASK);
	&vpsrlq		($T1,$D1,26);
	&vpand		($D1,$D1,$MASK);
	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	&vpaddd		($D0,$D0,$T0);			# carry*1 ...
	&vpsllq		($T0,$T0,2);
	&vpsrlq		($T1,$D2,26);
	&vpand		($D2,$D2,$MASK);
	&vpaddd		($D0,$D0,$T0);			# h4 -> h0 (... plus carry*4 = carry*5)
	&vpaddd		($D3,$D3,$T1);			# h2 -> h3
	&vpsrlq		($T1,$D3,26);
	&vpsrlq		($T0,$D0,26);
	&vpand		($D0,$D0,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpaddd		($D1,$D1,$T0);			# h0 -> h1
	&vpaddd		($D4,$D4,$T1);			# h3 -> h4

	&dec		("ecx");
	&jz		(&label("square_break"));	# second pass done

	# after the first pass: pair the new power with the saved one
	&vpunpcklqdq	($D0,$D0,&QWP(16*0,"esp"));	# 0:r^1:0:r^2
	&vpunpcklqdq	($D1,$D1,&QWP(16*1,"esp"));
	&vpunpcklqdq	($D2,$D2,&QWP(16*2,"esp"));
	&vpunpcklqdq	($D3,$D3,&QWP(16*3,"esp"));
	&vpunpcklqdq	($D4,$D4,&QWP(16*4,"esp"));
	&jmp		(&label("square"));

&set_label("square_break");
	&vpsllq		($D0,$D0,32);			# -> r^3:0:r^4:0
	&vpsllq		($D1,$D1,32);
	&vpsllq		($D2,$D2,32);
	&vpsllq		($D3,$D3,32);
	&vpsllq		($D4,$D4,32);
	&vpor		($D0,$D0,&QWP(16*0,"esp"));	# r^3:r^1:r^4:r^2
	&vpor		($D1,$D1,&QWP(16*1,"esp"));
	&vpor		($D2,$D2,&QWP(16*2,"esp"));
	&vpor		($D3,$D3,&QWP(16*3,"esp"));
	&vpor		($D4,$D4,&QWP(16*4,"esp"));

	&vpshufd	($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
	&vpshufd	($D1,$D1,0b10001101);
	&vpshufd	($D2,$D2,0b10001101);
	&vpshufd	($D3,$D3,0b10001101);
	&vpshufd	($D4,$D4,0b10001101);
# _poly1305_init_avx2 epilogue: write the table of key powers behind the
# context and return.  edi was advanced by 16*3 in the prologue, so slot i
# below is at 16*(3+i) relative to the context pointer:
#	slots 0..4	limbs 0..4 of r^1:r^2:r^3:r^4
#	slots 5..8	5 * limbs 1..4 (computed as (x<<2)+x)

	&vmovdqu	(&QWP(16*0,"edi"),$D0);		# save the table
	&vmovdqu	(&QWP(16*1,"edi"),$D1);
	&vmovdqu	(&QWP(16*2,"edi"),$D2);
	&vmovdqu	(&QWP(16*3,"edi"),$D3);
	&vmovdqu	(&QWP(16*4,"edi"),$D4);

	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqu	(&QWP(16*5,"edi"),$T1);
	&vmovdqu	(&QWP(16*6,"edi"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqu	(&QWP(16*7,"edi"),$T1);
	&vmovdqu	(&QWP(16*8,"edi"),$T0);

	&mov		("esp","ebp");			# drop the scratch frame
	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
	&ret		();
&function_end_B("_poly1305_init_avx2");

########################################################################
# now it's time to switch to %ymm

# From here on the register names refer to ymm0..ymm7; $MASK shares a
# register with $T2.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
my $MASK=$T2;

# X(reg): return the %xmm name corresponding to a %ymm register name
# (the argument is returned unchanged if it does not start with "ymm").
sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }

# _poly1305_blocks_avx2(ctx, inp, len, padbit)
#
# Prologue: load the arguments, pick the code path, make sure the hash is
# in base 2^26 and set up the stack frame.
#   - len is rounded down to a multiple of 16; nothing to do if that is 0
#   - inputs shorter than 64 bytes with the hash still in base 2^32 are
#     handed to the "enter_blocks" label, which is defined outside this
#     section
#   - ebx is pointed at const_sse2 in a position-independent way
#   - if the hash is not yet in base 2^26, the power table is built by
#     _poly1305_init_avx2 and the hash is converted in place

&align	(32);
&function_begin("_poly1305_blocks_avx2");
	&mov	("edi",&wparam(0));			# ctx
	&mov	("esi",&wparam(1));			# inp
	&mov	("ecx",&wparam(2));			# len

	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
	&and	("ecx",-16);
	&jz	(&label("nodata"));
	&cmp	("ecx",64);
	&jae	(&label("enter_avx2"));
	&test	("eax","eax");				# is_base2_26?
	&jz	(&label("enter_blocks"));

&set_label("enter_avx2");
	&vzeroupper	();

	# call/pop yields the address of pic_point, from which the address
	# of const_sse2 is derived
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));

	&test	("eax","eax");				# is_base2_26?
	&jnz	(&label("base2_26"));

	&call	("_poly1305_init_avx2");

	################################################# base 2^32 -> base 2^26
	# overlapping dword loads at byte offsets 0,3,6,9,13, followed by
	# shifts of 0,2,4,6,0 bits, line each value up on a 26-bit limb
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ecx",&DWP(3,"edi"));
	&mov	("edx",&DWP(6,"edi"));
	&mov	("esi",&DWP(9,"edi"));
	&mov	("ebp",&DWP(13,"edi"));

	&shr	("ecx",2);
	&and	("eax",0x3ffffff);
	&shr	("edx",4);
	&and	("ecx",0x3ffffff);
	&shr	("esi",6);				# 26 bits remain, no mask needed
	&and	("edx",0x3ffffff);

	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ecx");
	&mov	(&DWP(4*2,"edi"),"edx");
	&mov	(&DWP(4*3,"edi"),"esi");
	&mov	(&DWP(4*4,"edi"),"ebp");
	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26

	&mov	("esi",&wparam(1));			# [reload] inp
	&mov	("ecx",&wparam(2));			# [reload] len

&set_label("base2_26");
	&mov	("eax",&wparam(3));			# padbit
	&mov	("ebp","esp");				# remember caller's esp

	&sub	("esp",32*(5+9));
	&and	("esp",-512);				# ensure that frame
							# doesn't cross page
							# boundary, which is
							# essential for
							# misaligned 32-byte
							# loads
# _poly1305_blocks_avx2, continued: copy the power table to the stack,
# load the hash, dispatch on the input length and emit the main loop body.
#
# Stack layout used below (edx = esp+32*5+128, so 32*i-128(edx) is
# 32*(5+i)(esp)):
#	32*0..32*4(esp)		scratch for hash limbs (see vsplat_input/vpmuladd)
#	32*i-128(edx), i=0..8	expanded table entry i (r0..r4, s1..s4)

	################################################################
	# expand and copy pre-calculated table to stack

	&vmovdqu	(&X($D0),&QWP(16*(3+0),"edi"));
	&lea		("edx",&DWP(32*5+128,"esp"));	# +128 size optimization
	&vmovdqu	(&X($D1),&QWP(16*(3+1),"edi"));
	&vmovdqu	(&X($D2),&QWP(16*(3+2),"edi"));
	&vmovdqu	(&X($D3),&QWP(16*(3+3),"edi"));
	&vmovdqu	(&X($D4),&QWP(16*(3+4),"edi"));
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&vpermq		($D0,$D0,0b01000000);		# 00001234 -> 12343434
	&vpermq		($D1,$D1,0b01000000);
	&vpermq		($D2,$D2,0b01000000);
	&vpermq		($D3,$D3,0b01000000);
	&vpermq		($D4,$D4,0b01000000);
	&vpshufd	($D0,$D0,0b11001000);		# 12343434 -> 14243444
	&vpshufd	($D1,$D1,0b11001000);
	&vpshufd	($D2,$D2,0b11001000);
	&vpshufd	($D3,$D3,0b11001000);
	&vpshufd	($D4,$D4,0b11001000);
	&vmovdqa	(&QWP(32*0-128,"edx"),$D0);
	&vmovdqu	(&X($D0),&QWP(16*5,"edi"));
	&vmovdqa	(&QWP(32*1-128,"edx"),$D1);
	&vmovdqu	(&X($D1),&QWP(16*6,"edi"));
	&vmovdqa	(&QWP(32*2-128,"edx"),$D2);
	&vmovdqu	(&X($D2),&QWP(16*7,"edi"));
	&vmovdqa	(&QWP(32*3-128,"edx"),$D3);
	&vmovdqu	(&X($D3),&QWP(16*8,"edi"));
	&vmovdqa	(&QWP(32*4-128,"edx"),$D4);
	&vpermq		($D0,$D0,0b01000000);
	&vpermq		($D1,$D1,0b01000000);
	&vpermq		($D2,$D2,0b01000000);
	&vpermq		($D3,$D3,0b01000000);
	&vpshufd	($D0,$D0,0b11001000);
	&vpshufd	($D1,$D1,0b11001000);
	&vpshufd	($D2,$D2,0b11001000);
	&vpshufd	($D3,$D3,0b11001000);
	&vmovdqa	(&QWP(32*5-128,"edx"),$D0);
	&vmovd		(&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
	&vmovdqa	(&QWP(32*6-128,"edx"),$D1);
	&vmovd		(&X($D1),&DWP(-16*3+4*1,"edi"));
	&vmovdqa	(&QWP(32*7-128,"edx"),$D2);
	&vmovd		(&X($D2),&DWP(-16*3+4*2,"edi"));
	&vmovdqa	(&QWP(32*8-128,"edx"),$D3);
	&vmovd		(&X($D3),&DWP(-16*3+4*3,"edi"));
	&vmovd		(&X($D4),&DWP(-16*3+4*4,"edi"));
	&vmovdqa	($MASK,&QWP(64,"ebx"));
	&neg		("eax");			# padbit

	# Length dispatch: a length that is a multiple of 64 goes straight
	# to "even"; otherwise the odd 16, 32 or 48 bytes are processed
	# first through "tail" with ebx (padbit row) and edx (table view)
	# offset accordingly.
	&test		("ecx",63);
	&jz		(&label("even"));

	&mov		("edx","ecx");
	&and		("ecx",-64);			# whole 64-byte chunks
	&and		("edx",63);			# odd remainder

	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));
	&cmp		("edx",32);
	&jb		(&label("one"));

	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
	&je		(&label("two"));		# still the flags of cmp above

	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&lea		("esi",&DWP(16*3,"esi"));
	&lea		("ebx",&DWP(8,"ebx"));		# three padbits
	&lea		("edx",&DWP(32*5+128+8,"esp"));	# --:r^1:r^2:r^3 (*)
	&jmp		(&label("tail"));

&set_label("two");
	&lea		("esi",&DWP(16*2,"esi"));
	&lea		("ebx",&DWP(16,"ebx"));		# two padbits
	&lea		("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
	&jmp		(&label("tail"));

&set_label("one");
	&lea		("esi",&DWP(16*1,"esi"));
	&vpxor		($T1,$T1,$T1);
	&lea		("ebx",&DWP(32,"ebx","eax",8));	# one or no padbits
	&lea		("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
	&jmp		(&label("tail"));

# (*)	spots marked with '--' are data from next table entry, but they
#	are multiplied by 0 and therefore rendered insignificant

&set_label("even",32);
	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
	&lea		("esi",&DWP(16*4,"esi"));
	&sub		("ecx",64);
	&jz		(&label("tail"));

&set_label("loop");
	################################################################
	# ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
	# ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
	# ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
	# ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
	#   \________/ \_______/
	################################################################

# vsplat_input: emit code that spills hash limbs $D0..$D2 to
# 32*0..32*2(esp) and splits the input held in $T0/$T1 into five 26-bit
# limbs.  On exit the input limbs are in $T0 (0), $T1 (1), $D2 (2),
# $D0 (3) and $D1 (4, with the padbit row at 0(ebx) OR-ed in).
sub vsplat_input {
	&vmovdqa	(&QWP(32*2,"esp"),$D2);
	&vpsrldq	($D2,$T0,6);			# splat input
	&vmovdqa	(&QWP(32*0,"esp"),$D0);
	&vpsrldq	($D0,$T1,6);
	&vmovdqa	(&QWP(32*1,"esp"),$D1);
	&vpunpckhqdq	($D1,$T0,$T1);			# 4
	&vpunpcklqdq	($T0,$T0,$T1);			# 0:1
	&vpunpcklqdq	($D2,$D2,$D0);			# 2:3

	&vpsrlq		($D0,$D2,30);
	&vpsrlq		($D2,$D2,4);
	&vpsrlq		($T1,$T0,26);
	&vpsrlq		($D1,$D1,40);			# 4
	&vpand		($D2,$D2,$MASK);		# 2
	&vpand		($T0,$T0,$MASK);		# 0
	&vpand		($T1,$T1,$MASK);		# 1
	&vpand		($D0,$D0,$MASK);		# 3 (*)
	&vpor		($D1,$D1,&QWP(0,"ebx"));	# padbit, yes, always

	# (*)	note that output is counterintuitive: input limbs 3 and 4
	#	are returned in $D0 and $D1, while $D3-4 are preserved;
}
	&vsplat_input	();

# vpmuladd($addr): emit code that adds the hash to the freshly split
# input and multiplies the sum by the table of key powers.
#   $addr - code reference mapping a table index 0..8 (r0..r4, s1..s4)
#           to the memory operand for that entry
# On entry the input limbs are laid out as vsplat_input leaves them and
# the hash limbs are in 32*0..32*2(esp), $D3 and $D4.  On exit $D0..$D4
# hold the unreduced products d0..d4 and $MASK has been reloaded.
sub vpmuladd {
my $addr = shift;

	&vpaddq		($D2,$D2,&QWP(32*2,"esp"));	# add hash value
	&vpaddq		($T0,$T0,&QWP(32*0,"esp"));
	&vpaddq		($T1,$T1,&QWP(32*1,"esp"));
	&vpaddq		($D0,$D0,$D3);
	&vpaddq		($D1,$D1,$D4);

	################################################################
	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
	# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
	# d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
	# d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3

	# h1, h3 and h4 are spilled to the stack while h2 is multiplied,
	# because their registers are reused for the d accumulators
	&vpmuludq	($D3,$D2,&$addr(1));		# d3 = h2*r1
	&vmovdqa	(&QWP(32*1,"esp"),$T1);		# spill h1
	&vpmuludq	($D4,$D2,&$addr(2));		# d4 = h2*r2
	&vmovdqa	(&QWP(32*3,"esp"),$D0);		# spill h3
	&vpmuludq	($D0,$D2,&$addr(7));		# d0 = h2*s3
	&vmovdqa	(&QWP(32*4,"esp"),$D1);		# spill h4
	&vpmuludq	($D1,$D2,&$addr(8));		# d1 = h2*s4
	&vpmuludq	($D2,$D2,&$addr(0));		# d2 = h2*r0

	&vpmuludq	($T2,$T0,&$addr(3));		# h0*r3
	&vpaddq		($D3,$D3,$T2);			# d3 += h0*r3
	&vpmuludq	($T1,$T0,&$addr(4));		# h0*r4
	&vpaddq		($D4,$D4,$T1);			# d4 += h0*r4
	&vpmuludq	($T2,$T0,&$addr(0));		# h0*r0
	&vpaddq		($D0,$D0,$T2);			# d0 += h0*r0
	&vmovdqa	($T2,&QWP(32*1,"esp"));		# h1
	&vpmuludq	($T1,$T0,&$addr(1));		# h0*r1
	&vpaddq		($D1,$D1,$T1);			# d1 += h0*r1
	&vpmuludq	($T0,$T0,&$addr(2));		# h0*r2
	&vpaddq		($D2,$D2,$T0);			# d2 += h0*r2

	&vpmuludq	($T1,$T2,&$addr(2));		# h1*r2
	&vpaddq		($D3,$D3,$T1);			# d3 += h1*r2
	&vpmuludq	($T0,$T2,&$addr(3));		# h1*r3
	&vpaddq		($D4,$D4,$T0);			# d4 += h1*r3
	&vpmuludq	($T1,$T2,&$addr(8));		# h1*s4
	&vpaddq		($D0,$D0,$T1);			# d0 += h1*s4
	&vmovdqa	($T1,&QWP(32*3,"esp"));		# h3
	&vpmuludq	($T0,$T2,&$addr(0));		# h1*r0
	&vpaddq		($D1,$D1,$T0);			# d1 += h1*r0
	&vpmuludq	($T2,$T2,&$addr(1));		# h1*r1
	&vpaddq		($D2,$D2,$T2);			# d2 += h1*r1

	&vpmuludq	($T0,$T1,&$addr(0));		# h3*r0
	&vpaddq		($D3,$D3,$T0);			# d3 += h3*r0
	&vpmuludq	($T2,$T1,&$addr(1));		# h3*r1
	&vpaddq		($D4,$D4,$T2);			# d4 += h3*r1
	&vpmuludq	($T0,$T1,&$addr(6));		# h3*s2
	&vpaddq		($D0,$D0,$T0);			# d0 += h3*s2
	&vmovdqa	($T0,&QWP(32*4,"esp"));		# h4
	&vpmuludq	($T2,$T1,&$addr(7));		# h3*s3
	&vpaddq		($D1,$D1,$T2);			# d1 += h3*s3
	&vpmuludq	($T1,$T1,&$addr(8));		# h3*s4
	&vpaddq		($D2,$D2,$T1);			# d2 += h3*s4

	&vpmuludq	($T2,$T0,&$addr(8));		# h4*s4
	&vpaddq		($D3,$D3,$T2);			# d3 += h4*s4
	&vpmuludq	($T1,$T0,&$addr(5));		# h4*s1
	&vpaddq		($D0,$D0,$T1);			# d0 += h4*s1
	&vpmuludq	($T2,$T0,&$addr(0));		# h4*r0
	&vpaddq		($D4,$D4,$T2);			# d4 += h4*r0
	&vmovdqa	($MASK,&QWP(64,"ebx"));		# $MASK aliases $T2, reload it
	&vpmuludq	($T1,$T0,&$addr(6));		# h4*s2
	&vpaddq		($D1,$D1,$T1);			# d1 += h4*s2
	&vpmuludq	($T0,$T0,&$addr(7));		# h4*s3
	&vpaddq		($D2,$D2,$T0);			# d2 += h4*s3
}
	&vpmuladd	(sub {	my $i=shift; &QWP(32*$i-128,"edx");	});

# vlazy_reduction: emit code that propagates carries between the five
# accumulators $D0..$D4 so that each is brought back towards 26 bits.
# The carry out of limb 4 is folded into limb 0 multiplied by 5 (added
# once as is and once shifted left by 2).  Expects $MASK to be loaded;
# clobbers $T0 and $T1.
sub vlazy_reduction {
	################################################################
	# lazy reduction

	&vpsrlq		($T0,$D3,26);
	&vpand		($D3,$D3,$MASK);
	&vpsrlq		($T1,$D0,26);
	&vpand		($D0,$D0,$MASK);
	&vpaddq		($D4,$D4,$T0);			# h3 -> h4
	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
	&vpsrlq		($T0,$D4,26);
	&vpand		($D4,$D4,$MASK);
	&vpsrlq		($T1,$D1,26);
	&vpand		($D1,$D1,$MASK);
	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	&vpaddq		($D0,$D0,$T0);
	&vpsllq		($T0,$T0,2);
	&vpsrlq		($T1,$D2,26);
	&vpand		($D2,$D2,$MASK);
	&vpaddq		($D0,$D0,$T0);			# h4 -> h0
	&vpaddq		($D3,$D3,$T1);			# h2 -> h3
	&vpsrlq		($T1,$D3,26);
	&vpsrlq		($T0,$D0,26);
	&vpand		($D0,$D0,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpaddq		($D1,$D1,$T0);			# h0 -> h1
	&vpaddq		($D4,$D4,$T1);			# h3 -> h4
}
	&vlazy_reduction();

	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
# _poly1305_blocks_avx2, continued: finish loading the next 64 bytes of
# input at the bottom of the main loop, then the tail processing, the
# horizontal addition and the final store of the hash.
	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
	&lea		("esi",&DWP(16*4,"esi"));
	&sub		("ecx",64);
	&jnz		(&label("loop"));

&set_label("tail");
	&vsplat_input	();
	&and		("ebx",-64);			# restore pointer

	# same multiplication as in the loop, but reading every table
	# entry 4 bytes further on
	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});

	################################################################
	# horizontal addition

	&vpsrldq	($T0,$D4,8);
	&vpsrldq	($T1,$D3,8);
	&vpaddq		($D4,$D4,$T0);
	&vpsrldq	($T0,$D0,8);
	&vpaddq		($D3,$D3,$T1);
	&vpsrldq	($T1,$D1,8);
	&vpaddq		($D0,$D0,$T0);
	&vpsrldq	($T0,$D2,8);
	&vpaddq		($D1,$D1,$T1);
	&vpermq		($T1,$D4,2);			# keep folding
	&vpaddq		($D2,$D2,$T0);
	&vpermq		($T0,$D3,2);
	&vpaddq		($D4,$D4,$T1);
	&vpermq		($T1,$D0,2);
	&vpaddq		($D3,$D3,$T0);
	&vpermq		($T0,$D1,2);
	&vpaddq		($D0,$D0,$T1);
	&vpermq		($T1,$D2,2);
	&vpaddq		($D1,$D1,$T0);
	&vpaddq		($D2,$D2,$T1);

	&vlazy_reduction();

	# ecx is non-zero here only when the odd 16/32/48 bytes were
	# processed first and whole 64-byte chunks are still pending
	&cmp		("ecx",0);
	&je		(&label("done"));

	################################################################
	# clear all but single word

	&vpshufd	(&X($D0),&X($D0),0b11111100);
	&lea		("edx",&DWP(32*5+128,"esp"));	# restore pointer
	&vpshufd	(&X($D1),&X($D1),0b11111100);
	&vpshufd	(&X($D2),&X($D2),0b11111100);
	&vpshufd	(&X($D3),&X($D3),0b11111100);
	&vpshufd	(&X($D4),&X($D4),0b11111100);
	&jmp		(&label("even"));

&set_label("done",16);
	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
	&vzeroupper	();
	&mov		("esp","ebp");			# drop the stack frame
&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}	# end of the "if ($avx>1)" section

# Constant table addressed through ebx by the code above; the label is
# aligned to 64 bytes, which is what lets "and ebx,-64" restore the pointer.
# NOTE(review): the offsets below assume data_word emits 32-bit words --
# confirm against x86asm.pl.
#	+0	1<<24 in every 64-bit lane (padbit row, OR-ed into limb 4)
#	+32	zeros (reached when ebx is offset to drop padbits)
#	+64	0x03ffffff in every 64-bit lane (26-bit limb mask, $MASK)
#	+96	key clamp mask (the "magic mask" of the disabled vpand)
&set_label("const_sse2",64);
	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
	&data_word(0,0,		0,0,		0,0,		0,0);
	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
}	# presumably closes the "if ($sse2)" block opened near the top of the file
&asciz	("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align	(4);

&asm_finish();

# Buffered write errors only surface at close, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";