#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Command line: <flavour> [<output>]
#
# A flavour other than "void" pipes everything this script prints through
# arm-xlate.pl; "void" (or no flavour at all) emits the text untranslated.
# The generated assembly is accumulated in $code.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Directory part of the path this script was started with; empty when
    # $0 carries no directory component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script first, then in the
    # perlasm directory two levels up.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Report a translator that cannot be started instead of carrying on
    # with whatever STDOUT happens to be.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # Three-argument form: the file name is never parsed for a mode, and a
    # failure to create the file is fatal rather than silent.
    open STDOUT,">",$output
	or die "can't open $output: $!";
}
# With no output file named, the inherited STDOUT is used as is.

# Integer register allocation.  The first four are the incoming arguments
# x0..x3; poly1305_emit receives its mac and nonce pointers in the registers
# that poly1305_blocks uses for $inp and $len.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

# x4..x14: hash limbs h0..h2, key halves r0/r1, s1 = r1 + (r1 >> 2),
# temporaries t0/t1 and product limbs d0..d2.
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# Context layout as used by the code below:
#   [ctx, #0..#23]  hash value
#   [ctx, #24]      is_base2_26 flag (non-zero: hash is stored in base 2^26)
#   [ctx, #32..#47] clamped key r0,r1
#   [ctx, #48]      first 32-bit word of the key power table; -1 marks it
#                   as not yet computed

# poly1305_init(ctx = x0, key = x1, func table = x2)
#
# Zeroes the hash value and the is_base2_26 flag.  With a NULL key it returns
# 0 right away; otherwise it clamps the key, stores it at ctx+32, writes the
# "impossible" marker at ctx+48 and returns 1.  Outside the kernel build it
# also stores the addresses of the blocks and emit routines through x2,
# choosing the NEON blocks routine when OPENSSL_armcap_P has ARMV7_NEON set.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

___

# poly1305_blocks(ctx = x0, inp = x1, len = x2, padbit = x3)
#
# Scalar block routine.  len is rounded down to a multiple of 16 and the
# routine returns immediately if nothing is left.  The stored hash is
# converted from base 2^26 to base 2^64 when the is_base2_26 flag is set,
# then every 16-byte block is added (with padbit going into the top limb)
# and the sum is multiplied by the key with a partial reduction.  On exit
# the hash is stored in base 2^64 and the flag is cleared.
$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

___

# poly1305_emit(ctx = x0, mac = x1, nonce = x2)
#
# Loads the hash (converting from base 2^26 if the flag says so), performs
# the final comparison against the modulus by adding 5 and testing the bits
# above the low two of the top limb, adds the 128-bit nonce and writes the
# 16-byte result to mac.
$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___

# Vector register allocation for the NEON code that follows.
#   v0..v8   key power table entries (r0, r1, 5*r1, ... 5*r4), .4s view
#   v9..v13  inp[0:1] split into base 2^26 limbs, .2s view
#   v14..v18 inp[2:3] split into base 2^26 limbs, .2s view
#   v19..v23 64-bit product accumulators, .2d view
#   v24..v28 hash limbs in base 2^26, .2s view
#   v29..v31 two temporaries and the limb mask (arrangement given at use)
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

# x16 is the second input pointer and x17 points at the zero block used
# past the end of input; x17 is also borrowed to hold is_base2_26 on entry.
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;	# borrow

2728c2ecf20Sopenharmony_ci$code.=<<___; 2738c2ecf20Sopenharmony_ci.type poly1305_mult,%function 2748c2ecf20Sopenharmony_ci.align 5 2758c2ecf20Sopenharmony_cipoly1305_mult: 2768c2ecf20Sopenharmony_ci mul $d0,$h0,$r0 // h0*r0 2778c2ecf20Sopenharmony_ci umulh $d1,$h0,$r0 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci mul $t0,$h1,$s1 // h1*5*r1 2808c2ecf20Sopenharmony_ci umulh $t1,$h1,$s1 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_ci adds $d0,$d0,$t0 2838c2ecf20Sopenharmony_ci mul $t0,$h0,$r1 // h0*r1 2848c2ecf20Sopenharmony_ci adc $d1,$d1,$t1 2858c2ecf20Sopenharmony_ci umulh $d2,$h0,$r1 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci adds $d1,$d1,$t0 2888c2ecf20Sopenharmony_ci mul $t0,$h1,$r0 // h1*r0 2898c2ecf20Sopenharmony_ci adc $d2,$d2,xzr 2908c2ecf20Sopenharmony_ci umulh $t1,$h1,$r0 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_ci adds $d1,$d1,$t0 2938c2ecf20Sopenharmony_ci mul $t0,$h2,$s1 // h2*5*r1 2948c2ecf20Sopenharmony_ci adc $d2,$d2,$t1 2958c2ecf20Sopenharmony_ci mul $t1,$h2,$r0 // h2*r0 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_ci adds $d1,$d1,$t0 2988c2ecf20Sopenharmony_ci adc $d2,$d2,$t1 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci and $t0,$d2,#-4 // final reduction 3018c2ecf20Sopenharmony_ci and $h2,$d2,#3 3028c2ecf20Sopenharmony_ci add $t0,$t0,$d2,lsr#2 3038c2ecf20Sopenharmony_ci adds $h0,$d0,$t0 3048c2ecf20Sopenharmony_ci adcs $h1,$d1,xzr 3058c2ecf20Sopenharmony_ci adc $h2,$h2,xzr 3068c2ecf20Sopenharmony_ci 3078c2ecf20Sopenharmony_ci ret 3088c2ecf20Sopenharmony_ci.size poly1305_mult,.-poly1305_mult 3098c2ecf20Sopenharmony_ci 3108c2ecf20Sopenharmony_ci.type poly1305_splat,%function 3118c2ecf20Sopenharmony_ci.align 4 3128c2ecf20Sopenharmony_cipoly1305_splat: 3138c2ecf20Sopenharmony_ci and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 3148c2ecf20Sopenharmony_ci ubfx x13,$h0,#26,#26 3158c2ecf20Sopenharmony_ci extr x14,$h1,$h0,#52 3168c2ecf20Sopenharmony_ci and x14,x14,#0x03ffffff 3178c2ecf20Sopenharmony_ci ubfx 
x15,$h1,#14,#26 3188c2ecf20Sopenharmony_ci extr x16,$h2,$h1,#40 3198c2ecf20Sopenharmony_ci 3208c2ecf20Sopenharmony_ci str w12,[$ctx,#16*0] // r0 3218c2ecf20Sopenharmony_ci add w12,w13,w13,lsl#2 // r1*5 3228c2ecf20Sopenharmony_ci str w13,[$ctx,#16*1] // r1 3238c2ecf20Sopenharmony_ci add w13,w14,w14,lsl#2 // r2*5 3248c2ecf20Sopenharmony_ci str w12,[$ctx,#16*2] // s1 3258c2ecf20Sopenharmony_ci str w14,[$ctx,#16*3] // r2 3268c2ecf20Sopenharmony_ci add w14,w15,w15,lsl#2 // r3*5 3278c2ecf20Sopenharmony_ci str w13,[$ctx,#16*4] // s2 3288c2ecf20Sopenharmony_ci str w15,[$ctx,#16*5] // r3 3298c2ecf20Sopenharmony_ci add w15,w16,w16,lsl#2 // r4*5 3308c2ecf20Sopenharmony_ci str w14,[$ctx,#16*6] // s3 3318c2ecf20Sopenharmony_ci str w16,[$ctx,#16*7] // r4 3328c2ecf20Sopenharmony_ci str w15,[$ctx,#16*8] // s4 3338c2ecf20Sopenharmony_ci 3348c2ecf20Sopenharmony_ci ret 3358c2ecf20Sopenharmony_ci.size poly1305_splat,.-poly1305_splat 3368c2ecf20Sopenharmony_ci 3378c2ecf20Sopenharmony_ci#ifdef __KERNEL__ 3388c2ecf20Sopenharmony_ci.globl poly1305_blocks_neon 3398c2ecf20Sopenharmony_ci#endif 3408c2ecf20Sopenharmony_ci.type poly1305_blocks_neon,%function 3418c2ecf20Sopenharmony_ci.align 5 3428c2ecf20Sopenharmony_cipoly1305_blocks_neon: 3438c2ecf20Sopenharmony_ci.Lpoly1305_blocks_neon: 3448c2ecf20Sopenharmony_ci ldr $is_base2_26,[$ctx,#24] 3458c2ecf20Sopenharmony_ci cmp $len,#128 3468c2ecf20Sopenharmony_ci b.lo .Lpoly1305_blocks 3478c2ecf20Sopenharmony_ci 3488c2ecf20Sopenharmony_ci .inst 0xd503233f // paciasp 3498c2ecf20Sopenharmony_ci stp x29,x30,[sp,#-80]! 
3508c2ecf20Sopenharmony_ci add x29,sp,#0 3518c2ecf20Sopenharmony_ci 3528c2ecf20Sopenharmony_ci stp d8,d9,[sp,#16] // meet ABI requirements 3538c2ecf20Sopenharmony_ci stp d10,d11,[sp,#32] 3548c2ecf20Sopenharmony_ci stp d12,d13,[sp,#48] 3558c2ecf20Sopenharmony_ci stp d14,d15,[sp,#64] 3568c2ecf20Sopenharmony_ci 3578c2ecf20Sopenharmony_ci cbz $is_base2_26,.Lbase2_64_neon 3588c2ecf20Sopenharmony_ci 3598c2ecf20Sopenharmony_ci ldp w10,w11,[$ctx] // load hash value base 2^26 3608c2ecf20Sopenharmony_ci ldp w12,w13,[$ctx,#8] 3618c2ecf20Sopenharmony_ci ldr w14,[$ctx,#16] 3628c2ecf20Sopenharmony_ci 3638c2ecf20Sopenharmony_ci tst $len,#31 3648c2ecf20Sopenharmony_ci b.eq .Leven_neon 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci ldp $r0,$r1,[$ctx,#32] // load key value 3678c2ecf20Sopenharmony_ci 3688c2ecf20Sopenharmony_ci add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 3698c2ecf20Sopenharmony_ci lsr $h1,x12,#12 3708c2ecf20Sopenharmony_ci adds $h0,$h0,x12,lsl#52 3718c2ecf20Sopenharmony_ci add $h1,$h1,x13,lsl#14 3728c2ecf20Sopenharmony_ci adc $h1,$h1,xzr 3738c2ecf20Sopenharmony_ci lsr $h2,x14,#24 3748c2ecf20Sopenharmony_ci adds $h1,$h1,x14,lsl#40 3758c2ecf20Sopenharmony_ci adc $d2,$h2,xzr // can be partially reduced... 
3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci ldp $d0,$d1,[$inp],#16 // load input 3788c2ecf20Sopenharmony_ci sub $len,$len,#16 3798c2ecf20Sopenharmony_ci add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 3828c2ecf20Sopenharmony_ci rev $d0,$d0 3838c2ecf20Sopenharmony_ci rev $d1,$d1 3848c2ecf20Sopenharmony_ci#endif 3858c2ecf20Sopenharmony_ci adds $h0,$h0,$d0 // accumulate input 3868c2ecf20Sopenharmony_ci adcs $h1,$h1,$d1 3878c2ecf20Sopenharmony_ci adc $h2,$h2,$padbit 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci bl poly1305_mult 3908c2ecf20Sopenharmony_ci 3918c2ecf20Sopenharmony_ci and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 3928c2ecf20Sopenharmony_ci ubfx x11,$h0,#26,#26 3938c2ecf20Sopenharmony_ci extr x12,$h1,$h0,#52 3948c2ecf20Sopenharmony_ci and x12,x12,#0x03ffffff 3958c2ecf20Sopenharmony_ci ubfx x13,$h1,#14,#26 3968c2ecf20Sopenharmony_ci extr x14,$h2,$h1,#40 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci b .Leven_neon 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_ci.align 4 4018c2ecf20Sopenharmony_ci.Lbase2_64_neon: 4028c2ecf20Sopenharmony_ci ldp $r0,$r1,[$ctx,#32] // load key value 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci ldp $h0,$h1,[$ctx] // load hash value base 2^64 4058c2ecf20Sopenharmony_ci ldr $h2,[$ctx,#16] 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci tst $len,#31 4088c2ecf20Sopenharmony_ci b.eq .Linit_neon 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci ldp $d0,$d1,[$inp],#16 // load input 4118c2ecf20Sopenharmony_ci sub $len,$len,#16 4128c2ecf20Sopenharmony_ci add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 4138c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 4148c2ecf20Sopenharmony_ci rev $d0,$d0 4158c2ecf20Sopenharmony_ci rev $d1,$d1 4168c2ecf20Sopenharmony_ci#endif 4178c2ecf20Sopenharmony_ci adds $h0,$h0,$d0 // accumulate input 4188c2ecf20Sopenharmony_ci adcs $h1,$h1,$d1 4198c2ecf20Sopenharmony_ci adc $h2,$h2,$padbit 
4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_ci bl poly1305_mult 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_ci.Linit_neon: 4248c2ecf20Sopenharmony_ci ldr w17,[$ctx,#48] // first table element 4258c2ecf20Sopenharmony_ci and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 4268c2ecf20Sopenharmony_ci ubfx x11,$h0,#26,#26 4278c2ecf20Sopenharmony_ci extr x12,$h1,$h0,#52 4288c2ecf20Sopenharmony_ci and x12,x12,#0x03ffffff 4298c2ecf20Sopenharmony_ci ubfx x13,$h1,#14,#26 4308c2ecf20Sopenharmony_ci extr x14,$h2,$h1,#40 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci cmp w17,#-1 // is value impossible? 4338c2ecf20Sopenharmony_ci b.ne .Leven_neon 4348c2ecf20Sopenharmony_ci 4358c2ecf20Sopenharmony_ci fmov ${H0},x10 4368c2ecf20Sopenharmony_ci fmov ${H1},x11 4378c2ecf20Sopenharmony_ci fmov ${H2},x12 4388c2ecf20Sopenharmony_ci fmov ${H3},x13 4398c2ecf20Sopenharmony_ci fmov ${H4},x14 4408c2ecf20Sopenharmony_ci 4418c2ecf20Sopenharmony_ci ////////////////////////////////// initialize r^n table 4428c2ecf20Sopenharmony_ci mov $h0,$r0 // r^1 4438c2ecf20Sopenharmony_ci add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 4448c2ecf20Sopenharmony_ci mov $h1,$r1 4458c2ecf20Sopenharmony_ci mov $h2,xzr 4468c2ecf20Sopenharmony_ci add $ctx,$ctx,#48+12 4478c2ecf20Sopenharmony_ci bl poly1305_splat 4488c2ecf20Sopenharmony_ci 4498c2ecf20Sopenharmony_ci bl poly1305_mult // r^2 4508c2ecf20Sopenharmony_ci sub $ctx,$ctx,#4 4518c2ecf20Sopenharmony_ci bl poly1305_splat 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_ci bl poly1305_mult // r^3 4548c2ecf20Sopenharmony_ci sub $ctx,$ctx,#4 4558c2ecf20Sopenharmony_ci bl poly1305_splat 4568c2ecf20Sopenharmony_ci 4578c2ecf20Sopenharmony_ci bl poly1305_mult // r^4 4588c2ecf20Sopenharmony_ci sub $ctx,$ctx,#4 4598c2ecf20Sopenharmony_ci bl poly1305_splat 4608c2ecf20Sopenharmony_ci sub $ctx,$ctx,#48 // restore original $ctx 4618c2ecf20Sopenharmony_ci b .Ldo_neon 4628c2ecf20Sopenharmony_ci 4638c2ecf20Sopenharmony_ci.align 4 
4648c2ecf20Sopenharmony_ci.Leven_neon: 4658c2ecf20Sopenharmony_ci fmov ${H0},x10 4668c2ecf20Sopenharmony_ci fmov ${H1},x11 4678c2ecf20Sopenharmony_ci fmov ${H2},x12 4688c2ecf20Sopenharmony_ci fmov ${H3},x13 4698c2ecf20Sopenharmony_ci fmov ${H4},x14 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci.Ldo_neon: 4728c2ecf20Sopenharmony_ci ldp x8,x12,[$inp,#32] // inp[2:3] 4738c2ecf20Sopenharmony_ci subs $len,$len,#64 4748c2ecf20Sopenharmony_ci ldp x9,x13,[$inp,#48] 4758c2ecf20Sopenharmony_ci add $in2,$inp,#96 4768c2ecf20Sopenharmony_ci adr $zeros,.Lzeros 4778c2ecf20Sopenharmony_ci 4788c2ecf20Sopenharmony_ci lsl $padbit,$padbit,#24 4798c2ecf20Sopenharmony_ci add x15,$ctx,#48 4808c2ecf20Sopenharmony_ci 4818c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 4828c2ecf20Sopenharmony_ci rev x8,x8 4838c2ecf20Sopenharmony_ci rev x12,x12 4848c2ecf20Sopenharmony_ci rev x9,x9 4858c2ecf20Sopenharmony_ci rev x13,x13 4868c2ecf20Sopenharmony_ci#endif 4878c2ecf20Sopenharmony_ci and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 4888c2ecf20Sopenharmony_ci and x5,x9,#0x03ffffff 4898c2ecf20Sopenharmony_ci ubfx x6,x8,#26,#26 4908c2ecf20Sopenharmony_ci ubfx x7,x9,#26,#26 4918c2ecf20Sopenharmony_ci add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 4928c2ecf20Sopenharmony_ci extr x8,x12,x8,#52 4938c2ecf20Sopenharmony_ci extr x9,x13,x9,#52 4948c2ecf20Sopenharmony_ci add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 4958c2ecf20Sopenharmony_ci fmov $IN23_0,x4 4968c2ecf20Sopenharmony_ci and x8,x8,#0x03ffffff 4978c2ecf20Sopenharmony_ci and x9,x9,#0x03ffffff 4988c2ecf20Sopenharmony_ci ubfx x10,x12,#14,#26 4998c2ecf20Sopenharmony_ci ubfx x11,x13,#14,#26 5008c2ecf20Sopenharmony_ci add x12,$padbit,x12,lsr#40 5018c2ecf20Sopenharmony_ci add x13,$padbit,x13,lsr#40 5028c2ecf20Sopenharmony_ci add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 5038c2ecf20Sopenharmony_ci fmov $IN23_1,x6 5048c2ecf20Sopenharmony_ci add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 5058c2ecf20Sopenharmony_ci add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 
5068c2ecf20Sopenharmony_ci fmov $IN23_2,x8 5078c2ecf20Sopenharmony_ci fmov $IN23_3,x10 5088c2ecf20Sopenharmony_ci fmov $IN23_4,x12 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci ldp x8,x12,[$inp],#16 // inp[0:1] 5118c2ecf20Sopenharmony_ci ldp x9,x13,[$inp],#48 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci ld1 {$R0,$R1,$S1,$R2},[x15],#64 5148c2ecf20Sopenharmony_ci ld1 {$S2,$R3,$S3,$R4},[x15],#64 5158c2ecf20Sopenharmony_ci ld1 {$S4},[x15] 5168c2ecf20Sopenharmony_ci 5178c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 5188c2ecf20Sopenharmony_ci rev x8,x8 5198c2ecf20Sopenharmony_ci rev x12,x12 5208c2ecf20Sopenharmony_ci rev x9,x9 5218c2ecf20Sopenharmony_ci rev x13,x13 5228c2ecf20Sopenharmony_ci#endif 5238c2ecf20Sopenharmony_ci and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 5248c2ecf20Sopenharmony_ci and x5,x9,#0x03ffffff 5258c2ecf20Sopenharmony_ci ubfx x6,x8,#26,#26 5268c2ecf20Sopenharmony_ci ubfx x7,x9,#26,#26 5278c2ecf20Sopenharmony_ci add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 5288c2ecf20Sopenharmony_ci extr x8,x12,x8,#52 5298c2ecf20Sopenharmony_ci extr x9,x13,x9,#52 5308c2ecf20Sopenharmony_ci add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 5318c2ecf20Sopenharmony_ci fmov $IN01_0,x4 5328c2ecf20Sopenharmony_ci and x8,x8,#0x03ffffff 5338c2ecf20Sopenharmony_ci and x9,x9,#0x03ffffff 5348c2ecf20Sopenharmony_ci ubfx x10,x12,#14,#26 5358c2ecf20Sopenharmony_ci ubfx x11,x13,#14,#26 5368c2ecf20Sopenharmony_ci add x12,$padbit,x12,lsr#40 5378c2ecf20Sopenharmony_ci add x13,$padbit,x13,lsr#40 5388c2ecf20Sopenharmony_ci add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 5398c2ecf20Sopenharmony_ci fmov $IN01_1,x6 5408c2ecf20Sopenharmony_ci add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 5418c2ecf20Sopenharmony_ci add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 5428c2ecf20Sopenharmony_ci movi $MASK.2d,#-1 5438c2ecf20Sopenharmony_ci fmov $IN01_2,x8 5448c2ecf20Sopenharmony_ci fmov $IN01_3,x10 5458c2ecf20Sopenharmony_ci fmov $IN01_4,x12 5468c2ecf20Sopenharmony_ci ushr $MASK.2d,$MASK.2d,#38 
5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_ci b.ls .Lskip_loop 5498c2ecf20Sopenharmony_ci 5508c2ecf20Sopenharmony_ci.align 4 5518c2ecf20Sopenharmony_ci.Loop_neon: 5528c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 5538c2ecf20Sopenharmony_ci // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 5548c2ecf20Sopenharmony_ci // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 5558c2ecf20Sopenharmony_ci // \___________________/ 5568c2ecf20Sopenharmony_ci // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 5578c2ecf20Sopenharmony_ci // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 5588c2ecf20Sopenharmony_ci // \___________________/ \____________________/ 5598c2ecf20Sopenharmony_ci // 5608c2ecf20Sopenharmony_ci // Note that we start with inp[2:3]*r^2. This is because it 5618c2ecf20Sopenharmony_ci // doesn't depend on reduction in previous iteration. 5628c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 5638c2ecf20Sopenharmony_ci // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 5648c2ecf20Sopenharmony_ci // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 5658c2ecf20Sopenharmony_ci // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 5668c2ecf20Sopenharmony_ci // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 5678c2ecf20Sopenharmony_ci // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 5688c2ecf20Sopenharmony_ci 5698c2ecf20Sopenharmony_ci subs $len,$len,#64 5708c2ecf20Sopenharmony_ci umull $ACC4,$IN23_0,${R4}[2] 5718c2ecf20Sopenharmony_ci csel $in2,$zeros,$in2,lo 5728c2ecf20Sopenharmony_ci umull $ACC3,$IN23_0,${R3}[2] 5738c2ecf20Sopenharmony_ci umull $ACC2,$IN23_0,${R2}[2] 5748c2ecf20Sopenharmony_ci ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) 5758c2ecf20Sopenharmony_ci umull $ACC1,$IN23_0,${R1}[2] 5768c2ecf20Sopenharmony_ci ldp x9,x13,[$in2],#48 5778c2ecf20Sopenharmony_ci umull $ACC0,$IN23_0,${R0}[2] 5788c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 5798c2ecf20Sopenharmony_ci rev 
x8,x8 5808c2ecf20Sopenharmony_ci rev x12,x12 5818c2ecf20Sopenharmony_ci rev x9,x9 5828c2ecf20Sopenharmony_ci rev x13,x13 5838c2ecf20Sopenharmony_ci#endif 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci umlal $ACC4,$IN23_1,${R3}[2] 5868c2ecf20Sopenharmony_ci and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 5878c2ecf20Sopenharmony_ci umlal $ACC3,$IN23_1,${R2}[2] 5888c2ecf20Sopenharmony_ci and x5,x9,#0x03ffffff 5898c2ecf20Sopenharmony_ci umlal $ACC2,$IN23_1,${R1}[2] 5908c2ecf20Sopenharmony_ci ubfx x6,x8,#26,#26 5918c2ecf20Sopenharmony_ci umlal $ACC1,$IN23_1,${R0}[2] 5928c2ecf20Sopenharmony_ci ubfx x7,x9,#26,#26 5938c2ecf20Sopenharmony_ci umlal $ACC0,$IN23_1,${S4}[2] 5948c2ecf20Sopenharmony_ci add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 5958c2ecf20Sopenharmony_ci 5968c2ecf20Sopenharmony_ci umlal $ACC4,$IN23_2,${R2}[2] 5978c2ecf20Sopenharmony_ci extr x8,x12,x8,#52 5988c2ecf20Sopenharmony_ci umlal $ACC3,$IN23_2,${R1}[2] 5998c2ecf20Sopenharmony_ci extr x9,x13,x9,#52 6008c2ecf20Sopenharmony_ci umlal $ACC2,$IN23_2,${R0}[2] 6018c2ecf20Sopenharmony_ci add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 6028c2ecf20Sopenharmony_ci umlal $ACC1,$IN23_2,${S4}[2] 6038c2ecf20Sopenharmony_ci fmov $IN23_0,x4 6048c2ecf20Sopenharmony_ci umlal $ACC0,$IN23_2,${S3}[2] 6058c2ecf20Sopenharmony_ci and x8,x8,#0x03ffffff 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ci umlal $ACC4,$IN23_3,${R1}[2] 6088c2ecf20Sopenharmony_ci and x9,x9,#0x03ffffff 6098c2ecf20Sopenharmony_ci umlal $ACC3,$IN23_3,${R0}[2] 6108c2ecf20Sopenharmony_ci ubfx x10,x12,#14,#26 6118c2ecf20Sopenharmony_ci umlal $ACC2,$IN23_3,${S4}[2] 6128c2ecf20Sopenharmony_ci ubfx x11,x13,#14,#26 6138c2ecf20Sopenharmony_ci umlal $ACC1,$IN23_3,${S3}[2] 6148c2ecf20Sopenharmony_ci add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 6158c2ecf20Sopenharmony_ci umlal $ACC0,$IN23_3,${S2}[2] 6168c2ecf20Sopenharmony_ci fmov $IN23_1,x6 6178c2ecf20Sopenharmony_ci 6188c2ecf20Sopenharmony_ci add $IN01_2,$IN01_2,$H2 6198c2ecf20Sopenharmony_ci add x12,$padbit,x12,lsr#40 
6208c2ecf20Sopenharmony_ci umlal $ACC4,$IN23_4,${R0}[2] 6218c2ecf20Sopenharmony_ci add x13,$padbit,x13,lsr#40 6228c2ecf20Sopenharmony_ci umlal $ACC3,$IN23_4,${S4}[2] 6238c2ecf20Sopenharmony_ci add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 6248c2ecf20Sopenharmony_ci umlal $ACC2,$IN23_4,${S3}[2] 6258c2ecf20Sopenharmony_ci add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 6268c2ecf20Sopenharmony_ci umlal $ACC1,$IN23_4,${S2}[2] 6278c2ecf20Sopenharmony_ci fmov $IN23_2,x8 6288c2ecf20Sopenharmony_ci umlal $ACC0,$IN23_4,${S1}[2] 6298c2ecf20Sopenharmony_ci fmov $IN23_3,x10 6308c2ecf20Sopenharmony_ci 6318c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 6328c2ecf20Sopenharmony_ci // (hash+inp[0:1])*r^4 and accumulate 6338c2ecf20Sopenharmony_ci 6348c2ecf20Sopenharmony_ci add $IN01_0,$IN01_0,$H0 6358c2ecf20Sopenharmony_ci fmov $IN23_4,x12 6368c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_2,${R1}[0] 6378c2ecf20Sopenharmony_ci ldp x8,x12,[$inp],#16 // inp[0:1] 6388c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_2,${S3}[0] 6398c2ecf20Sopenharmony_ci ldp x9,x13,[$inp],#48 6408c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_2,${R2}[0] 6418c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_2,${S4}[0] 6428c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_2,${R0}[0] 6438c2ecf20Sopenharmony_ci#ifdef __AARCH64EB__ 6448c2ecf20Sopenharmony_ci rev x8,x8 6458c2ecf20Sopenharmony_ci rev x12,x12 6468c2ecf20Sopenharmony_ci rev x9,x9 6478c2ecf20Sopenharmony_ci rev x13,x13 6488c2ecf20Sopenharmony_ci#endif 6498c2ecf20Sopenharmony_ci 6508c2ecf20Sopenharmony_ci add $IN01_1,$IN01_1,$H1 6518c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_0,${R3}[0] 6528c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_0,${R4}[0] 6538c2ecf20Sopenharmony_ci and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 6548c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_0,${R2}[0] 6558c2ecf20Sopenharmony_ci and x5,x9,#0x03ffffff 6568c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_0,${R0}[0] 6578c2ecf20Sopenharmony_ci ubfx x6,x8,#26,#26 6588c2ecf20Sopenharmony_ci umlal 
$ACC1,$IN01_0,${R1}[0] 6598c2ecf20Sopenharmony_ci ubfx x7,x9,#26,#26 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci add $IN01_3,$IN01_3,$H3 6628c2ecf20Sopenharmony_ci add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 6638c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_1,${R2}[0] 6648c2ecf20Sopenharmony_ci extr x8,x12,x8,#52 6658c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_1,${R3}[0] 6668c2ecf20Sopenharmony_ci extr x9,x13,x9,#52 6678c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_1,${S4}[0] 6688c2ecf20Sopenharmony_ci add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 6698c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_1,${R1}[0] 6708c2ecf20Sopenharmony_ci fmov $IN01_0,x4 6718c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_1,${R0}[0] 6728c2ecf20Sopenharmony_ci and x8,x8,#0x03ffffff 6738c2ecf20Sopenharmony_ci 6748c2ecf20Sopenharmony_ci add $IN01_4,$IN01_4,$H4 6758c2ecf20Sopenharmony_ci and x9,x9,#0x03ffffff 6768c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_3,${R0}[0] 6778c2ecf20Sopenharmony_ci ubfx x10,x12,#14,#26 6788c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_3,${S2}[0] 6798c2ecf20Sopenharmony_ci ubfx x11,x13,#14,#26 6808c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_3,${R1}[0] 6818c2ecf20Sopenharmony_ci add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 6828c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_3,${S3}[0] 6838c2ecf20Sopenharmony_ci fmov $IN01_1,x6 6848c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_3,${S4}[0] 6858c2ecf20Sopenharmony_ci add x12,$padbit,x12,lsr#40 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_4,${S4}[0] 6888c2ecf20Sopenharmony_ci add x13,$padbit,x13,lsr#40 6898c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_4,${S1}[0] 6908c2ecf20Sopenharmony_ci add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 6918c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_4,${R0}[0] 6928c2ecf20Sopenharmony_ci add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 6938c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_4,${S2}[0] 6948c2ecf20Sopenharmony_ci fmov $IN01_2,x8 6958c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_4,${S3}[0] 6968c2ecf20Sopenharmony_ci fmov 
$IN01_3,x10 6978c2ecf20Sopenharmony_ci fmov $IN01_4,x12 6988c2ecf20Sopenharmony_ci 6998c2ecf20Sopenharmony_ci ///////////////////////////////////////////////////////////////// 7008c2ecf20Sopenharmony_ci // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 7018c2ecf20Sopenharmony_ci // and P. Schwabe 7028c2ecf20Sopenharmony_ci // 7038c2ecf20Sopenharmony_ci // [see discussion in poly1305-armv4 module] 7048c2ecf20Sopenharmony_ci 7058c2ecf20Sopenharmony_ci ushr $T0.2d,$ACC3,#26 7068c2ecf20Sopenharmony_ci xtn $H3,$ACC3 7078c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC0,#26 7088c2ecf20Sopenharmony_ci and $ACC0,$ACC0,$MASK.2d 7098c2ecf20Sopenharmony_ci add $ACC4,$ACC4,$T0.2d // h3 -> h4 7108c2ecf20Sopenharmony_ci bic $H3,#0xfc,lsl#24 // &=0x03ffffff 7118c2ecf20Sopenharmony_ci add $ACC1,$ACC1,$T1.2d // h0 -> h1 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci ushr $T0.2d,$ACC4,#26 7148c2ecf20Sopenharmony_ci xtn $H4,$ACC4 7158c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC1,#26 7168c2ecf20Sopenharmony_ci xtn $H1,$ACC1 7178c2ecf20Sopenharmony_ci bic $H4,#0xfc,lsl#24 7188c2ecf20Sopenharmony_ci add $ACC2,$ACC2,$T1.2d // h1 -> h2 7198c2ecf20Sopenharmony_ci 7208c2ecf20Sopenharmony_ci add $ACC0,$ACC0,$T0.2d 7218c2ecf20Sopenharmony_ci shl $T0.2d,$T0.2d,#2 7228c2ecf20Sopenharmony_ci shrn $T1.2s,$ACC2,#26 7238c2ecf20Sopenharmony_ci xtn $H2,$ACC2 7248c2ecf20Sopenharmony_ci add $ACC0,$ACC0,$T0.2d // h4 -> h0 7258c2ecf20Sopenharmony_ci bic $H1,#0xfc,lsl#24 7268c2ecf20Sopenharmony_ci add $H3,$H3,$T1.2s // h2 -> h3 7278c2ecf20Sopenharmony_ci bic $H2,#0xfc,lsl#24 7288c2ecf20Sopenharmony_ci 7298c2ecf20Sopenharmony_ci shrn $T0.2s,$ACC0,#26 7308c2ecf20Sopenharmony_ci xtn $H0,$ACC0 7318c2ecf20Sopenharmony_ci ushr $T1.2s,$H3,#26 7328c2ecf20Sopenharmony_ci bic $H3,#0xfc,lsl#24 7338c2ecf20Sopenharmony_ci bic $H0,#0xfc,lsl#24 7348c2ecf20Sopenharmony_ci add $H1,$H1,$T0.2s // h0 -> h1 7358c2ecf20Sopenharmony_ci add $H4,$H4,$T1.2s // h3 -> h4 7368c2ecf20Sopenharmony_ci 7378c2ecf20Sopenharmony_ci 
b.hi .Loop_neon 7388c2ecf20Sopenharmony_ci 7398c2ecf20Sopenharmony_ci.Lskip_loop: 7408c2ecf20Sopenharmony_ci dup $IN23_2,${IN23_2}[0] 7418c2ecf20Sopenharmony_ci add $IN01_2,$IN01_2,$H2 7428c2ecf20Sopenharmony_ci 7438c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 7448c2ecf20Sopenharmony_ci // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 7458c2ecf20Sopenharmony_ci 7468c2ecf20Sopenharmony_ci adds $len,$len,#32 7478c2ecf20Sopenharmony_ci b.ne .Long_tail 7488c2ecf20Sopenharmony_ci 7498c2ecf20Sopenharmony_ci dup $IN23_2,${IN01_2}[0] 7508c2ecf20Sopenharmony_ci add $IN23_0,$IN01_0,$H0 7518c2ecf20Sopenharmony_ci add $IN23_3,$IN01_3,$H3 7528c2ecf20Sopenharmony_ci add $IN23_1,$IN01_1,$H1 7538c2ecf20Sopenharmony_ci add $IN23_4,$IN01_4,$H4 7548c2ecf20Sopenharmony_ci 7558c2ecf20Sopenharmony_ci.Long_tail: 7568c2ecf20Sopenharmony_ci dup $IN23_0,${IN23_0}[0] 7578c2ecf20Sopenharmony_ci umull2 $ACC0,$IN23_2,${S3} 7588c2ecf20Sopenharmony_ci umull2 $ACC3,$IN23_2,${R1} 7598c2ecf20Sopenharmony_ci umull2 $ACC4,$IN23_2,${R2} 7608c2ecf20Sopenharmony_ci umull2 $ACC2,$IN23_2,${R0} 7618c2ecf20Sopenharmony_ci umull2 $ACC1,$IN23_2,${S4} 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_ci dup $IN23_1,${IN23_1}[0] 7648c2ecf20Sopenharmony_ci umlal2 $ACC0,$IN23_0,${R0} 7658c2ecf20Sopenharmony_ci umlal2 $ACC2,$IN23_0,${R2} 7668c2ecf20Sopenharmony_ci umlal2 $ACC3,$IN23_0,${R3} 7678c2ecf20Sopenharmony_ci umlal2 $ACC4,$IN23_0,${R4} 7688c2ecf20Sopenharmony_ci umlal2 $ACC1,$IN23_0,${R1} 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_ci dup $IN23_3,${IN23_3}[0] 7718c2ecf20Sopenharmony_ci umlal2 $ACC0,$IN23_1,${S4} 7728c2ecf20Sopenharmony_ci umlal2 $ACC3,$IN23_1,${R2} 7738c2ecf20Sopenharmony_ci umlal2 $ACC2,$IN23_1,${R1} 7748c2ecf20Sopenharmony_ci umlal2 $ACC4,$IN23_1,${R3} 7758c2ecf20Sopenharmony_ci umlal2 $ACC1,$IN23_1,${R0} 7768c2ecf20Sopenharmony_ci 7778c2ecf20Sopenharmony_ci dup $IN23_4,${IN23_4}[0] 7788c2ecf20Sopenharmony_ci umlal2 $ACC3,$IN23_3,${R0} 
7798c2ecf20Sopenharmony_ci umlal2 $ACC4,$IN23_3,${R1} 7808c2ecf20Sopenharmony_ci umlal2 $ACC0,$IN23_3,${S2} 7818c2ecf20Sopenharmony_ci umlal2 $ACC1,$IN23_3,${S3} 7828c2ecf20Sopenharmony_ci umlal2 $ACC2,$IN23_3,${S4} 7838c2ecf20Sopenharmony_ci 7848c2ecf20Sopenharmony_ci umlal2 $ACC3,$IN23_4,${S4} 7858c2ecf20Sopenharmony_ci umlal2 $ACC0,$IN23_4,${S1} 7868c2ecf20Sopenharmony_ci umlal2 $ACC4,$IN23_4,${R0} 7878c2ecf20Sopenharmony_ci umlal2 $ACC1,$IN23_4,${S2} 7888c2ecf20Sopenharmony_ci umlal2 $ACC2,$IN23_4,${S3} 7898c2ecf20Sopenharmony_ci 7908c2ecf20Sopenharmony_ci b.eq .Lshort_tail 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 7938c2ecf20Sopenharmony_ci // (hash+inp[0:1])*r^4:r^3 and accumulate 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci add $IN01_0,$IN01_0,$H0 7968c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_2,${R1} 7978c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_2,${S3} 7988c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_2,${R2} 7998c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_2,${S4} 8008c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_2,${R0} 8018c2ecf20Sopenharmony_ci 8028c2ecf20Sopenharmony_ci add $IN01_1,$IN01_1,$H1 8038c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_0,${R3} 8048c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_0,${R0} 8058c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_0,${R4} 8068c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_0,${R1} 8078c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_0,${R2} 8088c2ecf20Sopenharmony_ci 8098c2ecf20Sopenharmony_ci add $IN01_3,$IN01_3,$H3 8108c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_1,${R2} 8118c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_1,${S4} 8128c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_1,${R3} 8138c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_1,${R0} 8148c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_1,${R1} 8158c2ecf20Sopenharmony_ci 8168c2ecf20Sopenharmony_ci add $IN01_4,$IN01_4,$H4 8178c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_3,${R0} 8188c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_3,${S2} 
8198c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_3,${R1} 8208c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_3,${S3} 8218c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_3,${S4} 8228c2ecf20Sopenharmony_ci 8238c2ecf20Sopenharmony_ci umlal $ACC3,$IN01_4,${S4} 8248c2ecf20Sopenharmony_ci umlal $ACC0,$IN01_4,${S1} 8258c2ecf20Sopenharmony_ci umlal $ACC4,$IN01_4,${R0} 8268c2ecf20Sopenharmony_ci umlal $ACC1,$IN01_4,${S2} 8278c2ecf20Sopenharmony_ci umlal $ACC2,$IN01_4,${S3} 8288c2ecf20Sopenharmony_ci 8298c2ecf20Sopenharmony_ci.Lshort_tail: 8308c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 8318c2ecf20Sopenharmony_ci // horizontal add 8328c2ecf20Sopenharmony_ci 8338c2ecf20Sopenharmony_ci addp $ACC3,$ACC3,$ACC3 8348c2ecf20Sopenharmony_ci ldp d8,d9,[sp,#16] // meet ABI requirements 8358c2ecf20Sopenharmony_ci addp $ACC0,$ACC0,$ACC0 8368c2ecf20Sopenharmony_ci ldp d10,d11,[sp,#32] 8378c2ecf20Sopenharmony_ci addp $ACC4,$ACC4,$ACC4 8388c2ecf20Sopenharmony_ci ldp d12,d13,[sp,#48] 8398c2ecf20Sopenharmony_ci addp $ACC1,$ACC1,$ACC1 8408c2ecf20Sopenharmony_ci ldp d14,d15,[sp,#64] 8418c2ecf20Sopenharmony_ci addp $ACC2,$ACC2,$ACC2 8428c2ecf20Sopenharmony_ci ldr x30,[sp,#8] 8438c2ecf20Sopenharmony_ci 8448c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 8458c2ecf20Sopenharmony_ci // lazy reduction, but without narrowing 8468c2ecf20Sopenharmony_ci 8478c2ecf20Sopenharmony_ci ushr $T0.2d,$ACC3,#26 8488c2ecf20Sopenharmony_ci and $ACC3,$ACC3,$MASK.2d 8498c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC0,#26 8508c2ecf20Sopenharmony_ci and $ACC0,$ACC0,$MASK.2d 8518c2ecf20Sopenharmony_ci 8528c2ecf20Sopenharmony_ci add $ACC4,$ACC4,$T0.2d // h3 -> h4 8538c2ecf20Sopenharmony_ci add $ACC1,$ACC1,$T1.2d // h0 -> h1 8548c2ecf20Sopenharmony_ci 8558c2ecf20Sopenharmony_ci ushr $T0.2d,$ACC4,#26 8568c2ecf20Sopenharmony_ci and $ACC4,$ACC4,$MASK.2d 8578c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC1,#26 8588c2ecf20Sopenharmony_ci and $ACC1,$ACC1,$MASK.2d 
8598c2ecf20Sopenharmony_ci add $ACC2,$ACC2,$T1.2d // h1 -> h2 8608c2ecf20Sopenharmony_ci 8618c2ecf20Sopenharmony_ci add $ACC0,$ACC0,$T0.2d 8628c2ecf20Sopenharmony_ci shl $T0.2d,$T0.2d,#2 8638c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC2,#26 8648c2ecf20Sopenharmony_ci and $ACC2,$ACC2,$MASK.2d 8658c2ecf20Sopenharmony_ci add $ACC0,$ACC0,$T0.2d // h4 -> h0 8668c2ecf20Sopenharmony_ci add $ACC3,$ACC3,$T1.2d // h2 -> h3 8678c2ecf20Sopenharmony_ci 8688c2ecf20Sopenharmony_ci ushr $T0.2d,$ACC0,#26 8698c2ecf20Sopenharmony_ci and $ACC0,$ACC0,$MASK.2d 8708c2ecf20Sopenharmony_ci ushr $T1.2d,$ACC3,#26 8718c2ecf20Sopenharmony_ci and $ACC3,$ACC3,$MASK.2d 8728c2ecf20Sopenharmony_ci add $ACC1,$ACC1,$T0.2d // h0 -> h1 8738c2ecf20Sopenharmony_ci add $ACC4,$ACC4,$T1.2d // h3 -> h4 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci //////////////////////////////////////////////////////////////// 8768c2ecf20Sopenharmony_ci // write the result, can be partially reduced 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 8798c2ecf20Sopenharmony_ci mov x4,#1 8808c2ecf20Sopenharmony_ci st1 {$ACC4}[0],[$ctx] 8818c2ecf20Sopenharmony_ci str x4,[$ctx,#8] // set is_base2_26 8828c2ecf20Sopenharmony_ci 8838c2ecf20Sopenharmony_ci ldr x29,[sp],#80 8848c2ecf20Sopenharmony_ci .inst 0xd50323bf // autiasp 8858c2ecf20Sopenharmony_ci ret 8868c2ecf20Sopenharmony_ci.size poly1305_blocks_neon,.-poly1305_blocks_neon 8878c2ecf20Sopenharmony_ci 8888c2ecf20Sopenharmony_ci.align 5 8898c2ecf20Sopenharmony_ci.Lzeros: 8908c2ecf20Sopenharmony_ci.long 0,0,0,0,0,0,0,0 8918c2ecf20Sopenharmony_ci.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" 8928c2ecf20Sopenharmony_ci.align 2 8938c2ecf20Sopenharmony_ci#if !defined(__KERNEL__) && !defined(_WIN64) 8948c2ecf20Sopenharmony_ci.comm OPENSSL_armcap_P,4,4 8958c2ecf20Sopenharmony_ci.hidden OPENSSL_armcap_P 8968c2ecf20Sopenharmony_ci#endif 8978c2ecf20Sopenharmony_ci___ 8988c2ecf20Sopenharmony_ci 8998c2ecf20Sopenharmony_ciforeach 
(split("\n",$code)) {
    # Post-process the generated assembly one line at a time ($_ holds
    # the current line) to adjust the register arrangement suffixes, then
    # print the result to STDOUT.
    #
    # The rules below are chained with "or", so evaluation stops at the
    # first rule that yields a true value.  The mnemonic-guarded rules
    # end in "or 1": once the mnemonic matches, the chain stops even if
    # the substitution itself changed nothing.

    # shrn: a .2d/.4d suffix on the first vector operand becomes .2s
    s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
    # fmov vN<suffix>,xM  ->  fmov dN,xM
    s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
    # dup: every .2s/.4s suffix becomes .2d
    (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
    # lines with a word starting with eor/and: suffixes become .16b
    (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
    # umull/umlal: .4s operands become .2s
    (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
    # umull2/umlal2: .2s operands become .4s
    (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
    # st1..st4 with an indexed register list "{...}[": .2d/.4d become .s
    (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

    # Applied to every line: drop the lane count in front of the first
    # element index (".2d[" -> ".d["), and turn "w#xN" into "wN".
    s/\.[124]([sd])\[/.$1\[/;
    s/w#x([0-9]+)/w$1/g;

    print $_,"\n";
}

# STDOUT is either a pipe into arm-xlate.pl or the output file itself (see
# the open calls at the top of the script).  Buffered write errors and a
# failing translator are only reported by close, so its result must be
# checked.  For a piped handle, close also fails when the child exits
# non-zero; in that case $! is 0 and the status is in $?.
close STDOUT
    or die "error closing STDOUT: "
         . ($! ? $! : "filter exited with status " . ($? >> 8));