#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;

# Command line: [flavour] [output].  An argument that looks like a file
# name ("name.ext") is taken as the output file.  If the first argument
# already looks like one, there is no flavour; otherwise the remaining
# arguments are scanned for the first file-name-like one.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # A real flavour was requested: locate arm-xlate.pl next to this
    # script, or in ../../perlasm/, and filter everything printed to
    # STDOUT through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly if the translator cannot be started; otherwise the
    # untranslated assembly would silently go to the original STDOUT.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} else {
    # No translation: write straight to the output file.  The result is
    # intentionally left unchecked, as in the original.
    # NOTE(review): callers appear to be able to run this script without
    # an output argument and redirect stdout instead - confirm before
    # adding an "or die" here.
    open STDOUT,">$output";
}

# Register names of the four arguments shared by poly1305_init and
# poly1305_blocks (r0..r3).
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

# poly1305_init(ctx=r0, key=r1[, func table=r2])
#
# Zeroes the five hash words at ctx+0..16 and the is_base2_26 flag at
# ctx+36.  With a NULL key pointer it returns 0 immediately.  Otherwise
# it loads the 16 key bytes one at a time, masks the four resulting
# words with 0x0fffffff / 0x0ffffffc and stores them at ctx+20..32.
# When built for ARMv7+ outside the kernel it also checks
# OPENSSL_armcap_P for NEON, stores the chosen blocks/emit entry points
# through r2 and returns 1; in every other configuration it returns 0.
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init   poly1305_init_arm
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit   poly1305_emit_arm
.globl	poly1305_blocks_neon
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ clear is_base2_26
	add	$ctx,$ctx,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	mov	r3,#-1
	str	r3,[$ctx,#28]		@ impossible key power value
# ifndef __KERNEL__
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
# endif
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	it	ne
	movne	r11,r9
	adr	r12,.Lpoly1305_emit
	orr	r11,r11,#1		@ thumb-ify addresses
	orr	r12,r12,#1
# else
	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	ite	eq
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
# poly1305_blocks(ctx=r0, inp=r1, len=r2, padbit=r3)
#
# Scalar block function.  The hash h0..h4 lives in r4..r8 and the key
# r0..r3 in r9..r12.  s1..s3 share registers with r1..r3: each is
# computed as r + (r >> 2), and the plain key word is reloaded from the
# stack when it is needed again.  len is rounded down to a multiple of
# 16 and the function returns at once if nothing is left.  On ARMv7+
# the stored hash is first converted from base 2^26 to base 2^32 when
# is_base2_26 (ctx+36) is set, and that flag is cleared.
#
# NOTE(review): the "hi" condition that adds the pad bit comes from
# "cmp padbit,#0" on the first iteration only; on later iterations it
# comes from "cmp r0,lr" / "bhi .Loop" and is therefore always true.
# Presumably padbit==0 is only ever passed together with a single
# block - confirm against the callers.
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

#if __ARM_ARCH__<7
	ldmia	$ctx,{$h0-$r3}		@ load context
	add	$ctx,$ctx,#20
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]
#else
	ldr	lr,[$ctx,#36]		@ is_base2_26
	ldmia	$ctx!,{$h0-$h4}		@ load hash value
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]

	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$r1,$h1,lsr#6
	adcs	$r1,$r1,$h2,lsl#20
	mov	$r2,$h2,lsr#12
	adcs	$r2,$r2,$h3,lsl#14
	mov	$r3,$h3,lsr#18
	adcs	$r3,$r3,$h4,lsl#8
	mov	$len,#0
	teq	lr,#0
	str	$len,[$ctx,#16]		@ clear is_base2_26
	adc	$len,$len,$h4,lsr#24

	itttt	ne
	movne	$h0,$r0			@ choose between radixes
	movne	$h1,$r1
	movne	$h2,$r2
	movne	$h3,$r3
	ldmia	$ctx,{$r0-$r3}		@ load key
	it	ne
	movne	$h4,$len
#endif

	mov	lr,$inp
	cmp	$padbit,#0
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.align	4
.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
	it	hi
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2

	umull	r2,r3,$h1,$r0
	 adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	 mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	 eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmdb	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
# poly1305_emit(ctx=r0, mac=r1, nonce=r2)
#
# Loads h0..h4 (r3..r7); on ARMv7+ converts them from base 2^26 when
# is_base2_26 (ctx+36) is set.  It then computes g = h + 5 and, if bit 2
# of the top word is set, uses g in place of h.  The four nonce words
# are added and the low 128 bits are stored to mac, with whole-word
# stores on ARMv7+ and byte by byte otherwise.  g4 reuses the ctx
# register (r0), which is no longer needed once the context has been
# read.
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$ctx;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}

	ldmia	$ctx,{$h0-$h4}

#if __ARM_ARCH__>=7
	ldr	ip,[$ctx,#36]		@ is_base2_26

	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$g1,$h1,lsr#6
	adcs	$g1,$g1,$h2,lsl#20
	mov	$g2,$h2,lsr#12
	adcs	$g2,$g2,$h3,lsl#14
	mov	$g3,$h3,lsr#18
	adcs	$g3,$g3,$h4,lsl#8
	mov	$g4,#0
	adc	$g4,$g4,$h4,lsr#24

	tst	ip,ip
	itttt	ne
	movne	$h0,$g0
	movne	$h1,$g1
	movne	$h2,$g2
	movne	$h3,$g3
	it	ne
	movne	$h4,$g4
#endif

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
# NEON register allocation.  The key powers R0..R4 and their *5
# multiples S1..S4 are 64-bit d registers; the accumulators D0..D4 and
# hash limbs H0..H4 are 128-bit q registers.  The first map yields ten
# names (d0..d9) for nine variables, so d9 is left unassigned here.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

50562306a36Sopenharmony_ci.type poly1305_init_neon,%function 50662306a36Sopenharmony_ci.align 5 50762306a36Sopenharmony_cipoly1305_init_neon: 50862306a36Sopenharmony_ci.Lpoly1305_init_neon: 50962306a36Sopenharmony_ci ldr r3,[$ctx,#48] @ first table element 51062306a36Sopenharmony_ci cmp r3,#-1 @ is value impossible? 51162306a36Sopenharmony_ci bne .Lno_init_neon 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_ci ldr r4,[$ctx,#20] @ load key base 2^32 51462306a36Sopenharmony_ci ldr r5,[$ctx,#24] 51562306a36Sopenharmony_ci ldr r6,[$ctx,#28] 51662306a36Sopenharmony_ci ldr r7,[$ctx,#32] 51762306a36Sopenharmony_ci 51862306a36Sopenharmony_ci and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 51962306a36Sopenharmony_ci mov r3,r4,lsr#26 52062306a36Sopenharmony_ci mov r4,r5,lsr#20 52162306a36Sopenharmony_ci orr r3,r3,r5,lsl#6 52262306a36Sopenharmony_ci mov r5,r6,lsr#14 52362306a36Sopenharmony_ci orr r4,r4,r6,lsl#12 52462306a36Sopenharmony_ci mov r6,r7,lsr#8 52562306a36Sopenharmony_ci orr r5,r5,r7,lsl#18 52662306a36Sopenharmony_ci and r3,r3,#0x03ffffff 52762306a36Sopenharmony_ci and r4,r4,#0x03ffffff 52862306a36Sopenharmony_ci and r5,r5,#0x03ffffff 52962306a36Sopenharmony_ci 53062306a36Sopenharmony_ci vdup.32 $R0,r2 @ r^1 in both lanes 53162306a36Sopenharmony_ci add r2,r3,r3,lsl#2 @ *5 53262306a36Sopenharmony_ci vdup.32 $R1,r3 53362306a36Sopenharmony_ci add r3,r4,r4,lsl#2 53462306a36Sopenharmony_ci vdup.32 $S1,r2 53562306a36Sopenharmony_ci vdup.32 $R2,r4 53662306a36Sopenharmony_ci add r4,r5,r5,lsl#2 53762306a36Sopenharmony_ci vdup.32 $S2,r3 53862306a36Sopenharmony_ci vdup.32 $R3,r5 53962306a36Sopenharmony_ci add r5,r6,r6,lsl#2 54062306a36Sopenharmony_ci vdup.32 $S3,r4 54162306a36Sopenharmony_ci vdup.32 $R4,r6 54262306a36Sopenharmony_ci vdup.32 $S4,r5 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_ci mov $zeros,#2 @ counter 54562306a36Sopenharmony_ci 54662306a36Sopenharmony_ci.Lsquare_neon: 54762306a36Sopenharmony_ci 
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 54862306a36Sopenharmony_ci @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 54962306a36Sopenharmony_ci @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 55062306a36Sopenharmony_ci @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 55162306a36Sopenharmony_ci @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 55262306a36Sopenharmony_ci @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci vmull.u32 $D0,$R0,${R0}[1] 55562306a36Sopenharmony_ci vmull.u32 $D1,$R1,${R0}[1] 55662306a36Sopenharmony_ci vmull.u32 $D2,$R2,${R0}[1] 55762306a36Sopenharmony_ci vmull.u32 $D3,$R3,${R0}[1] 55862306a36Sopenharmony_ci vmull.u32 $D4,$R4,${R0}[1] 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_ci vmlal.u32 $D0,$R4,${S1}[1] 56162306a36Sopenharmony_ci vmlal.u32 $D1,$R0,${R1}[1] 56262306a36Sopenharmony_ci vmlal.u32 $D2,$R1,${R1}[1] 56362306a36Sopenharmony_ci vmlal.u32 $D3,$R2,${R1}[1] 56462306a36Sopenharmony_ci vmlal.u32 $D4,$R3,${R1}[1] 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_ci vmlal.u32 $D0,$R3,${S2}[1] 56762306a36Sopenharmony_ci vmlal.u32 $D1,$R4,${S2}[1] 56862306a36Sopenharmony_ci vmlal.u32 $D3,$R1,${R2}[1] 56962306a36Sopenharmony_ci vmlal.u32 $D2,$R0,${R2}[1] 57062306a36Sopenharmony_ci vmlal.u32 $D4,$R2,${R2}[1] 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci vmlal.u32 $D0,$R2,${S3}[1] 57362306a36Sopenharmony_ci vmlal.u32 $D3,$R0,${R3}[1] 57462306a36Sopenharmony_ci vmlal.u32 $D1,$R3,${S3}[1] 57562306a36Sopenharmony_ci vmlal.u32 $D2,$R4,${S3}[1] 57662306a36Sopenharmony_ci vmlal.u32 $D4,$R1,${R3}[1] 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_ci vmlal.u32 $D3,$R4,${S4}[1] 57962306a36Sopenharmony_ci vmlal.u32 $D0,$R1,${S4}[1] 58062306a36Sopenharmony_ci vmlal.u32 $D1,$R2,${S4}[1] 58162306a36Sopenharmony_ci vmlal.u32 $D2,$R3,${S4}[1] 58262306a36Sopenharmony_ci vmlal.u32 $D4,$R0,${R4}[1] 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci 
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of n-bit number by m-bit number is
	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
	@ m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that the result of reduction has to be compressed
	@ upon loop wrap-around. This can be done in the process of
	@ reduction to minimize the number of instructions [as well as
	@ the number of 128-bit instructions, which benefits low-end
	@ processors], but one has to watch for H2 (which is narrower
	@ than H0) and 5*H4 not being wider than 58 bits, so that result
	@ of right shift by 26 bits fits in 32 bits. This is also useful
	@ on x86, because it allows using paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.
63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci vshr.u64 $T0,$D3,#26 63462306a36Sopenharmony_ci vmovn.i64 $D3#lo,$D3 63562306a36Sopenharmony_ci vshr.u64 $T1,$D0,#26 63662306a36Sopenharmony_ci vmovn.i64 $D0#lo,$D0 63762306a36Sopenharmony_ci vadd.i64 $D4,$D4,$T0 @ h3 -> h4 63862306a36Sopenharmony_ci vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff 63962306a36Sopenharmony_ci vadd.i64 $D1,$D1,$T1 @ h0 -> h1 64062306a36Sopenharmony_ci vbic.i32 $D0#lo,#0xfc000000 64162306a36Sopenharmony_ci 64262306a36Sopenharmony_ci vshrn.u64 $T0#lo,$D4,#26 64362306a36Sopenharmony_ci vmovn.i64 $D4#lo,$D4 64462306a36Sopenharmony_ci vshr.u64 $T1,$D1,#26 64562306a36Sopenharmony_ci vmovn.i64 $D1#lo,$D1 64662306a36Sopenharmony_ci vadd.i64 $D2,$D2,$T1 @ h1 -> h2 64762306a36Sopenharmony_ci vbic.i32 $D4#lo,#0xfc000000 64862306a36Sopenharmony_ci vbic.i32 $D1#lo,#0xfc000000 64962306a36Sopenharmony_ci 65062306a36Sopenharmony_ci vadd.i32 $D0#lo,$D0#lo,$T0#lo 65162306a36Sopenharmony_ci vshl.u32 $T0#lo,$T0#lo,#2 65262306a36Sopenharmony_ci vshrn.u64 $T1#lo,$D2,#26 65362306a36Sopenharmony_ci vmovn.i64 $D2#lo,$D2 65462306a36Sopenharmony_ci vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 65562306a36Sopenharmony_ci vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 65662306a36Sopenharmony_ci vbic.i32 $D2#lo,#0xfc000000 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci vshr.u32 $T0#lo,$D0#lo,#26 65962306a36Sopenharmony_ci vbic.i32 $D0#lo,#0xfc000000 66062306a36Sopenharmony_ci vshr.u32 $T1#lo,$D3#lo,#26 66162306a36Sopenharmony_ci vbic.i32 $D3#lo,#0xfc000000 66262306a36Sopenharmony_ci vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 66362306a36Sopenharmony_ci vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 66462306a36Sopenharmony_ci 66562306a36Sopenharmony_ci subs $zeros,$zeros,#1 66662306a36Sopenharmony_ci beq .Lsquare_break_neon 66762306a36Sopenharmony_ci 66862306a36Sopenharmony_ci add $tbl0,$ctx,#(48+0*9*4) 66962306a36Sopenharmony_ci add $tbl1,$ctx,#(48+1*9*4) 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci vtrn.32 $R0,$D0#lo @ 
r^2:r^1 67262306a36Sopenharmony_ci vtrn.32 $R2,$D2#lo 67362306a36Sopenharmony_ci vtrn.32 $R3,$D3#lo 67462306a36Sopenharmony_ci vtrn.32 $R1,$D1#lo 67562306a36Sopenharmony_ci vtrn.32 $R4,$D4#lo 67662306a36Sopenharmony_ci 67762306a36Sopenharmony_ci vshl.u32 $S2,$R2,#2 @ *5 67862306a36Sopenharmony_ci vshl.u32 $S3,$R3,#2 67962306a36Sopenharmony_ci vshl.u32 $S1,$R1,#2 68062306a36Sopenharmony_ci vshl.u32 $S4,$R4,#2 68162306a36Sopenharmony_ci vadd.i32 $S2,$S2,$R2 68262306a36Sopenharmony_ci vadd.i32 $S1,$S1,$R1 68362306a36Sopenharmony_ci vadd.i32 $S3,$S3,$R3 68462306a36Sopenharmony_ci vadd.i32 $S4,$S4,$R4 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 68762306a36Sopenharmony_ci vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 68862306a36Sopenharmony_ci vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 68962306a36Sopenharmony_ci vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 69062306a36Sopenharmony_ci vst1.32 {${S4}[0]},[$tbl0,:32] 69162306a36Sopenharmony_ci vst1.32 {${S4}[1]},[$tbl1,:32] 69262306a36Sopenharmony_ci 69362306a36Sopenharmony_ci b .Lsquare_neon 69462306a36Sopenharmony_ci 69562306a36Sopenharmony_ci.align 4 69662306a36Sopenharmony_ci.Lsquare_break_neon: 69762306a36Sopenharmony_ci add $tbl0,$ctx,#(48+2*4*9) 69862306a36Sopenharmony_ci add $tbl1,$ctx,#(48+3*4*9) 69962306a36Sopenharmony_ci 70062306a36Sopenharmony_ci vmov $R0,$D0#lo @ r^4:r^3 70162306a36Sopenharmony_ci vshl.u32 $S1,$D1#lo,#2 @ *5 70262306a36Sopenharmony_ci vmov $R1,$D1#lo 70362306a36Sopenharmony_ci vshl.u32 $S2,$D2#lo,#2 70462306a36Sopenharmony_ci vmov $R2,$D2#lo 70562306a36Sopenharmony_ci vshl.u32 $S3,$D3#lo,#2 70662306a36Sopenharmony_ci vmov $R3,$D3#lo 70762306a36Sopenharmony_ci vshl.u32 $S4,$D4#lo,#2 70862306a36Sopenharmony_ci vmov $R4,$D4#lo 70962306a36Sopenharmony_ci vadd.i32 $S1,$S1,$D1#lo 71062306a36Sopenharmony_ci vadd.i32 $S2,$S2,$D2#lo 71162306a36Sopenharmony_ci vadd.i32 $S3,$S3,$D3#lo 71262306a36Sopenharmony_ci 
vadd.i32 $S4,$S4,$D4#lo 71362306a36Sopenharmony_ci 71462306a36Sopenharmony_ci vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 71562306a36Sopenharmony_ci vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! 71662306a36Sopenharmony_ci vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 71762306a36Sopenharmony_ci vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 71862306a36Sopenharmony_ci vst1.32 {${S4}[0]},[$tbl0] 71962306a36Sopenharmony_ci vst1.32 {${S4}[1]},[$tbl1] 72062306a36Sopenharmony_ci 72162306a36Sopenharmony_ci.Lno_init_neon: 72262306a36Sopenharmony_ci ret @ bx lr 72362306a36Sopenharmony_ci.size poly1305_init_neon,.-poly1305_init_neon 72462306a36Sopenharmony_ci 72562306a36Sopenharmony_ci.type poly1305_blocks_neon,%function 72662306a36Sopenharmony_ci.align 5 72762306a36Sopenharmony_cipoly1305_blocks_neon: 72862306a36Sopenharmony_ci.Lpoly1305_blocks_neon: 72962306a36Sopenharmony_ci ldr ip,[$ctx,#36] @ is_base2_26 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_ci cmp $len,#64 73262306a36Sopenharmony_ci blo .Lpoly1305_blocks 73362306a36Sopenharmony_ci 73462306a36Sopenharmony_ci stmdb sp!,{r4-r7} 73562306a36Sopenharmony_ci vstmdb sp!,{d8-d15} @ ABI specification says so 73662306a36Sopenharmony_ci 73762306a36Sopenharmony_ci tst ip,ip @ is_base2_26? 
73862306a36Sopenharmony_ci bne .Lbase2_26_neon 73962306a36Sopenharmony_ci 74062306a36Sopenharmony_ci stmdb sp!,{r1-r3,lr} 74162306a36Sopenharmony_ci bl .Lpoly1305_init_neon 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci ldr r4,[$ctx,#0] @ load hash value base 2^32 74462306a36Sopenharmony_ci ldr r5,[$ctx,#4] 74562306a36Sopenharmony_ci ldr r6,[$ctx,#8] 74662306a36Sopenharmony_ci ldr r7,[$ctx,#12] 74762306a36Sopenharmony_ci ldr ip,[$ctx,#16] 74862306a36Sopenharmony_ci 74962306a36Sopenharmony_ci and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 75062306a36Sopenharmony_ci mov r3,r4,lsr#26 75162306a36Sopenharmony_ci veor $D0#lo,$D0#lo,$D0#lo 75262306a36Sopenharmony_ci mov r4,r5,lsr#20 75362306a36Sopenharmony_ci orr r3,r3,r5,lsl#6 75462306a36Sopenharmony_ci veor $D1#lo,$D1#lo,$D1#lo 75562306a36Sopenharmony_ci mov r5,r6,lsr#14 75662306a36Sopenharmony_ci orr r4,r4,r6,lsl#12 75762306a36Sopenharmony_ci veor $D2#lo,$D2#lo,$D2#lo 75862306a36Sopenharmony_ci mov r6,r7,lsr#8 75962306a36Sopenharmony_ci orr r5,r5,r7,lsl#18 76062306a36Sopenharmony_ci veor $D3#lo,$D3#lo,$D3#lo 76162306a36Sopenharmony_ci and r3,r3,#0x03ffffff 76262306a36Sopenharmony_ci orr r6,r6,ip,lsl#24 76362306a36Sopenharmony_ci veor $D4#lo,$D4#lo,$D4#lo 76462306a36Sopenharmony_ci and r4,r4,#0x03ffffff 76562306a36Sopenharmony_ci mov r1,#1 76662306a36Sopenharmony_ci and r5,r5,#0x03ffffff 76762306a36Sopenharmony_ci str r1,[$ctx,#36] @ set is_base2_26 76862306a36Sopenharmony_ci 76962306a36Sopenharmony_ci vmov.32 $D0#lo[0],r2 77062306a36Sopenharmony_ci vmov.32 $D1#lo[0],r3 77162306a36Sopenharmony_ci vmov.32 $D2#lo[0],r4 77262306a36Sopenharmony_ci vmov.32 $D3#lo[0],r5 77362306a36Sopenharmony_ci vmov.32 $D4#lo[0],r6 77462306a36Sopenharmony_ci adr $zeros,.Lzeros 77562306a36Sopenharmony_ci 77662306a36Sopenharmony_ci ldmia sp!,{r1-r3,lr} 77762306a36Sopenharmony_ci b .Lhash_loaded 77862306a36Sopenharmony_ci 77962306a36Sopenharmony_ci.align 4 78062306a36Sopenharmony_ci.Lbase2_26_neon: 78162306a36Sopenharmony_ci 
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 78262306a36Sopenharmony_ci @ load hash value 78362306a36Sopenharmony_ci 78462306a36Sopenharmony_ci veor $D0#lo,$D0#lo,$D0#lo 78562306a36Sopenharmony_ci veor $D1#lo,$D1#lo,$D1#lo 78662306a36Sopenharmony_ci veor $D2#lo,$D2#lo,$D2#lo 78762306a36Sopenharmony_ci veor $D3#lo,$D3#lo,$D3#lo 78862306a36Sopenharmony_ci veor $D4#lo,$D4#lo,$D4#lo 78962306a36Sopenharmony_ci vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 79062306a36Sopenharmony_ci adr $zeros,.Lzeros 79162306a36Sopenharmony_ci vld1.32 {$D4#lo[0]},[$ctx] 79262306a36Sopenharmony_ci sub $ctx,$ctx,#16 @ rewind 79362306a36Sopenharmony_ci 79462306a36Sopenharmony_ci.Lhash_loaded: 79562306a36Sopenharmony_ci add $in2,$inp,#32 79662306a36Sopenharmony_ci mov $padbit,$padbit,lsl#24 79762306a36Sopenharmony_ci tst $len,#31 79862306a36Sopenharmony_ci beq .Leven 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! 80162306a36Sopenharmony_ci vmov.32 $H4#lo[0],$padbit 80262306a36Sopenharmony_ci sub $len,$len,#16 80362306a36Sopenharmony_ci add $in2,$inp,#32 80462306a36Sopenharmony_ci 80562306a36Sopenharmony_ci# ifdef __ARMEB__ 80662306a36Sopenharmony_ci vrev32.8 $H0,$H0 80762306a36Sopenharmony_ci vrev32.8 $H3,$H3 80862306a36Sopenharmony_ci vrev32.8 $H1,$H1 80962306a36Sopenharmony_ci vrev32.8 $H2,$H2 81062306a36Sopenharmony_ci# endif 81162306a36Sopenharmony_ci vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 81262306a36Sopenharmony_ci vshl.u32 $H3#lo,$H3#lo,#18 81362306a36Sopenharmony_ci 81462306a36Sopenharmony_ci vsri.u32 $H3#lo,$H2#lo,#14 81562306a36Sopenharmony_ci vshl.u32 $H2#lo,$H2#lo,#12 81662306a36Sopenharmony_ci vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi 81762306a36Sopenharmony_ci 81862306a36Sopenharmony_ci vbic.i32 $H3#lo,#0xfc000000 81962306a36Sopenharmony_ci vsri.u32 $H2#lo,$H1#lo,#20 82062306a36Sopenharmony_ci vshl.u32 $H1#lo,$H1#lo,#6 82162306a36Sopenharmony_ci 
82262306a36Sopenharmony_ci vbic.i32 $H2#lo,#0xfc000000 82362306a36Sopenharmony_ci vsri.u32 $H1#lo,$H0#lo,#26 82462306a36Sopenharmony_ci vadd.i32 $H3#hi,$H3#lo,$D3#lo 82562306a36Sopenharmony_ci 82662306a36Sopenharmony_ci vbic.i32 $H0#lo,#0xfc000000 82762306a36Sopenharmony_ci vbic.i32 $H1#lo,#0xfc000000 82862306a36Sopenharmony_ci vadd.i32 $H2#hi,$H2#lo,$D2#lo 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci vadd.i32 $H0#hi,$H0#lo,$D0#lo 83162306a36Sopenharmony_ci vadd.i32 $H1#hi,$H1#lo,$D1#lo 83262306a36Sopenharmony_ci 83362306a36Sopenharmony_ci mov $tbl1,$zeros 83462306a36Sopenharmony_ci add $tbl0,$ctx,#48 83562306a36Sopenharmony_ci 83662306a36Sopenharmony_ci cmp $len,$len 83762306a36Sopenharmony_ci b .Long_tail 83862306a36Sopenharmony_ci 83962306a36Sopenharmony_ci.align 4 84062306a36Sopenharmony_ci.Leven: 84162306a36Sopenharmony_ci subs $len,$len,#64 84262306a36Sopenharmony_ci it lo 84362306a36Sopenharmony_ci movlo $in2,$zeros 84462306a36Sopenharmony_ci 84562306a36Sopenharmony_ci vmov.i32 $H4,#1<<24 @ padbit, yes, always 84662306a36Sopenharmony_ci vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 84762306a36Sopenharmony_ci add $inp,$inp,#64 84862306a36Sopenharmony_ci vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 84962306a36Sopenharmony_ci add $in2,$in2,#64 85062306a36Sopenharmony_ci itt hi 85162306a36Sopenharmony_ci addhi $tbl1,$ctx,#(48+1*9*4) 85262306a36Sopenharmony_ci addhi $tbl0,$ctx,#(48+3*9*4) 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci# ifdef __ARMEB__ 85562306a36Sopenharmony_ci vrev32.8 $H0,$H0 85662306a36Sopenharmony_ci vrev32.8 $H3,$H3 85762306a36Sopenharmony_ci vrev32.8 $H1,$H1 85862306a36Sopenharmony_ci vrev32.8 $H2,$H2 85962306a36Sopenharmony_ci# endif 86062306a36Sopenharmony_ci vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 86162306a36Sopenharmony_ci vshl.u32 $H3,$H3,#18 86262306a36Sopenharmony_ci 86362306a36Sopenharmony_ci vsri.u32 $H3,$H2,#14 86462306a36Sopenharmony_ci vshl.u32 $H2,$H2,#12 
86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci vbic.i32 $H3,#0xfc000000 86762306a36Sopenharmony_ci vsri.u32 $H2,$H1,#20 86862306a36Sopenharmony_ci vshl.u32 $H1,$H1,#6 86962306a36Sopenharmony_ci 87062306a36Sopenharmony_ci vbic.i32 $H2,#0xfc000000 87162306a36Sopenharmony_ci vsri.u32 $H1,$H0,#26 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci vbic.i32 $H0,#0xfc000000 87462306a36Sopenharmony_ci vbic.i32 $H1,#0xfc000000 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci bls .Lskip_loop 87762306a36Sopenharmony_ci 87862306a36Sopenharmony_ci vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 87962306a36Sopenharmony_ci vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 88062306a36Sopenharmony_ci vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 88162306a36Sopenharmony_ci vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 88262306a36Sopenharmony_ci b .Loop_neon 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci.align 5 88562306a36Sopenharmony_ci.Loop_neon: 88662306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 88762306a36Sopenharmony_ci @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 88862306a36Sopenharmony_ci @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 88962306a36Sopenharmony_ci @ \___________________/ 89062306a36Sopenharmony_ci @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 89162306a36Sopenharmony_ci @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 89262306a36Sopenharmony_ci @ \___________________/ \____________________/ 89362306a36Sopenharmony_ci @ 89462306a36Sopenharmony_ci @ Note that we start with inp[2:3]*r^2. This is because it 89562306a36Sopenharmony_ci @ doesn't depend on reduction in previous iteration. 
89662306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 89762306a36Sopenharmony_ci @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 89862306a36Sopenharmony_ci @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 89962306a36Sopenharmony_ci @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 90062306a36Sopenharmony_ci @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 90162306a36Sopenharmony_ci @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 90262306a36Sopenharmony_ci 90362306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 90462306a36Sopenharmony_ci @ inp[2:3]*r^2 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] 90762306a36Sopenharmony_ci vmull.u32 $D2,$H2#hi,${R0}[1] 90862306a36Sopenharmony_ci vadd.i32 $H0#lo,$H0#lo,$D0#lo 90962306a36Sopenharmony_ci vmull.u32 $D0,$H0#hi,${R0}[1] 91062306a36Sopenharmony_ci vadd.i32 $H3#lo,$H3#lo,$D3#lo 91162306a36Sopenharmony_ci vmull.u32 $D3,$H3#hi,${R0}[1] 91262306a36Sopenharmony_ci vmlal.u32 $D2,$H1#hi,${R1}[1] 91362306a36Sopenharmony_ci vadd.i32 $H1#lo,$H1#lo,$D1#lo 91462306a36Sopenharmony_ci vmull.u32 $D1,$H1#hi,${R0}[1] 91562306a36Sopenharmony_ci 91662306a36Sopenharmony_ci vadd.i32 $H4#lo,$H4#lo,$D4#lo 91762306a36Sopenharmony_ci vmull.u32 $D4,$H4#hi,${R0}[1] 91862306a36Sopenharmony_ci subs $len,$len,#64 91962306a36Sopenharmony_ci vmlal.u32 $D0,$H4#hi,${S1}[1] 92062306a36Sopenharmony_ci it lo 92162306a36Sopenharmony_ci movlo $in2,$zeros 92262306a36Sopenharmony_ci vmlal.u32 $D3,$H2#hi,${R1}[1] 92362306a36Sopenharmony_ci vld1.32 ${S4}[1],[$tbl1,:32] 92462306a36Sopenharmony_ci vmlal.u32 $D1,$H0#hi,${R1}[1] 92562306a36Sopenharmony_ci vmlal.u32 $D4,$H3#hi,${R1}[1] 92662306a36Sopenharmony_ci 92762306a36Sopenharmony_ci vmlal.u32 $D0,$H3#hi,${S2}[1] 92862306a36Sopenharmony_ci vmlal.u32 $D3,$H1#hi,${R2}[1] 92962306a36Sopenharmony_ci vmlal.u32 $D4,$H2#hi,${R2}[1] 93062306a36Sopenharmony_ci vmlal.u32 $D1,$H4#hi,${S2}[1] 
93162306a36Sopenharmony_ci vmlal.u32 $D2,$H0#hi,${R2}[1] 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci vmlal.u32 $D3,$H0#hi,${R3}[1] 93462306a36Sopenharmony_ci vmlal.u32 $D0,$H2#hi,${S3}[1] 93562306a36Sopenharmony_ci vmlal.u32 $D4,$H1#hi,${R3}[1] 93662306a36Sopenharmony_ci vmlal.u32 $D1,$H3#hi,${S3}[1] 93762306a36Sopenharmony_ci vmlal.u32 $D2,$H4#hi,${S3}[1] 93862306a36Sopenharmony_ci 93962306a36Sopenharmony_ci vmlal.u32 $D3,$H4#hi,${S4}[1] 94062306a36Sopenharmony_ci vmlal.u32 $D0,$H1#hi,${S4}[1] 94162306a36Sopenharmony_ci vmlal.u32 $D4,$H0#hi,${R4}[1] 94262306a36Sopenharmony_ci vmlal.u32 $D1,$H2#hi,${S4}[1] 94362306a36Sopenharmony_ci vmlal.u32 $D2,$H3#hi,${S4}[1] 94462306a36Sopenharmony_ci 94562306a36Sopenharmony_ci vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) 94662306a36Sopenharmony_ci add $in2,$in2,#64 94762306a36Sopenharmony_ci 94862306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 94962306a36Sopenharmony_ci @ (hash+inp[0:1])*r^4 and accumulate 95062306a36Sopenharmony_ci 95162306a36Sopenharmony_ci vmlal.u32 $D3,$H3#lo,${R0}[0] 95262306a36Sopenharmony_ci vmlal.u32 $D0,$H0#lo,${R0}[0] 95362306a36Sopenharmony_ci vmlal.u32 $D4,$H4#lo,${R0}[0] 95462306a36Sopenharmony_ci vmlal.u32 $D1,$H1#lo,${R0}[0] 95562306a36Sopenharmony_ci vmlal.u32 $D2,$H2#lo,${R0}[0] 95662306a36Sopenharmony_ci vld1.32 ${S4}[0],[$tbl0,:32] 95762306a36Sopenharmony_ci 95862306a36Sopenharmony_ci vmlal.u32 $D3,$H2#lo,${R1}[0] 95962306a36Sopenharmony_ci vmlal.u32 $D0,$H4#lo,${S1}[0] 96062306a36Sopenharmony_ci vmlal.u32 $D4,$H3#lo,${R1}[0] 96162306a36Sopenharmony_ci vmlal.u32 $D1,$H0#lo,${R1}[0] 96262306a36Sopenharmony_ci vmlal.u32 $D2,$H1#lo,${R1}[0] 96362306a36Sopenharmony_ci 96462306a36Sopenharmony_ci vmlal.u32 $D3,$H1#lo,${R2}[0] 96562306a36Sopenharmony_ci vmlal.u32 $D0,$H3#lo,${S2}[0] 96662306a36Sopenharmony_ci vmlal.u32 $D4,$H2#lo,${R2}[0] 96762306a36Sopenharmony_ci vmlal.u32 $D1,$H4#lo,${S2}[0] 96862306a36Sopenharmony_ci vmlal.u32 
$D2,$H0#lo,${R2}[0] 96962306a36Sopenharmony_ci 97062306a36Sopenharmony_ci vmlal.u32 $D3,$H0#lo,${R3}[0] 97162306a36Sopenharmony_ci vmlal.u32 $D0,$H2#lo,${S3}[0] 97262306a36Sopenharmony_ci vmlal.u32 $D4,$H1#lo,${R3}[0] 97362306a36Sopenharmony_ci vmlal.u32 $D1,$H3#lo,${S3}[0] 97462306a36Sopenharmony_ci vmlal.u32 $D3,$H4#lo,${S4}[0] 97562306a36Sopenharmony_ci 97662306a36Sopenharmony_ci vmlal.u32 $D2,$H4#lo,${S3}[0] 97762306a36Sopenharmony_ci vmlal.u32 $D0,$H1#lo,${S4}[0] 97862306a36Sopenharmony_ci vmlal.u32 $D4,$H0#lo,${R4}[0] 97962306a36Sopenharmony_ci vmov.i32 $H4,#1<<24 @ padbit, yes, always 98062306a36Sopenharmony_ci vmlal.u32 $D1,$H2#lo,${S4}[0] 98162306a36Sopenharmony_ci vmlal.u32 $D2,$H3#lo,${S4}[0] 98262306a36Sopenharmony_ci 98362306a36Sopenharmony_ci vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] 98462306a36Sopenharmony_ci add $inp,$inp,#64 98562306a36Sopenharmony_ci# ifdef __ARMEB__ 98662306a36Sopenharmony_ci vrev32.8 $H0,$H0 98762306a36Sopenharmony_ci vrev32.8 $H1,$H1 98862306a36Sopenharmony_ci vrev32.8 $H2,$H2 98962306a36Sopenharmony_ci vrev32.8 $H3,$H3 99062306a36Sopenharmony_ci# endif 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 99362306a36Sopenharmony_ci @ lazy reduction interleaved with base 2^32 -> base 2^26 of 99462306a36Sopenharmony_ci @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
99562306a36Sopenharmony_ci 99662306a36Sopenharmony_ci vshr.u64 $T0,$D3,#26 99762306a36Sopenharmony_ci vmovn.i64 $D3#lo,$D3 99862306a36Sopenharmony_ci vshr.u64 $T1,$D0,#26 99962306a36Sopenharmony_ci vmovn.i64 $D0#lo,$D0 100062306a36Sopenharmony_ci vadd.i64 $D4,$D4,$T0 @ h3 -> h4 100162306a36Sopenharmony_ci vbic.i32 $D3#lo,#0xfc000000 100262306a36Sopenharmony_ci vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 100362306a36Sopenharmony_ci vadd.i64 $D1,$D1,$T1 @ h0 -> h1 100462306a36Sopenharmony_ci vshl.u32 $H3,$H3,#18 100562306a36Sopenharmony_ci vbic.i32 $D0#lo,#0xfc000000 100662306a36Sopenharmony_ci 100762306a36Sopenharmony_ci vshrn.u64 $T0#lo,$D4,#26 100862306a36Sopenharmony_ci vmovn.i64 $D4#lo,$D4 100962306a36Sopenharmony_ci vshr.u64 $T1,$D1,#26 101062306a36Sopenharmony_ci vmovn.i64 $D1#lo,$D1 101162306a36Sopenharmony_ci vadd.i64 $D2,$D2,$T1 @ h1 -> h2 101262306a36Sopenharmony_ci vsri.u32 $H3,$H2,#14 101362306a36Sopenharmony_ci vbic.i32 $D4#lo,#0xfc000000 101462306a36Sopenharmony_ci vshl.u32 $H2,$H2,#12 101562306a36Sopenharmony_ci vbic.i32 $D1#lo,#0xfc000000 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci vadd.i32 $D0#lo,$D0#lo,$T0#lo 101862306a36Sopenharmony_ci vshl.u32 $T0#lo,$T0#lo,#2 101962306a36Sopenharmony_ci vbic.i32 $H3,#0xfc000000 102062306a36Sopenharmony_ci vshrn.u64 $T1#lo,$D2,#26 102162306a36Sopenharmony_ci vmovn.i64 $D2#lo,$D2 102262306a36Sopenharmony_ci vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] 102362306a36Sopenharmony_ci vsri.u32 $H2,$H1,#20 102462306a36Sopenharmony_ci vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 102562306a36Sopenharmony_ci vshl.u32 $H1,$H1,#6 102662306a36Sopenharmony_ci vbic.i32 $D2#lo,#0xfc000000 102762306a36Sopenharmony_ci vbic.i32 $H2,#0xfc000000 102862306a36Sopenharmony_ci 102962306a36Sopenharmony_ci vshrn.u64 $T0#lo,$D0,#26 @ re-narrow 103062306a36Sopenharmony_ci vmovn.i64 $D0#lo,$D0 103162306a36Sopenharmony_ci vsri.u32 $H1,$H0,#26 103262306a36Sopenharmony_ci vbic.i32 $H0,#0xfc000000 103362306a36Sopenharmony_ci 
vshr.u32 $T1#lo,$D3#lo,#26 103462306a36Sopenharmony_ci vbic.i32 $D3#lo,#0xfc000000 103562306a36Sopenharmony_ci vbic.i32 $D0#lo,#0xfc000000 103662306a36Sopenharmony_ci vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 103762306a36Sopenharmony_ci vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 103862306a36Sopenharmony_ci vbic.i32 $H1,#0xfc000000 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci bhi .Loop_neon 104162306a36Sopenharmony_ci 104262306a36Sopenharmony_ci.Lskip_loop: 104362306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 104462306a36Sopenharmony_ci @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 104562306a36Sopenharmony_ci 104662306a36Sopenharmony_ci add $tbl1,$ctx,#(48+0*9*4) 104762306a36Sopenharmony_ci add $tbl0,$ctx,#(48+1*9*4) 104862306a36Sopenharmony_ci adds $len,$len,#32 104962306a36Sopenharmony_ci it ne 105062306a36Sopenharmony_ci movne $len,#0 105162306a36Sopenharmony_ci bne .Long_tail 105262306a36Sopenharmony_ci 105362306a36Sopenharmony_ci vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi 105462306a36Sopenharmony_ci vadd.i32 $H0#hi,$H0#lo,$D0#lo 105562306a36Sopenharmony_ci vadd.i32 $H3#hi,$H3#lo,$D3#lo 105662306a36Sopenharmony_ci vadd.i32 $H1#hi,$H1#lo,$D1#lo 105762306a36Sopenharmony_ci vadd.i32 $H4#hi,$H4#lo,$D4#lo 105862306a36Sopenharmony_ci 105962306a36Sopenharmony_ci.Long_tail: 106062306a36Sopenharmony_ci vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 106162306a36Sopenharmony_ci vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! 
@ load r^2 106262306a36Sopenharmony_ci 106362306a36Sopenharmony_ci vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant 106462306a36Sopenharmony_ci vmull.u32 $D2,$H2#hi,$R0 106562306a36Sopenharmony_ci vadd.i32 $H0#lo,$H0#lo,$D0#lo 106662306a36Sopenharmony_ci vmull.u32 $D0,$H0#hi,$R0 106762306a36Sopenharmony_ci vadd.i32 $H3#lo,$H3#lo,$D3#lo 106862306a36Sopenharmony_ci vmull.u32 $D3,$H3#hi,$R0 106962306a36Sopenharmony_ci vadd.i32 $H1#lo,$H1#lo,$D1#lo 107062306a36Sopenharmony_ci vmull.u32 $D1,$H1#hi,$R0 107162306a36Sopenharmony_ci vadd.i32 $H4#lo,$H4#lo,$D4#lo 107262306a36Sopenharmony_ci vmull.u32 $D4,$H4#hi,$R0 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci vmlal.u32 $D0,$H4#hi,$S1 107562306a36Sopenharmony_ci vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 107662306a36Sopenharmony_ci vmlal.u32 $D3,$H2#hi,$R1 107762306a36Sopenharmony_ci vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 107862306a36Sopenharmony_ci vmlal.u32 $D1,$H0#hi,$R1 107962306a36Sopenharmony_ci vmlal.u32 $D4,$H3#hi,$R1 108062306a36Sopenharmony_ci vmlal.u32 $D2,$H1#hi,$R1 108162306a36Sopenharmony_ci 108262306a36Sopenharmony_ci vmlal.u32 $D3,$H1#hi,$R2 108362306a36Sopenharmony_ci vld1.32 ${S4}[1],[$tbl1,:32] 108462306a36Sopenharmony_ci vmlal.u32 $D0,$H3#hi,$S2 108562306a36Sopenharmony_ci vld1.32 ${S4}[0],[$tbl0,:32] 108662306a36Sopenharmony_ci vmlal.u32 $D4,$H2#hi,$R2 108762306a36Sopenharmony_ci vmlal.u32 $D1,$H4#hi,$S2 108862306a36Sopenharmony_ci vmlal.u32 $D2,$H0#hi,$R2 108962306a36Sopenharmony_ci 109062306a36Sopenharmony_ci vmlal.u32 $D3,$H0#hi,$R3 109162306a36Sopenharmony_ci it ne 109262306a36Sopenharmony_ci addne $tbl1,$ctx,#(48+2*9*4) 109362306a36Sopenharmony_ci vmlal.u32 $D0,$H2#hi,$S3 109462306a36Sopenharmony_ci it ne 109562306a36Sopenharmony_ci addne $tbl0,$ctx,#(48+3*9*4) 109662306a36Sopenharmony_ci vmlal.u32 $D4,$H1#hi,$R3 109762306a36Sopenharmony_ci vmlal.u32 $D1,$H3#hi,$S3 109862306a36Sopenharmony_ci vmlal.u32 $D2,$H4#hi,$S3 109962306a36Sopenharmony_ci 
110062306a36Sopenharmony_ci vmlal.u32 $D3,$H4#hi,$S4 110162306a36Sopenharmony_ci vorn $MASK,$MASK,$MASK @ all-ones, can be redundant 110262306a36Sopenharmony_ci vmlal.u32 $D0,$H1#hi,$S4 110362306a36Sopenharmony_ci vshr.u64 $MASK,$MASK,#38 110462306a36Sopenharmony_ci vmlal.u32 $D4,$H0#hi,$R4 110562306a36Sopenharmony_ci vmlal.u32 $D1,$H2#hi,$S4 110662306a36Sopenharmony_ci vmlal.u32 $D2,$H3#hi,$S4 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_ci beq .Lshort_tail 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 111162306a36Sopenharmony_ci @ (hash+inp[0:1])*r^4:r^3 and accumulate 111262306a36Sopenharmony_ci 111362306a36Sopenharmony_ci vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 111462306a36Sopenharmony_ci vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 111562306a36Sopenharmony_ci 111662306a36Sopenharmony_ci vmlal.u32 $D2,$H2#lo,$R0 111762306a36Sopenharmony_ci vmlal.u32 $D0,$H0#lo,$R0 111862306a36Sopenharmony_ci vmlal.u32 $D3,$H3#lo,$R0 111962306a36Sopenharmony_ci vmlal.u32 $D1,$H1#lo,$R0 112062306a36Sopenharmony_ci vmlal.u32 $D4,$H4#lo,$R0 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci vmlal.u32 $D0,$H4#lo,$S1 112362306a36Sopenharmony_ci vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 112462306a36Sopenharmony_ci vmlal.u32 $D3,$H2#lo,$R1 112562306a36Sopenharmony_ci vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
112662306a36Sopenharmony_ci vmlal.u32 $D1,$H0#lo,$R1 112762306a36Sopenharmony_ci vmlal.u32 $D4,$H3#lo,$R1 112862306a36Sopenharmony_ci vmlal.u32 $D2,$H1#lo,$R1 112962306a36Sopenharmony_ci 113062306a36Sopenharmony_ci vmlal.u32 $D3,$H1#lo,$R2 113162306a36Sopenharmony_ci vld1.32 ${S4}[1],[$tbl1,:32] 113262306a36Sopenharmony_ci vmlal.u32 $D0,$H3#lo,$S2 113362306a36Sopenharmony_ci vld1.32 ${S4}[0],[$tbl0,:32] 113462306a36Sopenharmony_ci vmlal.u32 $D4,$H2#lo,$R2 113562306a36Sopenharmony_ci vmlal.u32 $D1,$H4#lo,$S2 113662306a36Sopenharmony_ci vmlal.u32 $D2,$H0#lo,$R2 113762306a36Sopenharmony_ci 113862306a36Sopenharmony_ci vmlal.u32 $D3,$H0#lo,$R3 113962306a36Sopenharmony_ci vmlal.u32 $D0,$H2#lo,$S3 114062306a36Sopenharmony_ci vmlal.u32 $D4,$H1#lo,$R3 114162306a36Sopenharmony_ci vmlal.u32 $D1,$H3#lo,$S3 114262306a36Sopenharmony_ci vmlal.u32 $D2,$H4#lo,$S3 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci vmlal.u32 $D3,$H4#lo,$S4 114562306a36Sopenharmony_ci vorn $MASK,$MASK,$MASK @ all-ones 114662306a36Sopenharmony_ci vmlal.u32 $D0,$H1#lo,$S4 114762306a36Sopenharmony_ci vshr.u64 $MASK,$MASK,#38 114862306a36Sopenharmony_ci vmlal.u32 $D4,$H0#lo,$R4 114962306a36Sopenharmony_ci vmlal.u32 $D1,$H2#lo,$S4 115062306a36Sopenharmony_ci vmlal.u32 $D2,$H3#lo,$S4 115162306a36Sopenharmony_ci 115262306a36Sopenharmony_ci.Lshort_tail: 115362306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 115462306a36Sopenharmony_ci @ horizontal addition 115562306a36Sopenharmony_ci 115662306a36Sopenharmony_ci vadd.i64 $D3#lo,$D3#lo,$D3#hi 115762306a36Sopenharmony_ci vadd.i64 $D0#lo,$D0#lo,$D0#hi 115862306a36Sopenharmony_ci vadd.i64 $D4#lo,$D4#lo,$D4#hi 115962306a36Sopenharmony_ci vadd.i64 $D1#lo,$D1#lo,$D1#hi 116062306a36Sopenharmony_ci vadd.i64 $D2#lo,$D2#lo,$D2#hi 116162306a36Sopenharmony_ci 116262306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 116362306a36Sopenharmony_ci @ lazy reduction, but without narrowing 
116462306a36Sopenharmony_ci 116562306a36Sopenharmony_ci vshr.u64 $T0,$D3,#26 116662306a36Sopenharmony_ci vand.i64 $D3,$D3,$MASK 116762306a36Sopenharmony_ci vshr.u64 $T1,$D0,#26 116862306a36Sopenharmony_ci vand.i64 $D0,$D0,$MASK 116962306a36Sopenharmony_ci vadd.i64 $D4,$D4,$T0 @ h3 -> h4 117062306a36Sopenharmony_ci vadd.i64 $D1,$D1,$T1 @ h0 -> h1 117162306a36Sopenharmony_ci 117262306a36Sopenharmony_ci vshr.u64 $T0,$D4,#26 117362306a36Sopenharmony_ci vand.i64 $D4,$D4,$MASK 117462306a36Sopenharmony_ci vshr.u64 $T1,$D1,#26 117562306a36Sopenharmony_ci vand.i64 $D1,$D1,$MASK 117662306a36Sopenharmony_ci vadd.i64 $D2,$D2,$T1 @ h1 -> h2 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci vadd.i64 $D0,$D0,$T0 117962306a36Sopenharmony_ci vshl.u64 $T0,$T0,#2 118062306a36Sopenharmony_ci vshr.u64 $T1,$D2,#26 118162306a36Sopenharmony_ci vand.i64 $D2,$D2,$MASK 118262306a36Sopenharmony_ci vadd.i64 $D0,$D0,$T0 @ h4 -> h0 118362306a36Sopenharmony_ci vadd.i64 $D3,$D3,$T1 @ h2 -> h3 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_ci vshr.u64 $T0,$D0,#26 118662306a36Sopenharmony_ci vand.i64 $D0,$D0,$MASK 118762306a36Sopenharmony_ci vshr.u64 $T1,$D3,#26 118862306a36Sopenharmony_ci vand.i64 $D3,$D3,$MASK 118962306a36Sopenharmony_ci vadd.i64 $D1,$D1,$T0 @ h0 -> h1 119062306a36Sopenharmony_ci vadd.i64 $D4,$D4,$T1 @ h3 -> h4 119162306a36Sopenharmony_ci 119262306a36Sopenharmony_ci cmp $len,#0 119362306a36Sopenharmony_ci bne .Leven 119462306a36Sopenharmony_ci 119562306a36Sopenharmony_ci @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 119662306a36Sopenharmony_ci @ store hash value 119762306a36Sopenharmony_ci 119862306a36Sopenharmony_ci vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! 
119962306a36Sopenharmony_ci vst1.32 {$D4#lo[0]},[$ctx] 120062306a36Sopenharmony_ci 120162306a36Sopenharmony_ci vldmia sp!,{d8-d15} @ epilogue 120262306a36Sopenharmony_ci ldmia sp!,{r4-r7} 120362306a36Sopenharmony_ci ret @ bx lr 120462306a36Sopenharmony_ci.size poly1305_blocks_neon,.-poly1305_blocks_neon 120562306a36Sopenharmony_ci 120662306a36Sopenharmony_ci.align 5 120762306a36Sopenharmony_ci.Lzeros: 120862306a36Sopenharmony_ci.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 120962306a36Sopenharmony_ci#ifndef __KERNEL__ 121062306a36Sopenharmony_ci.LOPENSSL_armcap: 121162306a36Sopenharmony_ci# ifdef _WIN32 121262306a36Sopenharmony_ci.word OPENSSL_armcap_P 121362306a36Sopenharmony_ci# else 121462306a36Sopenharmony_ci.word OPENSSL_armcap_P-.Lpoly1305_init 121562306a36Sopenharmony_ci# endif 121662306a36Sopenharmony_ci.comm OPENSSL_armcap_P,4,4 121762306a36Sopenharmony_ci.hidden OPENSSL_armcap_P 121862306a36Sopenharmony_ci#endif 121962306a36Sopenharmony_ci#endif 122062306a36Sopenharmony_ci___ 122162306a36Sopenharmony_ci} } 122262306a36Sopenharmony_ci$code.=<<___; 122362306a36Sopenharmony_ci.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" 122462306a36Sopenharmony_ci.align 2 122562306a36Sopenharmony_ci___ 122662306a36Sopenharmony_ci 122762306a36Sopenharmony_ciforeach (split("\n",$code)) { 122862306a36Sopenharmony_ci s/\`([^\`]*)\`/eval $1/geo; 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 123162306a36Sopenharmony_ci s/\bret\b/bx lr/go or 123262306a36Sopenharmony_ci s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_ci print $_,"\n"; 123562306a36Sopenharmony_ci} 123662306a36Sopenharmony_ciclose STDOUT; # enforce flush 1237