#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Command line: <flavour> <output>.
#
# When a flavour other than "void" is given, everything printed to STDOUT
# is piped through arm-xlate.pl (looked up next to this script first, then
# under ../../perlasm/), which is handed the flavour and the output name.
# Otherwise STDOUT is redirected straight into the output file; when no
# output name was supplied STDOUT is left untouched.
#
# The variables are deliberately left as package globals (no "my"), as in
# the original, because code outside this excerpt may refer to them.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path including the trailing
    # separator; empty when the script was invoked without one.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # The command string is kept exactly as before; only the result of
    # the open is now checked so a failed spawn is reported.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && $output ne "") {
    # Report an unwritable output file instead of silently carrying on.
    open STDOUT,">$output" or die "can't open $output: $!";
}

# Symbolic names for the general-purpose registers used by the scalar code.
# The first four names map to x0..x3; poly1305_emit uses the second and
# third of them under the names $mac and $nonce.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

# Working registers x4..x14: hash limbs (h0-h2), key words (r0, r1),
# s1 = r1 + (r1 >> 2), temporaries (t0, t1) and product words (d0-d2).
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# Scalar code path.  The text below is appended to $code; because the
# heredoc is unquoted, Perl variables such as $ctx or $h0 are replaced by
# the register names assigned above.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

// poly1305_init
//
// Register use on entry, as assigned by the Perl register table:
//   x0 - context pointer
//   x1 - key pointer (may be null)
//   x2 - non-kernel builds only: where the two function addresses
//        selected below are stored
//
// The first 32 bytes of the context (hash value plus the is_base2_26
// word) are always zeroed. With a null key pointer nothing else is
// done and x0 is returned as 0. Otherwise the 16-byte key is loaded,
// masked as noted on the masking instructions, saved at context
// offset 32, an all-ones 32-bit word is written at offset 48, and x0
// is returned as 1.
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	// Null key: return 0 without touching the rest of the context.
	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	// Capability word is fetched early; it is tested further down.
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	// Build the mask 0x0ffffffc0fffffff in two steps.
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	// The all-ones word stored at offset 48 is what the NEON path
	// compares against -1 before it builds its table of key powers.
	// NOTE(review): the w# register spelling is presumably rewritten
	// to a plain 32-bit register name by a later pass outside this
	// view - confirm.
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	// NEON bit clear selects the scalar block routine, NEON bit set
	// selects the NEON one; the emit routine is the same either way.
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

	// Store the two addresses as 32-bit or 64-bit values depending
	// on the pointer size of the build.
# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

// poly1305_blocks
//
// Register use on entry, as assigned by the Perl register table:
//   x0 - context pointer
//   x1 - input pointer
//   x2 - input length in bytes; rounded down to a multiple of 16,
//        and the routine returns at once if nothing is left
//   x3 - pad bit, added to the top limb for every block
//
// Scalar path. For each 16-byte block the accumulator h (three 64-bit
// limbs) becomes (h + block + padbit*2^128) * r, with everything from
// bit 130 upwards folded back in multiplied by 5. On exit the hash is
// stored in base 2^64 form and the is_base2_26 word is cleared.
// x15, x16 and x17 are used as scratch.
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	// Split the loaded words into five 32-bit limbs in case the hash
	// is currently held in base 2^26 form. Which half of each 64-bit
	// load holds which limb depends on endianness.
#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	// The conversion is computed unconditionally; the result is only
	// used if the is_base2_26 word turns out to be non-zero.
	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	// The carry from the two additions above is consumed by the adc
	// below; the mul in between does not modify the flags.
	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	// t0 ends up as 5 * (d2 >> 2): the bits of the top word from
	// bit 130 of the product upwards, multiplied by 5. The low two
	// bits of d2 are kept as the new top limb.
	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

// poly1305_emit
//
// Register use on entry, as assigned by the Perl register table:
//   x0 - context pointer
//   x1 - where the 16-byte result is written
//   x2 - pointer to the 16-byte nonce that is added to the hash
//
// Loads the hash (converting from base 2^26 if the is_base2_26 word
// is non-zero), selects h + 5 instead of h when that sum reaches
// bit 130, adds the nonce to the low 128 bits and stores the result.
// The context itself is not modified. x15 and x16 are scratch.
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

	// Split the loaded words into five 32-bit limbs in case the hash
	// is currently held in base 2^26 form. Which half of each 64-bit
	// load holds which limb depends on endianness.
#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	// d = h + 5. If any bit from position 2 upwards of the top word
	// of d is set, the low 128 bits of d replace those of h.
	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	// The carry out of the upper 64-bit word is discarded.
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# Symbolic names for the vector registers used by the NEON code.  Each
# name expands to a register together with an arrangement suffix, except
# for $T0/$T1/$MASK, which are bare register names that get their suffix
# at the point of use (e.g. "$MASK.2d").
#
#   v0-v8    $R0..$S4       rows loaded from the table at context offset 48
#   v9-v13   $IN01_0..4     input limbs for inp[0:1]
#   v14-v18  $IN23_0..4     input limbs for inp[2:3]
#   v19-v23  $ACC0..4       64-bit product accumulators
#   v24-v28  $H0..4         hash limbs
#   v29-v31  $T0,$T1,$MASK  temporaries and the limb mask
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

# x16 is the second input pointer and x17 the address of .Lzeros in the
# NEON loop; x17 also holds the is_base2_26 word on entry, hence the alias.
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

# Helper routines and the NEON code path are appended to $code below.
$code.=<<___;
// poly1305_mult
//
// Local helper (no .globl). Multiplies the accumulator held in
// x4:x5:x6 by the key held in x7:x8, with x9 expected to contain
// r1 + (r1 >> 2), and folds everything from bit 130 upwards back in
// multiplied by 5. The instruction sequence is the same as the
// multiply step of the scalar block loop. The result is left in
// x4:x5:x6; x10-x14 are clobbered.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	// t0 ends up as 5 * (d2 >> 2); the low two bits of d2 are kept
	// as the new top limb.
	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

// poly1305_splat
//
// Local helper (no .globl). Converts the value in x4:x5:x6 from base
// 2^64 to five limbs of 26 bits (the top limb is extracted without
// masking) and stores nine 32-bit words at a 16-byte stride starting
// at x0, in this order: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
// The table initialisation code steps x0 back by 4 bytes between
// calls, so successive calls fill neighbouring 32-bit slots of each
// 16-byte row. Clobbers x12-x16; x4-x6 are left unchanged.
.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	// Each 5x multiple is computed as limb + (limb << 2); a limb
	// register is reused for a product only after its own value has
	// been queued for storing.
	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

3378c2ecf20Sopenharmony_ci#ifdef	__KERNEL__
3388c2ecf20Sopenharmony_ci.globl	poly1305_blocks_neon
3398c2ecf20Sopenharmony_ci#endif
3408c2ecf20Sopenharmony_ci.type	poly1305_blocks_neon,%function
3418c2ecf20Sopenharmony_ci.align	5
3428c2ecf20Sopenharmony_cipoly1305_blocks_neon:
3438c2ecf20Sopenharmony_ci.Lpoly1305_blocks_neon:
3448c2ecf20Sopenharmony_ci	ldr	$is_base2_26,[$ctx,#24]
3458c2ecf20Sopenharmony_ci	cmp	$len,#128
3468c2ecf20Sopenharmony_ci	b.lo	.Lpoly1305_blocks
3478c2ecf20Sopenharmony_ci
3488c2ecf20Sopenharmony_ci	.inst	0xd503233f		// paciasp
3498c2ecf20Sopenharmony_ci	stp	x29,x30,[sp,#-80]!
3508c2ecf20Sopenharmony_ci	add	x29,sp,#0
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci	stp	d8,d9,[sp,#16]		// meet ABI requirements
3538c2ecf20Sopenharmony_ci	stp	d10,d11,[sp,#32]
3548c2ecf20Sopenharmony_ci	stp	d12,d13,[sp,#48]
3558c2ecf20Sopenharmony_ci	stp	d14,d15,[sp,#64]
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ci	cbz	$is_base2_26,.Lbase2_64_neon
3588c2ecf20Sopenharmony_ci
3598c2ecf20Sopenharmony_ci	ldp	w10,w11,[$ctx]		// load hash value base 2^26
3608c2ecf20Sopenharmony_ci	ldp	w12,w13,[$ctx,#8]
3618c2ecf20Sopenharmony_ci	ldr	w14,[$ctx,#16]
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_ci	tst	$len,#31
3648c2ecf20Sopenharmony_ci	b.eq	.Leven_neon
3658c2ecf20Sopenharmony_ci
3668c2ecf20Sopenharmony_ci	ldp	$r0,$r1,[$ctx,#32]	// load key value
3678c2ecf20Sopenharmony_ci
3688c2ecf20Sopenharmony_ci	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
3698c2ecf20Sopenharmony_ci	lsr	$h1,x12,#12
3708c2ecf20Sopenharmony_ci	adds	$h0,$h0,x12,lsl#52
3718c2ecf20Sopenharmony_ci	add	$h1,$h1,x13,lsl#14
3728c2ecf20Sopenharmony_ci	adc	$h1,$h1,xzr
3738c2ecf20Sopenharmony_ci	lsr	$h2,x14,#24
3748c2ecf20Sopenharmony_ci	adds	$h1,$h1,x14,lsl#40
3758c2ecf20Sopenharmony_ci	adc	$d2,$h2,xzr		// can be partially reduced...
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	ldp	$d0,$d1,[$inp],#16	// load input
3788c2ecf20Sopenharmony_ci	sub	$len,$len,#16
3798c2ecf20Sopenharmony_ci	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
3828c2ecf20Sopenharmony_ci	rev	$d0,$d0
3838c2ecf20Sopenharmony_ci	rev	$d1,$d1
3848c2ecf20Sopenharmony_ci#endif
3858c2ecf20Sopenharmony_ci	adds	$h0,$h0,$d0		// accumulate input
3868c2ecf20Sopenharmony_ci	adcs	$h1,$h1,$d1
3878c2ecf20Sopenharmony_ci	adc	$h2,$h2,$padbit
3888c2ecf20Sopenharmony_ci
3898c2ecf20Sopenharmony_ci	bl	poly1305_mult
3908c2ecf20Sopenharmony_ci
3918c2ecf20Sopenharmony_ci	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
3928c2ecf20Sopenharmony_ci	ubfx	x11,$h0,#26,#26
3938c2ecf20Sopenharmony_ci	extr	x12,$h1,$h0,#52
3948c2ecf20Sopenharmony_ci	and	x12,x12,#0x03ffffff
3958c2ecf20Sopenharmony_ci	ubfx	x13,$h1,#14,#26
3968c2ecf20Sopenharmony_ci	extr	x14,$h2,$h1,#40
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci	b	.Leven_neon
3998c2ecf20Sopenharmony_ci
4008c2ecf20Sopenharmony_ci.align	4
4018c2ecf20Sopenharmony_ci.Lbase2_64_neon:
4028c2ecf20Sopenharmony_ci	ldp	$r0,$r1,[$ctx,#32]	// load key value
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_ci	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
4058c2ecf20Sopenharmony_ci	ldr	$h2,[$ctx,#16]
4068c2ecf20Sopenharmony_ci
4078c2ecf20Sopenharmony_ci	tst	$len,#31
4088c2ecf20Sopenharmony_ci	b.eq	.Linit_neon
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci	ldp	$d0,$d1,[$inp],#16	// load input
4118c2ecf20Sopenharmony_ci	sub	$len,$len,#16
4128c2ecf20Sopenharmony_ci	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
4138c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
4148c2ecf20Sopenharmony_ci	rev	$d0,$d0
4158c2ecf20Sopenharmony_ci	rev	$d1,$d1
4168c2ecf20Sopenharmony_ci#endif
4178c2ecf20Sopenharmony_ci	adds	$h0,$h0,$d0		// accumulate input
4188c2ecf20Sopenharmony_ci	adcs	$h1,$h1,$d1
4198c2ecf20Sopenharmony_ci	adc	$h2,$h2,$padbit
4208c2ecf20Sopenharmony_ci
4218c2ecf20Sopenharmony_ci	bl	poly1305_mult
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_ci.Linit_neon:
4248c2ecf20Sopenharmony_ci	ldr	w17,[$ctx,#48]		// first table element
4258c2ecf20Sopenharmony_ci	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
4268c2ecf20Sopenharmony_ci	ubfx	x11,$h0,#26,#26
4278c2ecf20Sopenharmony_ci	extr	x12,$h1,$h0,#52
4288c2ecf20Sopenharmony_ci	and	x12,x12,#0x03ffffff
4298c2ecf20Sopenharmony_ci	ubfx	x13,$h1,#14,#26
4308c2ecf20Sopenharmony_ci	extr	x14,$h2,$h1,#40
4318c2ecf20Sopenharmony_ci
4328c2ecf20Sopenharmony_ci	cmp	w17,#-1			// is value impossible?
4338c2ecf20Sopenharmony_ci	b.ne	.Leven_neon
4348c2ecf20Sopenharmony_ci
4358c2ecf20Sopenharmony_ci	fmov	${H0},x10
4368c2ecf20Sopenharmony_ci	fmov	${H1},x11
4378c2ecf20Sopenharmony_ci	fmov	${H2},x12
4388c2ecf20Sopenharmony_ci	fmov	${H3},x13
4398c2ecf20Sopenharmony_ci	fmov	${H4},x14
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	////////////////////////////////// initialize r^n table
4428c2ecf20Sopenharmony_ci	mov	$h0,$r0			// r^1
4438c2ecf20Sopenharmony_ci	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
4448c2ecf20Sopenharmony_ci	mov	$h1,$r1
4458c2ecf20Sopenharmony_ci	mov	$h2,xzr
4468c2ecf20Sopenharmony_ci	add	$ctx,$ctx,#48+12
4478c2ecf20Sopenharmony_ci	bl	poly1305_splat
4488c2ecf20Sopenharmony_ci
4498c2ecf20Sopenharmony_ci	bl	poly1305_mult		// r^2
4508c2ecf20Sopenharmony_ci	sub	$ctx,$ctx,#4
4518c2ecf20Sopenharmony_ci	bl	poly1305_splat
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci	bl	poly1305_mult		// r^3
4548c2ecf20Sopenharmony_ci	sub	$ctx,$ctx,#4
4558c2ecf20Sopenharmony_ci	bl	poly1305_splat
4568c2ecf20Sopenharmony_ci
4578c2ecf20Sopenharmony_ci	bl	poly1305_mult		// r^4
4588c2ecf20Sopenharmony_ci	sub	$ctx,$ctx,#4
4598c2ecf20Sopenharmony_ci	bl	poly1305_splat
4608c2ecf20Sopenharmony_ci	sub	$ctx,$ctx,#48		// restore original $ctx
4618c2ecf20Sopenharmony_ci	b	.Ldo_neon
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_ci.align	4
4648c2ecf20Sopenharmony_ci.Leven_neon:
4658c2ecf20Sopenharmony_ci	fmov	${H0},x10
4668c2ecf20Sopenharmony_ci	fmov	${H1},x11
4678c2ecf20Sopenharmony_ci	fmov	${H2},x12
4688c2ecf20Sopenharmony_ci	fmov	${H3},x13
4698c2ecf20Sopenharmony_ci	fmov	${H4},x14
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci.Ldo_neon:
4728c2ecf20Sopenharmony_ci	ldp	x8,x12,[$inp,#32]	// inp[2:3]
4738c2ecf20Sopenharmony_ci	subs	$len,$len,#64
4748c2ecf20Sopenharmony_ci	ldp	x9,x13,[$inp,#48]
4758c2ecf20Sopenharmony_ci	add	$in2,$inp,#96
4768c2ecf20Sopenharmony_ci	adr	$zeros,.Lzeros
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci	lsl	$padbit,$padbit,#24
4798c2ecf20Sopenharmony_ci	add	x15,$ctx,#48
4808c2ecf20Sopenharmony_ci
4818c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
4828c2ecf20Sopenharmony_ci	rev	x8,x8
4838c2ecf20Sopenharmony_ci	rev	x12,x12
4848c2ecf20Sopenharmony_ci	rev	x9,x9
4858c2ecf20Sopenharmony_ci	rev	x13,x13
4868c2ecf20Sopenharmony_ci#endif
4878c2ecf20Sopenharmony_ci	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
4888c2ecf20Sopenharmony_ci	and	x5,x9,#0x03ffffff
4898c2ecf20Sopenharmony_ci	ubfx	x6,x8,#26,#26
4908c2ecf20Sopenharmony_ci	ubfx	x7,x9,#26,#26
4918c2ecf20Sopenharmony_ci	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
4928c2ecf20Sopenharmony_ci	extr	x8,x12,x8,#52
4938c2ecf20Sopenharmony_ci	extr	x9,x13,x9,#52
4948c2ecf20Sopenharmony_ci	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
4958c2ecf20Sopenharmony_ci	fmov	$IN23_0,x4
4968c2ecf20Sopenharmony_ci	and	x8,x8,#0x03ffffff
4978c2ecf20Sopenharmony_ci	and	x9,x9,#0x03ffffff
4988c2ecf20Sopenharmony_ci	ubfx	x10,x12,#14,#26
4998c2ecf20Sopenharmony_ci	ubfx	x11,x13,#14,#26
5008c2ecf20Sopenharmony_ci	add	x12,$padbit,x12,lsr#40
5018c2ecf20Sopenharmony_ci	add	x13,$padbit,x13,lsr#40
5028c2ecf20Sopenharmony_ci	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
5038c2ecf20Sopenharmony_ci	fmov	$IN23_1,x6
5048c2ecf20Sopenharmony_ci	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
5058c2ecf20Sopenharmony_ci	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
5068c2ecf20Sopenharmony_ci	fmov	$IN23_2,x8
5078c2ecf20Sopenharmony_ci	fmov	$IN23_3,x10
5088c2ecf20Sopenharmony_ci	fmov	$IN23_4,x12
5098c2ecf20Sopenharmony_ci
5108c2ecf20Sopenharmony_ci	ldp	x8,x12,[$inp],#16	// inp[0:1]
5118c2ecf20Sopenharmony_ci	ldp	x9,x13,[$inp],#48
5128c2ecf20Sopenharmony_ci
5138c2ecf20Sopenharmony_ci	ld1	{$R0,$R1,$S1,$R2},[x15],#64
5148c2ecf20Sopenharmony_ci	ld1	{$S2,$R3,$S3,$R4},[x15],#64
5158c2ecf20Sopenharmony_ci	ld1	{$S4},[x15]
5168c2ecf20Sopenharmony_ci
5178c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
5188c2ecf20Sopenharmony_ci	rev	x8,x8
5198c2ecf20Sopenharmony_ci	rev	x12,x12
5208c2ecf20Sopenharmony_ci	rev	x9,x9
5218c2ecf20Sopenharmony_ci	rev	x13,x13
5228c2ecf20Sopenharmony_ci#endif
5238c2ecf20Sopenharmony_ci	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
5248c2ecf20Sopenharmony_ci	and	x5,x9,#0x03ffffff
5258c2ecf20Sopenharmony_ci	ubfx	x6,x8,#26,#26
5268c2ecf20Sopenharmony_ci	ubfx	x7,x9,#26,#26
5278c2ecf20Sopenharmony_ci	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
5288c2ecf20Sopenharmony_ci	extr	x8,x12,x8,#52
5298c2ecf20Sopenharmony_ci	extr	x9,x13,x9,#52
5308c2ecf20Sopenharmony_ci	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
5318c2ecf20Sopenharmony_ci	fmov	$IN01_0,x4
5328c2ecf20Sopenharmony_ci	and	x8,x8,#0x03ffffff
5338c2ecf20Sopenharmony_ci	and	x9,x9,#0x03ffffff
5348c2ecf20Sopenharmony_ci	ubfx	x10,x12,#14,#26
5358c2ecf20Sopenharmony_ci	ubfx	x11,x13,#14,#26
5368c2ecf20Sopenharmony_ci	add	x12,$padbit,x12,lsr#40
5378c2ecf20Sopenharmony_ci	add	x13,$padbit,x13,lsr#40
5388c2ecf20Sopenharmony_ci	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
5398c2ecf20Sopenharmony_ci	fmov	$IN01_1,x6
5408c2ecf20Sopenharmony_ci	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
5418c2ecf20Sopenharmony_ci	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
5428c2ecf20Sopenharmony_ci	movi	$MASK.2d,#-1
5438c2ecf20Sopenharmony_ci	fmov	$IN01_2,x8
5448c2ecf20Sopenharmony_ci	fmov	$IN01_3,x10
5458c2ecf20Sopenharmony_ci	fmov	$IN01_4,x12
5468c2ecf20Sopenharmony_ci	ushr	$MASK.2d,$MASK.2d,#38
5478c2ecf20Sopenharmony_ci
5488c2ecf20Sopenharmony_ci	b.ls	.Lskip_loop
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_ci.align	4
5518c2ecf20Sopenharmony_ci.Loop_neon:
5528c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
5538c2ecf20Sopenharmony_ci	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
5548c2ecf20Sopenharmony_ci	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
5558c2ecf20Sopenharmony_ci	//   \___________________/
5568c2ecf20Sopenharmony_ci	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
5578c2ecf20Sopenharmony_ci	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
5588c2ecf20Sopenharmony_ci	//   \___________________/ \____________________/
5598c2ecf20Sopenharmony_ci	//
5608c2ecf20Sopenharmony_ci	// Note that we start with inp[2:3]*r^2. This is because it
5618c2ecf20Sopenharmony_ci	// doesn't depend on reduction in previous iteration.
5628c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
5638c2ecf20Sopenharmony_ci	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
5648c2ecf20Sopenharmony_ci	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
5658c2ecf20Sopenharmony_ci	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
5668c2ecf20Sopenharmony_ci	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
5678c2ecf20Sopenharmony_ci	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_ci	subs	$len,$len,#64
5708c2ecf20Sopenharmony_ci	umull	$ACC4,$IN23_0,${R4}[2]
5718c2ecf20Sopenharmony_ci	csel	$in2,$zeros,$in2,lo
5728c2ecf20Sopenharmony_ci	umull	$ACC3,$IN23_0,${R3}[2]
5738c2ecf20Sopenharmony_ci	umull	$ACC2,$IN23_0,${R2}[2]
5748c2ecf20Sopenharmony_ci	 ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
5758c2ecf20Sopenharmony_ci	umull	$ACC1,$IN23_0,${R1}[2]
5768c2ecf20Sopenharmony_ci	 ldp	x9,x13,[$in2],#48
5778c2ecf20Sopenharmony_ci	umull	$ACC0,$IN23_0,${R0}[2]
5788c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
5798c2ecf20Sopenharmony_ci	 rev	x8,x8
5808c2ecf20Sopenharmony_ci	 rev	x12,x12
5818c2ecf20Sopenharmony_ci	 rev	x9,x9
5828c2ecf20Sopenharmony_ci	 rev	x13,x13
5838c2ecf20Sopenharmony_ci#endif
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN23_1,${R3}[2]
5868c2ecf20Sopenharmony_ci	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
5878c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN23_1,${R2}[2]
5888c2ecf20Sopenharmony_ci	 and	x5,x9,#0x03ffffff
5898c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN23_1,${R1}[2]
5908c2ecf20Sopenharmony_ci	 ubfx	x6,x8,#26,#26
5918c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN23_1,${R0}[2]
5928c2ecf20Sopenharmony_ci	 ubfx	x7,x9,#26,#26
5938c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN23_1,${S4}[2]
5948c2ecf20Sopenharmony_ci	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
5958c2ecf20Sopenharmony_ci
5968c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN23_2,${R2}[2]
5978c2ecf20Sopenharmony_ci	 extr	x8,x12,x8,#52
5988c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN23_2,${R1}[2]
5998c2ecf20Sopenharmony_ci	 extr	x9,x13,x9,#52
6008c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN23_2,${R0}[2]
6018c2ecf20Sopenharmony_ci	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
6028c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN23_2,${S4}[2]
6038c2ecf20Sopenharmony_ci	 fmov	$IN23_0,x4
6048c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN23_2,${S3}[2]
6058c2ecf20Sopenharmony_ci	 and	x8,x8,#0x03ffffff
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN23_3,${R1}[2]
6088c2ecf20Sopenharmony_ci	 and	x9,x9,#0x03ffffff
6098c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN23_3,${R0}[2]
6108c2ecf20Sopenharmony_ci	 ubfx	x10,x12,#14,#26
6118c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN23_3,${S4}[2]
6128c2ecf20Sopenharmony_ci	 ubfx	x11,x13,#14,#26
6138c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN23_3,${S3}[2]
6148c2ecf20Sopenharmony_ci	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
6158c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN23_3,${S2}[2]
6168c2ecf20Sopenharmony_ci	 fmov	$IN23_1,x6
6178c2ecf20Sopenharmony_ci
6188c2ecf20Sopenharmony_ci	add	$IN01_2,$IN01_2,$H2
6198c2ecf20Sopenharmony_ci	 add	x12,$padbit,x12,lsr#40
6208c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN23_4,${R0}[2]
6218c2ecf20Sopenharmony_ci	 add	x13,$padbit,x13,lsr#40
6228c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN23_4,${S4}[2]
6238c2ecf20Sopenharmony_ci	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
6248c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN23_4,${S3}[2]
6258c2ecf20Sopenharmony_ci	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
6268c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN23_4,${S2}[2]
6278c2ecf20Sopenharmony_ci	 fmov	$IN23_2,x8
6288c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN23_4,${S1}[2]
6298c2ecf20Sopenharmony_ci	 fmov	$IN23_3,x10
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
6328c2ecf20Sopenharmony_ci	// (hash+inp[0:1])*r^4 and accumulate
6338c2ecf20Sopenharmony_ci
6348c2ecf20Sopenharmony_ci	add	$IN01_0,$IN01_0,$H0
6358c2ecf20Sopenharmony_ci	 fmov	$IN23_4,x12
6368c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_2,${R1}[0]
6378c2ecf20Sopenharmony_ci	 ldp	x8,x12,[$inp],#16	// inp[0:1]
6388c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_2,${S3}[0]
6398c2ecf20Sopenharmony_ci	 ldp	x9,x13,[$inp],#48
6408c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_2,${R2}[0]
6418c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_2,${S4}[0]
6428c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_2,${R0}[0]
6438c2ecf20Sopenharmony_ci#ifdef	__AARCH64EB__
6448c2ecf20Sopenharmony_ci	 rev	x8,x8
6458c2ecf20Sopenharmony_ci	 rev	x12,x12
6468c2ecf20Sopenharmony_ci	 rev	x9,x9
6478c2ecf20Sopenharmony_ci	 rev	x13,x13
6488c2ecf20Sopenharmony_ci#endif
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	add	$IN01_1,$IN01_1,$H1
6518c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_0,${R3}[0]
6528c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_0,${R4}[0]
6538c2ecf20Sopenharmony_ci	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
6548c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_0,${R2}[0]
6558c2ecf20Sopenharmony_ci	 and	x5,x9,#0x03ffffff
6568c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_0,${R0}[0]
6578c2ecf20Sopenharmony_ci	 ubfx	x6,x8,#26,#26
6588c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_0,${R1}[0]
6598c2ecf20Sopenharmony_ci	 ubfx	x7,x9,#26,#26
6608c2ecf20Sopenharmony_ci
6618c2ecf20Sopenharmony_ci	add	$IN01_3,$IN01_3,$H3
6628c2ecf20Sopenharmony_ci	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
6638c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_1,${R2}[0]
6648c2ecf20Sopenharmony_ci	 extr	x8,x12,x8,#52
6658c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_1,${R3}[0]
6668c2ecf20Sopenharmony_ci	 extr	x9,x13,x9,#52
6678c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_1,${S4}[0]
6688c2ecf20Sopenharmony_ci	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
6698c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_1,${R1}[0]
6708c2ecf20Sopenharmony_ci	 fmov	$IN01_0,x4
6718c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_1,${R0}[0]
6728c2ecf20Sopenharmony_ci	 and	x8,x8,#0x03ffffff
6738c2ecf20Sopenharmony_ci
6748c2ecf20Sopenharmony_ci	add	$IN01_4,$IN01_4,$H4
6758c2ecf20Sopenharmony_ci	 and	x9,x9,#0x03ffffff
6768c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_3,${R0}[0]
6778c2ecf20Sopenharmony_ci	 ubfx	x10,x12,#14,#26
6788c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_3,${S2}[0]
6798c2ecf20Sopenharmony_ci	 ubfx	x11,x13,#14,#26
6808c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_3,${R1}[0]
6818c2ecf20Sopenharmony_ci	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
6828c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_3,${S3}[0]
6838c2ecf20Sopenharmony_ci	 fmov	$IN01_1,x6
6848c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_3,${S4}[0]
6858c2ecf20Sopenharmony_ci	 add	x12,$padbit,x12,lsr#40
6868c2ecf20Sopenharmony_ci
6878c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_4,${S4}[0]
6888c2ecf20Sopenharmony_ci	 add	x13,$padbit,x13,lsr#40
6898c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_4,${S1}[0]
6908c2ecf20Sopenharmony_ci	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
6918c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_4,${R0}[0]
6928c2ecf20Sopenharmony_ci	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
6938c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_4,${S2}[0]
6948c2ecf20Sopenharmony_ci	 fmov	$IN01_2,x8
6958c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_4,${S3}[0]
6968c2ecf20Sopenharmony_ci	 fmov	$IN01_3,x10
6978c2ecf20Sopenharmony_ci	 fmov	$IN01_4,x12
6988c2ecf20Sopenharmony_ci
6998c2ecf20Sopenharmony_ci	/////////////////////////////////////////////////////////////////
7008c2ecf20Sopenharmony_ci	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
7018c2ecf20Sopenharmony_ci	// and P. Schwabe
7028c2ecf20Sopenharmony_ci	//
7038c2ecf20Sopenharmony_ci	// [see discussion in poly1305-armv4 module]
7048c2ecf20Sopenharmony_ci
7058c2ecf20Sopenharmony_ci	ushr	$T0.2d,$ACC3,#26
7068c2ecf20Sopenharmony_ci	xtn	$H3,$ACC3
7078c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC0,#26
7088c2ecf20Sopenharmony_ci	 and	$ACC0,$ACC0,$MASK.2d
7098c2ecf20Sopenharmony_ci	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
7108c2ecf20Sopenharmony_ci	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
7118c2ecf20Sopenharmony_ci	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci	ushr	$T0.2d,$ACC4,#26
7148c2ecf20Sopenharmony_ci	xtn	$H4,$ACC4
7158c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC1,#26
7168c2ecf20Sopenharmony_ci	 xtn	$H1,$ACC1
7178c2ecf20Sopenharmony_ci	bic	$H4,#0xfc,lsl#24
7188c2ecf20Sopenharmony_ci	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
7198c2ecf20Sopenharmony_ci
7208c2ecf20Sopenharmony_ci	add	$ACC0,$ACC0,$T0.2d
7218c2ecf20Sopenharmony_ci	shl	$T0.2d,$T0.2d,#2
7228c2ecf20Sopenharmony_ci	 shrn	$T1.2s,$ACC2,#26
7238c2ecf20Sopenharmony_ci	 xtn	$H2,$ACC2
7248c2ecf20Sopenharmony_ci	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
7258c2ecf20Sopenharmony_ci	 bic	$H1,#0xfc,lsl#24
7268c2ecf20Sopenharmony_ci	 add	$H3,$H3,$T1.2s		// h2 -> h3
7278c2ecf20Sopenharmony_ci	 bic	$H2,#0xfc,lsl#24
7288c2ecf20Sopenharmony_ci
7298c2ecf20Sopenharmony_ci	shrn	$T0.2s,$ACC0,#26
7308c2ecf20Sopenharmony_ci	xtn	$H0,$ACC0
7318c2ecf20Sopenharmony_ci	 ushr	$T1.2s,$H3,#26
7328c2ecf20Sopenharmony_ci	 bic	$H3,#0xfc,lsl#24
7338c2ecf20Sopenharmony_ci	 bic	$H0,#0xfc,lsl#24
7348c2ecf20Sopenharmony_ci	add	$H1,$H1,$T0.2s		// h0 -> h1
7358c2ecf20Sopenharmony_ci	 add	$H4,$H4,$T1.2s		// h3 -> h4
7368c2ecf20Sopenharmony_ci
7378c2ecf20Sopenharmony_ci	b.hi	.Loop_neon
7388c2ecf20Sopenharmony_ci
7398c2ecf20Sopenharmony_ci.Lskip_loop:
7408c2ecf20Sopenharmony_ci	dup	$IN23_2,${IN23_2}[0]
7418c2ecf20Sopenharmony_ci	add	$IN01_2,$IN01_2,$H2
7428c2ecf20Sopenharmony_ci
7438c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
7448c2ecf20Sopenharmony_ci	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
7458c2ecf20Sopenharmony_ci
7468c2ecf20Sopenharmony_ci	adds	$len,$len,#32
7478c2ecf20Sopenharmony_ci	b.ne	.Long_tail
7488c2ecf20Sopenharmony_ci
7498c2ecf20Sopenharmony_ci	dup	$IN23_2,${IN01_2}[0]
7508c2ecf20Sopenharmony_ci	add	$IN23_0,$IN01_0,$H0
7518c2ecf20Sopenharmony_ci	add	$IN23_3,$IN01_3,$H3
7528c2ecf20Sopenharmony_ci	add	$IN23_1,$IN01_1,$H1
7538c2ecf20Sopenharmony_ci	add	$IN23_4,$IN01_4,$H4
7548c2ecf20Sopenharmony_ci
7558c2ecf20Sopenharmony_ci.Long_tail:
7568c2ecf20Sopenharmony_ci	dup	$IN23_0,${IN23_0}[0]
7578c2ecf20Sopenharmony_ci	umull2	$ACC0,$IN23_2,${S3}
7588c2ecf20Sopenharmony_ci	umull2	$ACC3,$IN23_2,${R1}
7598c2ecf20Sopenharmony_ci	umull2	$ACC4,$IN23_2,${R2}
7608c2ecf20Sopenharmony_ci	umull2	$ACC2,$IN23_2,${R0}
7618c2ecf20Sopenharmony_ci	umull2	$ACC1,$IN23_2,${S4}
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	dup	$IN23_1,${IN23_1}[0]
7648c2ecf20Sopenharmony_ci	umlal2	$ACC0,$IN23_0,${R0}
7658c2ecf20Sopenharmony_ci	umlal2	$ACC2,$IN23_0,${R2}
7668c2ecf20Sopenharmony_ci	umlal2	$ACC3,$IN23_0,${R3}
7678c2ecf20Sopenharmony_ci	umlal2	$ACC4,$IN23_0,${R4}
7688c2ecf20Sopenharmony_ci	umlal2	$ACC1,$IN23_0,${R1}
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	dup	$IN23_3,${IN23_3}[0]
7718c2ecf20Sopenharmony_ci	umlal2	$ACC0,$IN23_1,${S4}
7728c2ecf20Sopenharmony_ci	umlal2	$ACC3,$IN23_1,${R2}
7738c2ecf20Sopenharmony_ci	umlal2	$ACC2,$IN23_1,${R1}
7748c2ecf20Sopenharmony_ci	umlal2	$ACC4,$IN23_1,${R3}
7758c2ecf20Sopenharmony_ci	umlal2	$ACC1,$IN23_1,${R0}
7768c2ecf20Sopenharmony_ci
7778c2ecf20Sopenharmony_ci	dup	$IN23_4,${IN23_4}[0]
7788c2ecf20Sopenharmony_ci	umlal2	$ACC3,$IN23_3,${R0}
7798c2ecf20Sopenharmony_ci	umlal2	$ACC4,$IN23_3,${R1}
7808c2ecf20Sopenharmony_ci	umlal2	$ACC0,$IN23_3,${S2}
7818c2ecf20Sopenharmony_ci	umlal2	$ACC1,$IN23_3,${S3}
7828c2ecf20Sopenharmony_ci	umlal2	$ACC2,$IN23_3,${S4}
7838c2ecf20Sopenharmony_ci
7848c2ecf20Sopenharmony_ci	umlal2	$ACC3,$IN23_4,${S4}
7858c2ecf20Sopenharmony_ci	umlal2	$ACC0,$IN23_4,${S1}
7868c2ecf20Sopenharmony_ci	umlal2	$ACC4,$IN23_4,${R0}
7878c2ecf20Sopenharmony_ci	umlal2	$ACC1,$IN23_4,${S2}
7888c2ecf20Sopenharmony_ci	umlal2	$ACC2,$IN23_4,${S3}
7898c2ecf20Sopenharmony_ci
7908c2ecf20Sopenharmony_ci	b.eq	.Lshort_tail
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
7938c2ecf20Sopenharmony_ci	// (hash+inp[0:1])*r^4:r^3 and accumulate
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_ci	add	$IN01_0,$IN01_0,$H0
7968c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_2,${R1}
7978c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_2,${S3}
7988c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_2,${R2}
7998c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_2,${S4}
8008c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_2,${R0}
8018c2ecf20Sopenharmony_ci
8028c2ecf20Sopenharmony_ci	add	$IN01_1,$IN01_1,$H1
8038c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_0,${R3}
8048c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_0,${R0}
8058c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_0,${R4}
8068c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_0,${R1}
8078c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_0,${R2}
8088c2ecf20Sopenharmony_ci
8098c2ecf20Sopenharmony_ci	add	$IN01_3,$IN01_3,$H3
8108c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_1,${R2}
8118c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_1,${S4}
8128c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_1,${R3}
8138c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_1,${R0}
8148c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_1,${R1}
8158c2ecf20Sopenharmony_ci
8168c2ecf20Sopenharmony_ci	add	$IN01_4,$IN01_4,$H4
8178c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_3,${R0}
8188c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_3,${S2}
8198c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_3,${R1}
8208c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_3,${S3}
8218c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_3,${S4}
8228c2ecf20Sopenharmony_ci
8238c2ecf20Sopenharmony_ci	umlal	$ACC3,$IN01_4,${S4}
8248c2ecf20Sopenharmony_ci	umlal	$ACC0,$IN01_4,${S1}
8258c2ecf20Sopenharmony_ci	umlal	$ACC4,$IN01_4,${R0}
8268c2ecf20Sopenharmony_ci	umlal	$ACC1,$IN01_4,${S2}
8278c2ecf20Sopenharmony_ci	umlal	$ACC2,$IN01_4,${S3}
8288c2ecf20Sopenharmony_ci
8298c2ecf20Sopenharmony_ci.Lshort_tail:
8308c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
8318c2ecf20Sopenharmony_ci	// horizontal add
8328c2ecf20Sopenharmony_ci
8338c2ecf20Sopenharmony_ci	addp	$ACC3,$ACC3,$ACC3
8348c2ecf20Sopenharmony_ci	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
8358c2ecf20Sopenharmony_ci	addp	$ACC0,$ACC0,$ACC0
8368c2ecf20Sopenharmony_ci	 ldp	d10,d11,[sp,#32]
8378c2ecf20Sopenharmony_ci	addp	$ACC4,$ACC4,$ACC4
8388c2ecf20Sopenharmony_ci	 ldp	d12,d13,[sp,#48]
8398c2ecf20Sopenharmony_ci	addp	$ACC1,$ACC1,$ACC1
8408c2ecf20Sopenharmony_ci	 ldp	d14,d15,[sp,#64]
8418c2ecf20Sopenharmony_ci	addp	$ACC2,$ACC2,$ACC2
8428c2ecf20Sopenharmony_ci	 ldr	x30,[sp,#8]
8438c2ecf20Sopenharmony_ci
8448c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
8458c2ecf20Sopenharmony_ci	// lazy reduction, but without narrowing
8468c2ecf20Sopenharmony_ci
8478c2ecf20Sopenharmony_ci	ushr	$T0.2d,$ACC3,#26
8488c2ecf20Sopenharmony_ci	and	$ACC3,$ACC3,$MASK.2d
8498c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC0,#26
8508c2ecf20Sopenharmony_ci	 and	$ACC0,$ACC0,$MASK.2d
8518c2ecf20Sopenharmony_ci
8528c2ecf20Sopenharmony_ci	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
8538c2ecf20Sopenharmony_ci	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
8548c2ecf20Sopenharmony_ci
8558c2ecf20Sopenharmony_ci	ushr	$T0.2d,$ACC4,#26
8568c2ecf20Sopenharmony_ci	and	$ACC4,$ACC4,$MASK.2d
8578c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC1,#26
8588c2ecf20Sopenharmony_ci	 and	$ACC1,$ACC1,$MASK.2d
8598c2ecf20Sopenharmony_ci	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
8608c2ecf20Sopenharmony_ci
8618c2ecf20Sopenharmony_ci	add	$ACC0,$ACC0,$T0.2d
8628c2ecf20Sopenharmony_ci	shl	$T0.2d,$T0.2d,#2
8638c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC2,#26
8648c2ecf20Sopenharmony_ci	 and	$ACC2,$ACC2,$MASK.2d
8658c2ecf20Sopenharmony_ci	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
8668c2ecf20Sopenharmony_ci	 add	$ACC3,$ACC3,$T1.2d	// h2 -> h3
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	ushr	$T0.2d,$ACC0,#26
8698c2ecf20Sopenharmony_ci	and	$ACC0,$ACC0,$MASK.2d
8708c2ecf20Sopenharmony_ci	 ushr	$T1.2d,$ACC3,#26
8718c2ecf20Sopenharmony_ci	 and	$ACC3,$ACC3,$MASK.2d
8728c2ecf20Sopenharmony_ci	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
8738c2ecf20Sopenharmony_ci	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
8748c2ecf20Sopenharmony_ci
8758c2ecf20Sopenharmony_ci	////////////////////////////////////////////////////////////////
8768c2ecf20Sopenharmony_ci	// write the result, can be partially reduced
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
8798c2ecf20Sopenharmony_ci	mov	x4,#1
8808c2ecf20Sopenharmony_ci	st1	{$ACC4}[0],[$ctx]
8818c2ecf20Sopenharmony_ci	str	x4,[$ctx,#8]		// set is_base2_26
8828c2ecf20Sopenharmony_ci
8838c2ecf20Sopenharmony_ci	ldr	x29,[sp],#80
8848c2ecf20Sopenharmony_ci	 .inst	0xd50323bf		// autiasp
8858c2ecf20Sopenharmony_ci	ret
8868c2ecf20Sopenharmony_ci.size	poly1305_blocks_neon,.-poly1305_blocks_neon
8878c2ecf20Sopenharmony_ci
8888c2ecf20Sopenharmony_ci.align	5
8898c2ecf20Sopenharmony_ci.Lzeros:
8908c2ecf20Sopenharmony_ci.long	0,0,0,0,0,0,0,0
8918c2ecf20Sopenharmony_ci.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
8928c2ecf20Sopenharmony_ci.align	2
8938c2ecf20Sopenharmony_ci#if !defined(__KERNEL__) && !defined(_WIN64)
8948c2ecf20Sopenharmony_ci.comm	OPENSSL_armcap_P,4,4
8958c2ecf20Sopenharmony_ci.hidden	OPENSSL_armcap_P
8968c2ecf20Sopenharmony_ci#endif
8978c2ecf20Sopenharmony_ci___
8988c2ecf20Sopenharmony_ci
# Post-process the generated text in $code one line at a time and print
# each resulting line to STDOUT.
#
# The first statement in the loop is a single "or"-chain: rules are tried
# top to bottom and the first one that succeeds ends the chain for that
# line.  The "(... or 1)" wrappers make a rule count as taken as soon as
# its mnemonic test matches, even when the substitution itself found
# nothing to change.
foreach (split("\n",$code)) {
	# shrn:  .2d/.4d on the first vector register becomes .2s
	# fmov:  "vN<anything up to the comma>,xM" becomes "dN,xM"
	# dup:   every .2s/.4s becomes .2d
	# eor/and (any mnemonic starting with these): every
	#        .[248][sdh] arrangement becomes .16b
	# umull/umlal:   every .4s becomes .2s
	# umull2/umlal2: every .2s becomes .4s
	# st1..st4 with a "{...}[" register list: every .2d/.4d becomes .s
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	# Applied to every line regardless of the chain above: drop the
	# element count in front of the first "[" index (".4s[" -> ".s[").
	s/\.[124]([sd])\[/.$1\[/;
	# Rewrite the literal text "w#x<digits>" to "w<digits>".
	# NOTE(review): presumably the source spells a 32-bit view of an
	# x register as "w#" followed by the register name -- confirm.
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}
# Buffered write errors only surface when the handle is closed, and for a
# piped handle close() also fails when the child exits non-zero (in which
# case $! is 0 and the status is in $?).  Fail loudly instead of leaving
# a truncated or missing output file behind.
close STDOUT
	or die sprintf("error closing STDOUT: %s (wait status %d)", $!, $?);
914