162306a36Sopenharmony_ci#!/usr/bin/env perl
262306a36Sopenharmony_ci# SPDX-License-Identifier: GPL-2.0
362306a36Sopenharmony_ci
462306a36Sopenharmony_ci# This code is taken from the OpenSSL project but the author (Andy Polyakov)
562306a36Sopenharmony_ci# has relicensed it under the GPLv2. Therefore this program is free software;
662306a36Sopenharmony_ci# you can redistribute it and/or modify it under the terms of the GNU General
762306a36Sopenharmony_ci# Public License version 2 as published by the Free Software Foundation.
862306a36Sopenharmony_ci#
962306a36Sopenharmony_ci# The original headers, including the original license headers, are
1062306a36Sopenharmony_ci# included below for completeness.
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci# ====================================================================
1362306a36Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1462306a36Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
1562306a36Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
1662306a36Sopenharmony_ci# details see https://www.openssl.org/~appro/cryptogams/.
1762306a36Sopenharmony_ci# ====================================================================
1862306a36Sopenharmony_ci
1962306a36Sopenharmony_ci# SHA256 block procedure for ARMv4. May 2007.
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci# Performance is ~2x better than gcc 3.4 generated code and in "abso-
2262306a36Sopenharmony_ci# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
2362306a36Sopenharmony_ci# byte [on single-issue Xscale PXA250 core].
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_ci# July 2010.
2662306a36Sopenharmony_ci#
2762306a36Sopenharmony_ci# Rescheduling for dual-issue pipeline resulted in 22% improvement on
2862306a36Sopenharmony_ci# Cortex A8 core and ~20 cycles per processed byte.
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci# February 2011.
3162306a36Sopenharmony_ci#
3262306a36Sopenharmony_ci# Profiler-assisted and platform-specific optimization resulted in 16%
3362306a36Sopenharmony_ci# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_ci# September 2013.
3662306a36Sopenharmony_ci#
3762306a36Sopenharmony_ci# Add NEON implementation. On Cortex A8 it was measured to process one
3862306a36Sopenharmony_ci# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
3962306a36Sopenharmony_ci# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
4062306a36Sopenharmony_ci# code (meaning that latter performs sub-optimally, nothing was done
4162306a36Sopenharmony_ci# about it).
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ci# May 2014.
4462306a36Sopenharmony_ci#
4562306a36Sopenharmony_ci# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
4662306a36Sopenharmony_ci
# Consume command-line arguments until one looks like a plain file name
# ("name.ext"); that argument, if any, is left in $output.
4762306a36Sopenharmony_ciwhile (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Point STDOUT at that file.  The result of open is not checked.
4862306a36Sopenharmony_ciopen STDOUT,">$output";
4962306a36Sopenharmony_ci
# Register names for the integer code path.  Several names share one
# register: $ctx/$t0 (r0), $inp/$t4 (r1), $len/$t1 (r2) and $T1/$t3 (r3).
5062306a36Sopenharmony_ci$ctx="r0";	$t0="r0";
5162306a36Sopenharmony_ci$inp="r1";	$t4="r1";
5262306a36Sopenharmony_ci$len="r2";	$t1="r2";
5362306a36Sopenharmony_ci$T1="r3";	$t3="r3";
5462306a36Sopenharmony_ci$A="r4";
5562306a36Sopenharmony_ci$B="r5";
5662306a36Sopenharmony_ci$C="r6";
5762306a36Sopenharmony_ci$D="r7";
5862306a36Sopenharmony_ci$E="r8";
5962306a36Sopenharmony_ci$F="r9";
6062306a36Sopenharmony_ci$G="r10";
6162306a36Sopenharmony_ci$H="r11";
# The eight working variables a..h, in the order the round subs take them.
6262306a36Sopenharmony_ci@V=($A,$B,$C,$D,$E,$F,$G,$H);
6362306a36Sopenharmony_ci$t2="r12";
6462306a36Sopenharmony_ci$Ktbl="r14";
6562306a36Sopenharmony_ci
# Amounts used by the SHA-256 Sigma0/Sigma1/sigma0/sigma1 functions.  For
# the upper-case tables all three entries are used as rotates (ror#); for
# the lower-case tables the third entry is used as a logical shift (lsr#).
6662306a36Sopenharmony_ci@Sigma0=( 2,13,22);
6762306a36Sopenharmony_ci@Sigma1=( 6,11,25);
6862306a36Sopenharmony_ci@sigma0=( 7,18, 3);
6962306a36Sopenharmony_ci@sigma1=(17,19,10);
7062306a36Sopenharmony_ci
# Append the assembly for one SHA-256 round of the integer code path to $code.
#   $i      round number; selects the input-loading preamble ($i<16) and is
#           interpolated as a literal into the emitted preprocessor tests
#   $a..$h  names of the registers holding the eight working variables for
#           this round (callers rotate @V between rounds)
# Before returning, the register names held in $t2 and $t3 are exchanged.
7162306a36Sopenharmony_cisub BODY_00_15 {
7262306a36Sopenharmony_cimy ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
7362306a36Sopenharmony_ci
# Rounds 0..15 only: get the next input word into $t1 as a big-endian value
# (byte-reversed load on ARMv7+ unless __ARMEB__, byte-by-byte otherwise),
# add the Maj() value deferred by the previous round, and start Sigma1(e).
# The "#if"/"# if" lines inside the heredocs are preprocessor directives of
# the generated file, not Perl.
7462306a36Sopenharmony_ci$code.=<<___ if ($i<16);
7562306a36Sopenharmony_ci#if __ARM_ARCH__>=7
7662306a36Sopenharmony_ci	@ ldr	$t1,[$inp],#4			@ $i
7762306a36Sopenharmony_ci# if $i==15
7862306a36Sopenharmony_ci	str	$inp,[sp,#17*4]			@ make room for $t4
7962306a36Sopenharmony_ci# endif
8062306a36Sopenharmony_ci	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
8162306a36Sopenharmony_ci	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
8262306a36Sopenharmony_ci	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
8362306a36Sopenharmony_ci# ifndef __ARMEB__
8462306a36Sopenharmony_ci	rev	$t1,$t1
8562306a36Sopenharmony_ci# endif
8662306a36Sopenharmony_ci#else
8762306a36Sopenharmony_ci	@ ldrb	$t1,[$inp,#3]			@ $i
8862306a36Sopenharmony_ci	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
8962306a36Sopenharmony_ci	ldrb	$t2,[$inp,#2]
9062306a36Sopenharmony_ci	ldrb	$t0,[$inp,#1]
9162306a36Sopenharmony_ci	orr	$t1,$t1,$t2,lsl#8
9262306a36Sopenharmony_ci	ldrb	$t2,[$inp],#4
9362306a36Sopenharmony_ci	orr	$t1,$t1,$t0,lsl#16
9462306a36Sopenharmony_ci# if $i==15
9562306a36Sopenharmony_ci	str	$inp,[sp,#17*4]			@ make room for $t4
9662306a36Sopenharmony_ci# endif
9762306a36Sopenharmony_ci	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
9862306a36Sopenharmony_ci	orr	$t1,$t1,$t2,lsl#24
9962306a36Sopenharmony_ci	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
10062306a36Sopenharmony_ci#endif
10162306a36Sopenharmony_ci___
# Common part of every round: h += X[i] + K256[i] + Sigma1(e) + Ch(e,f,g),
# d += h, h += Sigma0(a), with X[i] also written to stack slot i%16.
# Maj(a,b,c) is computed into $t3 but not added here; after the name swap
# below the following round (or the code after the last round) adds it.
10262306a36Sopenharmony_ci$code.=<<___;
10362306a36Sopenharmony_ci	ldr	$t2,[$Ktbl],#4			@ *K256++
10462306a36Sopenharmony_ci	add	$h,$h,$t1			@ h+=X[i]
10562306a36Sopenharmony_ci	str	$t1,[sp,#`$i%16`*4]
10662306a36Sopenharmony_ci	eor	$t1,$f,$g
10762306a36Sopenharmony_ci	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
10862306a36Sopenharmony_ci	and	$t1,$t1,$e
10962306a36Sopenharmony_ci	add	$h,$h,$t2			@ h+=K256[i]
11062306a36Sopenharmony_ci	eor	$t1,$t1,$g			@ Ch(e,f,g)
11162306a36Sopenharmony_ci	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
11262306a36Sopenharmony_ci	add	$h,$h,$t1			@ h+=Ch(e,f,g)
11362306a36Sopenharmony_ci#if $i==31
11462306a36Sopenharmony_ci	and	$t2,$t2,#0xff
11562306a36Sopenharmony_ci	cmp	$t2,#0xf2			@ done?
11662306a36Sopenharmony_ci#endif
11762306a36Sopenharmony_ci#if $i<15
11862306a36Sopenharmony_ci# if __ARM_ARCH__>=7
11962306a36Sopenharmony_ci	ldr	$t1,[$inp],#4			@ prefetch
12062306a36Sopenharmony_ci# else
12162306a36Sopenharmony_ci	ldrb	$t1,[$inp,#3]
12262306a36Sopenharmony_ci# endif
12362306a36Sopenharmony_ci	eor	$t2,$a,$b			@ a^b, b^c in next round
12462306a36Sopenharmony_ci#else
12562306a36Sopenharmony_ci	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
12662306a36Sopenharmony_ci	eor	$t2,$a,$b			@ a^b, b^c in next round
12762306a36Sopenharmony_ci	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
12862306a36Sopenharmony_ci#endif
12962306a36Sopenharmony_ci	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
13062306a36Sopenharmony_ci	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
13162306a36Sopenharmony_ci	add	$d,$d,$h			@ d+=h
13262306a36Sopenharmony_ci	eor	$t3,$t3,$b			@ Maj(a,b,c)
13362306a36Sopenharmony_ci	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
13462306a36Sopenharmony_ci	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
13562306a36Sopenharmony_ci___
# Exchange the two scratch-register names: the register now holding Maj()
# becomes $t2 and the one holding a^b becomes $t3 (b^c of the next round).
13662306a36Sopenharmony_ci	($t2,$t3)=($t3,$t2);
13762306a36Sopenharmony_ci}
13862306a36Sopenharmony_ci
# Append the assembly for one round numbered 16 or higher to $code.  It
# first updates the 16-word message window kept on the stack,
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]   (indices mod 16),
# and then emits the common round body via BODY_00_15.
# Takes the same arguments as BODY_00_15.
13962306a36Sopenharmony_cisub BODY_16_XX {
14062306a36Sopenharmony_cimy ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
14162306a36Sopenharmony_ci
# $t1 and $t4 are expected to hold X[i+1] and X[i+14] on entry; the previous
# round's BODY_00_15 loads them (hence the commented-out ldr lines below).
14262306a36Sopenharmony_ci$code.=<<___;
14362306a36Sopenharmony_ci	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
14462306a36Sopenharmony_ci	@ ldr	$t4,[sp,#`($i+14)%16`*4]
14562306a36Sopenharmony_ci	mov	$t0,$t1,ror#$sigma0[0]
14662306a36Sopenharmony_ci	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
14762306a36Sopenharmony_ci	mov	$t2,$t4,ror#$sigma1[0]
14862306a36Sopenharmony_ci	eor	$t0,$t0,$t1,ror#$sigma0[1]
14962306a36Sopenharmony_ci	eor	$t2,$t2,$t4,ror#$sigma1[1]
15062306a36Sopenharmony_ci	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
15162306a36Sopenharmony_ci	ldr	$t1,[sp,#`($i+0)%16`*4]
15262306a36Sopenharmony_ci	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
15362306a36Sopenharmony_ci	ldr	$t4,[sp,#`($i+9)%16`*4]
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	add	$t2,$t2,$t0
15662306a36Sopenharmony_ci	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
15762306a36Sopenharmony_ci	add	$t1,$t1,$t2
15862306a36Sopenharmony_ci	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
15962306a36Sopenharmony_ci	add	$t1,$t1,$t4			@ X[i]
16062306a36Sopenharmony_ci___
# With $i>=16 BODY_00_15 skips its input-loading preamble and emits only
# the common round body, which also stores the new X[i] back to the stack.
16162306a36Sopenharmony_ci	&BODY_00_15(@_);
16262306a36Sopenharmony_ci}
16362306a36Sopenharmony_ci
# Start of the generated file: architecture/instruction-set selection, the
# K256 round-constant table followed by a zero terminator word, the entry
# point (which, when built outside the kernel, branches to the ARMv8 or
# NEON code according to OPENSSL_armcap_P), the stack frame set-up and the
# head of the per-block loop.
16462306a36Sopenharmony_ci$code=<<___;
16562306a36Sopenharmony_ci#ifndef __KERNEL__
16662306a36Sopenharmony_ci# include "arm_arch.h"
16762306a36Sopenharmony_ci#else
16862306a36Sopenharmony_ci# define __ARM_ARCH__ __LINUX_ARM_ARCH__
16962306a36Sopenharmony_ci# define __ARM_MAX_ARCH__ 7
17062306a36Sopenharmony_ci#endif
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci.text
17362306a36Sopenharmony_ci#if __ARM_ARCH__<7
17462306a36Sopenharmony_ci.code	32
17562306a36Sopenharmony_ci#else
17662306a36Sopenharmony_ci.syntax unified
17762306a36Sopenharmony_ci# ifdef __thumb2__
17862306a36Sopenharmony_ci.thumb
17962306a36Sopenharmony_ci# else
18062306a36Sopenharmony_ci.code   32
18162306a36Sopenharmony_ci# endif
18262306a36Sopenharmony_ci#endif
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci.type	K256,%object
18562306a36Sopenharmony_ci.align	5
18662306a36Sopenharmony_ciK256:
18762306a36Sopenharmony_ci.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
18862306a36Sopenharmony_ci.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
18962306a36Sopenharmony_ci.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
19062306a36Sopenharmony_ci.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
19162306a36Sopenharmony_ci.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
19262306a36Sopenharmony_ci.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
19362306a36Sopenharmony_ci.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
19462306a36Sopenharmony_ci.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
19562306a36Sopenharmony_ci.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
19662306a36Sopenharmony_ci.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
19762306a36Sopenharmony_ci.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
19862306a36Sopenharmony_ci.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
19962306a36Sopenharmony_ci.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
20062306a36Sopenharmony_ci.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
20162306a36Sopenharmony_ci.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
20262306a36Sopenharmony_ci.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
20362306a36Sopenharmony_ci.size	K256,.-K256
20462306a36Sopenharmony_ci.word	0				@ terminator
20562306a36Sopenharmony_ci#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
20662306a36Sopenharmony_ci.LOPENSSL_armcap:
20762306a36Sopenharmony_ci.word	OPENSSL_armcap_P-sha256_block_data_order
20862306a36Sopenharmony_ci#endif
20962306a36Sopenharmony_ci.align	5
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci.global	sha256_block_data_order
21262306a36Sopenharmony_ci.type	sha256_block_data_order,%function
21362306a36Sopenharmony_cisha256_block_data_order:
21462306a36Sopenharmony_ci.Lsha256_block_data_order:
21562306a36Sopenharmony_ci#if __ARM_ARCH__<7
21662306a36Sopenharmony_ci	sub	r3,pc,#8		@ sha256_block_data_order
21762306a36Sopenharmony_ci#else
21862306a36Sopenharmony_ci	adr	r3,.Lsha256_block_data_order
21962306a36Sopenharmony_ci#endif
22062306a36Sopenharmony_ci#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
22162306a36Sopenharmony_ci	ldr	r12,.LOPENSSL_armcap
22262306a36Sopenharmony_ci	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
22362306a36Sopenharmony_ci	tst	r12,#ARMV8_SHA256
22462306a36Sopenharmony_ci	bne	.LARMv8
22562306a36Sopenharmony_ci	tst	r12,#ARMV7_NEON
22662306a36Sopenharmony_ci	bne	.LNEON
22762306a36Sopenharmony_ci#endif
22862306a36Sopenharmony_ci	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
22962306a36Sopenharmony_ci	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
23062306a36Sopenharmony_ci	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
23162306a36Sopenharmony_ci	sub	$Ktbl,r3,#256+32	@ K256
23262306a36Sopenharmony_ci	sub	sp,sp,#16*4		@ alloca(X[16])
23362306a36Sopenharmony_ci.Loop:
23462306a36Sopenharmony_ci# if __ARM_ARCH__>=7
23562306a36Sopenharmony_ci	ldr	$t1,[$inp],#4
23662306a36Sopenharmony_ci# else
23762306a36Sopenharmony_ci	ldrb	$t1,[$inp,#3]
23862306a36Sopenharmony_ci# endif
23962306a36Sopenharmony_ci	eor	$t3,$B,$C		@ magic
24062306a36Sopenharmony_ci	eor	$t2,$t2,$t2
24162306a36Sopenharmony_ci___
# Rounds 0..15 are emitted once.  Rounds 16..31 form the body of the
# .Lrounds_16_xx loop, which the generated code repeats until the K256 word
# loaded in its last round has low byte 0xf2 (as the final table entry,
# 0xc67178f2, does).  @V is rotated by one position after every round.
24262306a36Sopenharmony_cifor($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
24362306a36Sopenharmony_ci$code.=".Lrounds_16_xx:\n";
24462306a36Sopenharmony_cifor (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# After the last round: add the deferred Maj() term, accumulate the working
# variables into the context, continue with .Loop while the input pointer
# differs from the saved end pointer, then drop the frame and return.
24562306a36Sopenharmony_ci$code.=<<___;
24662306a36Sopenharmony_ci#if __ARM_ARCH__>=7
24762306a36Sopenharmony_ci	ite	eq			@ Thumb2 thing, sanity check in ARM
24862306a36Sopenharmony_ci#endif
24962306a36Sopenharmony_ci	ldreq	$t3,[sp,#16*4]		@ pull ctx
25062306a36Sopenharmony_ci	bne	.Lrounds_16_xx
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
25362306a36Sopenharmony_ci	ldr	$t0,[$t3,#0]
25462306a36Sopenharmony_ci	ldr	$t1,[$t3,#4]
25562306a36Sopenharmony_ci	ldr	$t2,[$t3,#8]
25662306a36Sopenharmony_ci	add	$A,$A,$t0
25762306a36Sopenharmony_ci	ldr	$t0,[$t3,#12]
25862306a36Sopenharmony_ci	add	$B,$B,$t1
25962306a36Sopenharmony_ci	ldr	$t1,[$t3,#16]
26062306a36Sopenharmony_ci	add	$C,$C,$t2
26162306a36Sopenharmony_ci	ldr	$t2,[$t3,#20]
26262306a36Sopenharmony_ci	add	$D,$D,$t0
26362306a36Sopenharmony_ci	ldr	$t0,[$t3,#24]
26462306a36Sopenharmony_ci	add	$E,$E,$t1
26562306a36Sopenharmony_ci	ldr	$t1,[$t3,#28]
26662306a36Sopenharmony_ci	add	$F,$F,$t2
26762306a36Sopenharmony_ci	ldr	$inp,[sp,#17*4]		@ pull inp
26862306a36Sopenharmony_ci	ldr	$t2,[sp,#18*4]		@ pull inp+len
26962306a36Sopenharmony_ci	add	$G,$G,$t0
27062306a36Sopenharmony_ci	add	$H,$H,$t1
27162306a36Sopenharmony_ci	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
27262306a36Sopenharmony_ci	cmp	$inp,$t2
27362306a36Sopenharmony_ci	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
27462306a36Sopenharmony_ci	bne	.Loop
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	add	sp,sp,#`16+3`*4	@ destroy frame
27762306a36Sopenharmony_ci#if __ARM_ARCH__>=5
27862306a36Sopenharmony_ci	ldmia	sp!,{r4-r11,pc}
27962306a36Sopenharmony_ci#else
28062306a36Sopenharmony_ci	ldmia	sp!,{r4-r11,lr}
28162306a36Sopenharmony_ci	tst	lr,#1
28262306a36Sopenharmony_ci	moveq	pc,lr			@ be binary compatible with V4, yet
28362306a36Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
28462306a36Sopenharmony_ci#endif
28562306a36Sopenharmony_ci.size	sha256_block_data_order,.-sha256_block_data_order
28662306a36Sopenharmony_ci___
28762306a36Sopenharmony_ci######################################################################
28862306a36Sopenharmony_ci# NEON stuff
28962306a36Sopenharmony_ci#
29062306a36Sopenharmony_ci{{{
# Names used by the NEON code path:
#   @X        q0..q3, the registers the 16 message words are loaded into
#   $T0..$T3  q8..q11, quad-word temporaries
#   $T4,$T5   d24,d25, double-word temporaries
#   $Xfer     same register as $t4; pointer through which X+K256 values
#             are stored to the stack
#   $j        round counter advanced by the body_00_15 snippets
29162306a36Sopenharmony_cimy @X=map("q$_",(0..3));
29262306a36Sopenharmony_cimy ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
29362306a36Sopenharmony_cimy $Xfer=$t4;
29462306a36Sopenharmony_cimy $j=0;
29562306a36Sopenharmony_ci
# Translate a quad register name "qN" into the name of the double register
# numbered 2N (Dlo) or 2N+1 (Dhi).  An argument in which the pattern does
# not match yields the empty string.
29662306a36Sopenharmony_cisub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
29762306a36Sopenharmony_cisub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
29862306a36Sopenharmony_ci
# Catch-all for calls to subs that are not defined, e.g. &vext_8(...) or
# &add(...).  The called name, stripped of its package qualifier and with
# its first "_" turned into ".", is used as the instruction mnemonic; the
# arguments are joined with commas and the line is appended to $code.
# Only the last argument is inspected: if it is numeric (it compares equal
# to itself times one) it is emitted with a "#" prefix.
29962306a36Sopenharmony_cisub AUTOLOAD()          # thunk [simplified] x86-style perlasm
30062306a36Sopenharmony_ci{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
30162306a36Sopenharmony_ci  my $arg = pop;
30262306a36Sopenharmony_ci    $arg = "#$arg" if ($arg*1 eq $arg);
30362306a36Sopenharmony_ci    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
30462306a36Sopenharmony_ci}
30562306a36Sopenharmony_ci
# Emit four rounds of scalar code interleaved with the NEON computation of
# the next four message-schedule words.
#   $body  code ref returning the list of Perl snippets for one round (see
#          body_00_15); it is called four times up front and the snippets
#          are eval'ed one at a time between the NEON instructions below.
# The NEON side updates @X[0] in place, loads the next four K256 words,
# adds the updated @X[0] to them and stores the sums through $Xfer.  On
# return @X has been rotated by one register.
# The position of every eval relative to the NEON calls fixes the order of
# instructions in the output, so statements here must not be reordered.
30662306a36Sopenharmony_cisub Xupdate()
30762306a36Sopenharmony_ci{ use integer;
30862306a36Sopenharmony_ci  my $body = shift;
30962306a36Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);
# Lexicals that the eval'ed snippets assign from @V and then refer to.
31062306a36Sopenharmony_ci  my ($a,$b,$c,$d,$e,$f,$g,$h);
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
31362306a36Sopenharmony_ci	 eval(shift(@insns));
31462306a36Sopenharmony_ci	 eval(shift(@insns));
31562306a36Sopenharmony_ci	 eval(shift(@insns));
31662306a36Sopenharmony_ci	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
31762306a36Sopenharmony_ci	 eval(shift(@insns));
31862306a36Sopenharmony_ci	 eval(shift(@insns));
31962306a36Sopenharmony_ci	 eval(shift(@insns));
32062306a36Sopenharmony_ci	&vshr_u32	($T2,$T0,$sigma0[0]);
32162306a36Sopenharmony_ci	 eval(shift(@insns));
32262306a36Sopenharmony_ci	 eval(shift(@insns));
32362306a36Sopenharmony_ci	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
32462306a36Sopenharmony_ci	 eval(shift(@insns));
32562306a36Sopenharmony_ci	 eval(shift(@insns));
32662306a36Sopenharmony_ci	&vshr_u32	($T1,$T0,$sigma0[2]);
32762306a36Sopenharmony_ci	 eval(shift(@insns));
32862306a36Sopenharmony_ci	 eval(shift(@insns));
32962306a36Sopenharmony_ci	&vsli_32	($T2,$T0,32-$sigma0[0]);
33062306a36Sopenharmony_ci	 eval(shift(@insns));
33162306a36Sopenharmony_ci	 eval(shift(@insns));
33262306a36Sopenharmony_ci	&vshr_u32	($T3,$T0,$sigma0[1]);
33362306a36Sopenharmony_ci	 eval(shift(@insns));
33462306a36Sopenharmony_ci	 eval(shift(@insns));
33562306a36Sopenharmony_ci	&veor		($T1,$T1,$T2);
33662306a36Sopenharmony_ci	 eval(shift(@insns));
33762306a36Sopenharmony_ci	 eval(shift(@insns));
33862306a36Sopenharmony_ci	&vsli_32	($T3,$T0,32-$sigma0[1]);
33962306a36Sopenharmony_ci	 eval(shift(@insns));
34062306a36Sopenharmony_ci	 eval(shift(@insns));
34162306a36Sopenharmony_ci	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
34262306a36Sopenharmony_ci	 eval(shift(@insns));
34362306a36Sopenharmony_ci	 eval(shift(@insns));
34462306a36Sopenharmony_ci	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
34562306a36Sopenharmony_ci	 eval(shift(@insns));
34662306a36Sopenharmony_ci	 eval(shift(@insns));
34762306a36Sopenharmony_ci	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
34862306a36Sopenharmony_ci	 eval(shift(@insns));
34962306a36Sopenharmony_ci	 eval(shift(@insns));
35062306a36Sopenharmony_ci	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
35162306a36Sopenharmony_ci	 eval(shift(@insns));
35262306a36Sopenharmony_ci	 eval(shift(@insns));
35362306a36Sopenharmony_ci	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
35462306a36Sopenharmony_ci	 eval(shift(@insns));
35562306a36Sopenharmony_ci	 eval(shift(@insns));
35662306a36Sopenharmony_ci	  &veor		($T5,$T5,$T4);
35762306a36Sopenharmony_ci	 eval(shift(@insns));
35862306a36Sopenharmony_ci	 eval(shift(@insns));
35962306a36Sopenharmony_ci	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
36062306a36Sopenharmony_ci	 eval(shift(@insns));
36162306a36Sopenharmony_ci	 eval(shift(@insns));
36262306a36Sopenharmony_ci	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
36362306a36Sopenharmony_ci	 eval(shift(@insns));
36462306a36Sopenharmony_ci	 eval(shift(@insns));
36562306a36Sopenharmony_ci	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
36662306a36Sopenharmony_ci	 eval(shift(@insns));
36762306a36Sopenharmony_ci	 eval(shift(@insns));
36862306a36Sopenharmony_ci	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
36962306a36Sopenharmony_ci	 eval(shift(@insns));
37062306a36Sopenharmony_ci	 eval(shift(@insns));
37162306a36Sopenharmony_ci	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
37262306a36Sopenharmony_ci	 eval(shift(@insns));
37362306a36Sopenharmony_ci	 eval(shift(@insns));
37462306a36Sopenharmony_ci	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
37562306a36Sopenharmony_ci	 eval(shift(@insns));
37662306a36Sopenharmony_ci	 eval(shift(@insns));
37762306a36Sopenharmony_ci	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
37862306a36Sopenharmony_ci	 eval(shift(@insns));
37962306a36Sopenharmony_ci	 eval(shift(@insns));
38062306a36Sopenharmony_ci	  &veor		($T5,$T5,$T4);
38162306a36Sopenharmony_ci	 eval(shift(@insns));
38262306a36Sopenharmony_ci	 eval(shift(@insns));
38362306a36Sopenharmony_ci	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
38462306a36Sopenharmony_ci	 eval(shift(@insns));
38562306a36Sopenharmony_ci	 eval(shift(@insns));
38662306a36Sopenharmony_ci	&vld1_32	("{$T0}","[$Ktbl,:128]!");
38762306a36Sopenharmony_ci	 eval(shift(@insns));
38862306a36Sopenharmony_ci	 eval(shift(@insns));
38962306a36Sopenharmony_ci	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
39062306a36Sopenharmony_ci	 eval(shift(@insns));
39162306a36Sopenharmony_ci	 eval(shift(@insns));
39262306a36Sopenharmony_ci	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
39362306a36Sopenharmony_ci	 eval(shift(@insns));
39462306a36Sopenharmony_ci	 eval(shift(@insns));
39562306a36Sopenharmony_ci	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
39662306a36Sopenharmony_ci	 eval(shift(@insns));
39762306a36Sopenharmony_ci	 eval(shift(@insns));
39862306a36Sopenharmony_ci	&vadd_i32	($T0,$T0,@X[0]);
# Run every remaining snippet except the last two, which are kept for
# after the store below.
39962306a36Sopenharmony_ci	 while($#insns>=2) { eval(shift(@insns)); }
40062306a36Sopenharmony_ci	&vst1_32	("{$T0}","[$Xfer,:128]!");
40162306a36Sopenharmony_ci	 eval(shift(@insns));
40262306a36Sopenharmony_ci	 eval(shift(@insns));
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci	push(@X,shift(@X));		# "rotate" X[]
40562306a36Sopenharmony_ci}
40662306a36Sopenharmony_ci
# Emit four rounds of scalar code interleaved with a short NEON sequence
# that byte-swaps the words in @X[0], adds the next four K256 words and
# stores the sums through $Xfer.  No message-schedule update is done here.
#   $body  code ref returning the list of Perl snippets for one round; it
#          is called four times and every snippet is eval'ed exactly once.
# On return @X has been rotated by one register.
40762306a36Sopenharmony_cisub Xpreload()
40862306a36Sopenharmony_ci{ use integer;
40962306a36Sopenharmony_ci  my $body = shift;
41062306a36Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);
# Lexicals that the eval'ed snippets assign from @V and then refer to.
41162306a36Sopenharmony_ci  my ($a,$b,$c,$d,$e,$f,$g,$h);
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_ci	 eval(shift(@insns));
41462306a36Sopenharmony_ci	 eval(shift(@insns));
41562306a36Sopenharmony_ci	 eval(shift(@insns));
41662306a36Sopenharmony_ci	 eval(shift(@insns));
41762306a36Sopenharmony_ci	&vld1_32	("{$T0}","[$Ktbl,:128]!");
41862306a36Sopenharmony_ci	 eval(shift(@insns));
41962306a36Sopenharmony_ci	 eval(shift(@insns));
42062306a36Sopenharmony_ci	 eval(shift(@insns));
42162306a36Sopenharmony_ci	 eval(shift(@insns));
42262306a36Sopenharmony_ci	&vrev32_8	(@X[0],@X[0]);
42362306a36Sopenharmony_ci	 eval(shift(@insns));
42462306a36Sopenharmony_ci	 eval(shift(@insns));
42562306a36Sopenharmony_ci	 eval(shift(@insns));
42662306a36Sopenharmony_ci	 eval(shift(@insns));
42762306a36Sopenharmony_ci	&vadd_i32	($T0,$T0,@X[0]);
42862306a36Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions
42962306a36Sopenharmony_ci	&vst1_32	("{$T0}","[$Xfer,:128]!");
43062306a36Sopenharmony_ci
43162306a36Sopenharmony_ci	push(@X,shift(@X));		# "rotate" X[]
43262306a36Sopenharmony_ci}
43362306a36Sopenharmony_ci
# Return the list of Perl snippets that, eval'ed in order, emit one round of
# scalar SHA-256 code through the AUTOLOAD thunk.  Xupdate and Xpreload
# eval them one list element at a time; pieces joined with "." below form a
# single element and are therefore emitted together.
# The snippets expect $t1 to hold the precomputed X[i]+K[i] value and reload
# $t1 for the following round: from the stack slot of round $j+1, from
# [$Ktbl] when $j is 15, and from [sp,#64] when $j is 31.
# The final snippet advances $j, rotates @V and swaps $t2 with $t3.
43462306a36Sopenharmony_cisub body_00_15 () {
43562306a36Sopenharmony_ci	(
43662306a36Sopenharmony_ci	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
43762306a36Sopenharmony_ci	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
43862306a36Sopenharmony_ci	'&eor	($t1,$f,$g)',
43962306a36Sopenharmony_ci	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
44062306a36Sopenharmony_ci	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
44162306a36Sopenharmony_ci	'&and	($t1,$t1,$e)',
44262306a36Sopenharmony_ci	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
44362306a36Sopenharmony_ci	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
44462306a36Sopenharmony_ci	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
44562306a36Sopenharmony_ci	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
44662306a36Sopenharmony_ci	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
44762306a36Sopenharmony_ci	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
44862306a36Sopenharmony_ci	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
44962306a36Sopenharmony_ci	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
45062306a36Sopenharmony_ci	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
45162306a36Sopenharmony_ci	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
45262306a36Sopenharmony_ci	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
45362306a36Sopenharmony_ci	'&add	($d,$d,$h)',			# d+=h
45462306a36Sopenharmony_ci	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
45562306a36Sopenharmony_ci	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
45662306a36Sopenharmony_ci	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
45762306a36Sopenharmony_ci	)
45862306a36Sopenharmony_ci}
45962306a36Sopenharmony_ci
# NEON entry point.  The prologue carves a 16-byte-aligned frame below sp
# (64 bytes of X+K256 values, then ctx at #64, inp at #68, the end-of-input
# pointer at #72 and the original sp at #76), loads and byte-swaps the first
# input block, and stores X[0..15]+K256[0..15] to the frame.
46062306a36Sopenharmony_ci$code.=<<___;
46162306a36Sopenharmony_ci#if __ARM_MAX_ARCH__>=7
46262306a36Sopenharmony_ci.arch	armv7-a
46362306a36Sopenharmony_ci.fpu	neon
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci.global	sha256_block_data_order_neon
46662306a36Sopenharmony_ci.type	sha256_block_data_order_neon,%function
46762306a36Sopenharmony_ci.align	4
46862306a36Sopenharmony_cisha256_block_data_order_neon:
46962306a36Sopenharmony_ci.LNEON:
47062306a36Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
47162306a36Sopenharmony_ci
47262306a36Sopenharmony_ci	sub	$H,sp,#16*4+16
47362306a36Sopenharmony_ci	adr	$Ktbl,.Lsha256_block_data_order
47462306a36Sopenharmony_ci	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
47562306a36Sopenharmony_ci	bic	$H,$H,#15		@ align for 128-bit stores
47662306a36Sopenharmony_ci	mov	$t2,sp
47762306a36Sopenharmony_ci	mov	sp,$H			@ alloca
47862306a36Sopenharmony_ci	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
47962306a36Sopenharmony_ci
48062306a36Sopenharmony_ci	vld1.8		{@X[0]},[$inp]!
48162306a36Sopenharmony_ci	vld1.8		{@X[1]},[$inp]!
48262306a36Sopenharmony_ci	vld1.8		{@X[2]},[$inp]!
48362306a36Sopenharmony_ci	vld1.8		{@X[3]},[$inp]!
48462306a36Sopenharmony_ci	vld1.32		{$T0},[$Ktbl,:128]!
48562306a36Sopenharmony_ci	vld1.32		{$T1},[$Ktbl,:128]!
48662306a36Sopenharmony_ci	vld1.32		{$T2},[$Ktbl,:128]!
48762306a36Sopenharmony_ci	vld1.32		{$T3},[$Ktbl,:128]!
48862306a36Sopenharmony_ci	vrev32.8	@X[0],@X[0]		@ yes, even on
48962306a36Sopenharmony_ci	str		$ctx,[sp,#64]
49062306a36Sopenharmony_ci	vrev32.8	@X[1],@X[1]		@ big-endian
49162306a36Sopenharmony_ci	str		$inp,[sp,#68]
49262306a36Sopenharmony_ci	mov		$Xfer,sp
49362306a36Sopenharmony_ci	vrev32.8	@X[2],@X[2]
49462306a36Sopenharmony_ci	str		$len,[sp,#72]
49562306a36Sopenharmony_ci	vrev32.8	@X[3],@X[3]
49662306a36Sopenharmony_ci	str		$t2,[sp,#76]		@ save original sp
49762306a36Sopenharmony_ci	vadd.i32	$T0,$T0,@X[0]
49862306a36Sopenharmony_ci	vadd.i32	$T1,$T1,@X[1]
49962306a36Sopenharmony_ci	vst1.32		{$T0},[$Xfer,:128]!
50062306a36Sopenharmony_ci	vadd.i32	$T2,$T2,@X[2]
50162306a36Sopenharmony_ci	vst1.32		{$T1},[$Xfer,:128]!
50262306a36Sopenharmony_ci	vadd.i32	$T3,$T3,@X[3]
50362306a36Sopenharmony_ci	vst1.32		{$T2},[$Xfer,:128]!
50462306a36Sopenharmony_ci	vst1.32		{$T3},[$Xfer,:128]!
50562306a36Sopenharmony_ci
50662306a36Sopenharmony_ci	ldmia		$ctx,{$A-$H}
50762306a36Sopenharmony_ci	sub		$Xfer,$Xfer,#64
50862306a36Sopenharmony_ci	ldr		$t1,[sp,#0]
50962306a36Sopenharmony_ci	eor		$t2,$t2,$t2
51062306a36Sopenharmony_ci	eor		$t3,$B,$C
51162306a36Sopenharmony_ci	b		.L_00_48
51262306a36Sopenharmony_ci
51362306a36Sopenharmony_ci.align	4
51462306a36Sopenharmony_ci.L_00_48:
51562306a36Sopenharmony_ci___
# Body of the .L_00_48 loop: four Xupdate calls, i.e. 16 rounds per pass
# together with the message-schedule words for the following 16 rounds.
51662306a36Sopenharmony_ci	&Xupdate(\&body_00_15);
51762306a36Sopenharmony_ci	&Xupdate(\&body_00_15);
51862306a36Sopenharmony_ci	&Xupdate(\&body_00_15);
51962306a36Sopenharmony_ci	&Xupdate(\&body_00_15);
# $t1 was loaded from [$Ktbl] in the last round of the pass; the loop ends
# once that word is the zero terminator that follows K256.  The next input
# block is then loaded (the pointer is first moved back by 64 bytes when it
# already equals the end pointer) and $Xfer is reset to the frame base.
52062306a36Sopenharmony_ci$code.=<<___;
52162306a36Sopenharmony_ci	teq	$t1,#0				@ check for K256 terminator
52262306a36Sopenharmony_ci	ldr	$t1,[sp,#0]
52362306a36Sopenharmony_ci	sub	$Xfer,$Xfer,#64
52462306a36Sopenharmony_ci	bne	.L_00_48
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci	ldr		$inp,[sp,#68]
52762306a36Sopenharmony_ci	ldr		$t0,[sp,#72]
52862306a36Sopenharmony_ci	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
52962306a36Sopenharmony_ci	teq		$inp,$t0
53062306a36Sopenharmony_ci	it		eq
53162306a36Sopenharmony_ci	subeq		$inp,$inp,#64		@ avoid SEGV
53262306a36Sopenharmony_ci	vld1.8		{@X[0]},[$inp]!		@ load next input block
53362306a36Sopenharmony_ci	vld1.8		{@X[1]},[$inp]!
53462306a36Sopenharmony_ci	vld1.8		{@X[2]},[$inp]!
53562306a36Sopenharmony_ci	vld1.8		{@X[3]},[$inp]!
53662306a36Sopenharmony_ci	it		ne
53762306a36Sopenharmony_ci	strne		$inp,[sp,#68]
53862306a36Sopenharmony_ci	mov		$Xfer,sp
53962306a36Sopenharmony_ci___
# Final 16 rounds of the current block, interleaved with storing the
# byte-swapped words of the block just loaded plus K256[0..15] to the frame.
54062306a36Sopenharmony_ci	&Xpreload(\&body_00_15);
54162306a36Sopenharmony_ci	&Xpreload(\&body_00_15);
54262306a36Sopenharmony_ci	&Xpreload(\&body_00_15);
54362306a36Sopenharmony_ci	&Xpreload(\&body_00_15);
# $t1 now holds the word read from [sp,#64], where the prologue stored the
# context pointer.  Add the working variables into the context; then either
# re-initialise the round state and loop, or restore sp and return.
54462306a36Sopenharmony_ci$code.=<<___;
54562306a36Sopenharmony_ci	ldr	$t0,[$t1,#0]
54662306a36Sopenharmony_ci	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
54762306a36Sopenharmony_ci	ldr	$t2,[$t1,#4]
54862306a36Sopenharmony_ci	ldr	$t3,[$t1,#8]
54962306a36Sopenharmony_ci	ldr	$t4,[$t1,#12]
55062306a36Sopenharmony_ci	add	$A,$A,$t0			@ accumulate
55162306a36Sopenharmony_ci	ldr	$t0,[$t1,#16]
55262306a36Sopenharmony_ci	add	$B,$B,$t2
55362306a36Sopenharmony_ci	ldr	$t2,[$t1,#20]
55462306a36Sopenharmony_ci	add	$C,$C,$t3
55562306a36Sopenharmony_ci	ldr	$t3,[$t1,#24]
55662306a36Sopenharmony_ci	add	$D,$D,$t4
55762306a36Sopenharmony_ci	ldr	$t4,[$t1,#28]
55862306a36Sopenharmony_ci	add	$E,$E,$t0
55962306a36Sopenharmony_ci	str	$A,[$t1],#4
56062306a36Sopenharmony_ci	add	$F,$F,$t2
56162306a36Sopenharmony_ci	str	$B,[$t1],#4
56262306a36Sopenharmony_ci	add	$G,$G,$t3
56362306a36Sopenharmony_ci	str	$C,[$t1],#4
56462306a36Sopenharmony_ci	add	$H,$H,$t4
56562306a36Sopenharmony_ci	str	$D,[$t1],#4
56662306a36Sopenharmony_ci	stmia	$t1,{$E-$H}
56762306a36Sopenharmony_ci
56862306a36Sopenharmony_ci	ittte	ne
56962306a36Sopenharmony_ci	movne	$Xfer,sp
57062306a36Sopenharmony_ci	ldrne	$t1,[sp,#0]
57162306a36Sopenharmony_ci	eorne	$t2,$t2,$t2
57262306a36Sopenharmony_ci	ldreq	sp,[sp,#76]			@ restore original sp
57362306a36Sopenharmony_ci	itt	ne
57462306a36Sopenharmony_ci	eorne	$t3,$B,$C
57562306a36Sopenharmony_ci	bne	.L_00_48
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
57862306a36Sopenharmony_ci.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
57962306a36Sopenharmony_ci#endif
58062306a36Sopenharmony_ci___
58162306a36Sopenharmony_ci}}}
58262306a36Sopenharmony_ci######################################################################
58362306a36Sopenharmony_ci# ARMv8 stuff
58462306a36Sopenharmony_ci#
{{{
# Register assignment for the generated sha256_block_data_order_armv8
# routine.  $ABCD/$EFGH are the two quad-words loaded from and stored
# back to [$ctx]; $abcd is a scratch copy of $ABCD taken before each
# sha256h; @MSG holds the four byte-swapped input quad-words; $W0/$W1
# alternate as the buffer for the next K256 quad-word; the *_SAVE pair
# keeps the values of $ABCD/$EFGH from the top of the loop so they can
# be added back in at the bottom.
my ($ABCD,$EFGH,$abcd) = ("q0","q1","q2");
my @MSG = ("q8","q9","q10","q11");
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE) = ("q12","q13","q14","q15");
my $Ktbl = "r3";

# Prologue: INST() helper macro, K256 address computation, and the top
# of .Loop_v8 (input load, byte reversal, state offload, end-of-input
# comparison whose flags are consumed by the final "bne").
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___

# Twelve unrolled groups that both consume a K256 quad-word and update
# the message schedule (sha256su0/sha256su1).  After every group the
# constant buffers trade places and the message registers rotate by
# one, so the template below can always refer to $W0 and @MSG[0].
for ($i = 0; $i < 12; $i++) {
	$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	my $spent_w = $W0;
	$W0 = $W1;
	$W1 = $spent_w;

	my $oldest_msg = shift(@MSG);
	push(@MSG, $oldest_msg);
}

# Four closing groups without schedule updates.  Twelve rotations and
# twelve swaps have brought @MSG and ($W0,$W1) back to their initial
# order, so the registers are addressed directly here.  The third group
# also rewinds $Ktbl for the next pass through .Loop_v8.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
# Trailer of the generated file: identification string and, for
# non-kernel builds targeting ARMv7+, the OPENSSL_armcap_P common symbol.
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's leading comment block into the output as assembler
# comments: the "#!" line is dropped, a leading '#' becomes '@', blank
# lines are kept, and copying stops at the first line that is neither a
# comment nor blank.
#
# Three-argument open with an explicit '<' mode is used so that no
# character in the script path ($0) can be interpreted as open-mode
# syntax.  Failure to reopen the script only costs the header comments,
# so it is reported and skipped instead of aborting generation.
if (open(my $self_fh, '<', $0)) {
	while(<$self_fh>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self_fh;
} else {
	warn "$0: cannot reopen script to copy header comments: $!\n";
}
68962306a36Sopenharmony_ci
{   # Base opcode words for the SHA-256 instructions; register fields
    # are OR-ed in by unsha256().
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Turn one SHA-256 instruction into an INST(b0,b1,b2,b3) macro
    # invocation carrying its encoding as four bytes, least significant
    # byte first, followed by the original text as an '@' comment.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text, "qD,qN,qM" or "qD,qM" (anything after
    #              the last register is ignored by the parser but kept
    #              in the trailing comment)
    #
    # Returns the replacement text.  If the mnemonic is unknown or the
    # operands cannot be parsed, the instruction is returned unencoded
    # as "$mnemonic\t$arg" so that the assembler gets to see (and report)
    # it, rather than having it silently vanish from the output or be
    # emitted with a bogus opcode.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;
	my $verbatim = "$mnemonic\t$arg";

	return $verbatim unless exists $opcode{$mnemonic};
	return $verbatim
	    unless $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/;

	# The middle register is absent in the two-operand form; its
	# field is then left as zero.
	my ($qd,$qn,$qm) = ($1, (defined($2) ? $2 : 0), $3);

	# Each register number is split into its low three bits and
	# bit 3, which land in separate fields of the opcode word.
	my $word = $opcode{$mnemonic}|(($qd&7)<<13)|(($qd&8)<<19)
				     |(($qn&7)<<17)|(($qn&8)<<4)
				     |(($qm&7)<<1) |(($qm&8)<<2);

	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
71162306a36Sopenharmony_ci
# Post-process the accumulated assembly text line by line and print it.
foreach (split($/,$code)) {

	# Replace every `expr` with the result of evaluating expr as
	# Perl.  The text evaluated here comes from this script's own
	# templates, not from external input.
	s/\`([^\`]*)\`/eval $1/ge;

	# Hand SHA-256 instructions with q-register operands to
	# unsha256() and substitute whatever it returns.
	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

	# "ret" is spelled out as "bx lr"; a "bx lr" written literally in
	# the source is instead emitted as a raw instruction word.
	s/\bret\b/bx\tlr/g		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Buffered write errors (full disk, closed pipe, ...) only surface when
# the handle is closed, so a failure here must fail the whole run rather
# than leave a truncated output file behind with a success status.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
725