#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Directory part of this script's own path (with trailing separator), or
# an empty string when the script was invoked without a directory
# component. Used to locate arm-xlate.pl and ecp_nistz256_table.c.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

if ($flavour && $flavour ne "void") {
    # Pipe everything printed to STDOUT through the arm-xlate.pl
    # translator, looked up next to this script or in ../../perlasm.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # $xlate is quoted so that a path containing spaces survives the shell.
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to the output file, if one
    # was given. Fail loudly, otherwise the generated code would be lost.
    if ($output) {
        open STDOUT,'>',$output or die "can't open $output: $!";
    }
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up in the current directory first and then
# one level above the directory this script lives in.
my $table_fh;
open $table_fh,'<',"ecp_nistz256_table.c" or
open $table_fh,'<',"${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Every TOBN(hi,lo) initializer contributes two elements to @arr, the
# second macro argument first.
while (my $line=<$table_fh>) {
    while ($line =~ m/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr,hex($2),hex($1);
    }
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
# Refuse to generate code from a table of unexpected size.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 sub-tables holds 64 entries of 16 words. Output line $i
# collects byte $i of every entry, so that one .byte line carries 64
# bytes. Note that $i/4 relies on "use integer" being in effect.
for(1..37) {
    @tbl = splice(@arr,0,64*16);
    for($i=0;$i<64;$i++) {
        undef @line;
        for($j=0;$j<64;$j++) {
            push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
        }
        $code.=".byte\t";
        $code.=join(',',map { sprintf "0x%02x",$_} @line);
        $code.="\n";
    }
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
        map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

# Modular helpers. Apart from to_mont/from_mont, which only pick the
# second multiplicand and branch into the Montgomery multiplication entry
# point, every public routine saves r4-r12,lr and calls its __-prefixed
# worker. The mul_by_2, mul_by_3 and neg workers finish by branching to
# the .Lreduce_by_sub/.Lreduce_by_add tails that live inside
# __ecp_nistz256_add and __ecp_nistz256_sub respectively.
$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
# Register allocation private to the Montgomery multiplication code:
# @acc is the nine-register accumulator r3-r11, and the temporaries are
# re-mapped for this scope only.
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]			@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
	stmdb	sp!,{@acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
						@ that it can be addressed
						@ without spending register
						@ on address
	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3			@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=$acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# -      abcd.----.----.----.----.----.----.----

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
678e1051a39Sopenharmony_ci eor $t0,$t0,$t0 679e1051a39Sopenharmony_ci adcs @acc[4],@acc[4],$t2 680e1051a39Sopenharmony_ci ldr $t2,[sp,#24] @ a[6] 681e1051a39Sopenharmony_ci umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i] 682e1051a39Sopenharmony_ci eor $t1,$t1,$t1 683e1051a39Sopenharmony_ci adcs @acc[5],@acc[5],$t3 684e1051a39Sopenharmony_ci ldr $t3,[sp,#28] @ a[7] 685e1051a39Sopenharmony_ci umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i] 686e1051a39Sopenharmony_ci eor $t2,$t2,$t2 687e1051a39Sopenharmony_ci adcs @acc[6],@acc[6],$t4 688e1051a39Sopenharmony_ci ldr @acc[0],[sp,#36] @ restore overflow bit 689e1051a39Sopenharmony_ci umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i] 690e1051a39Sopenharmony_ci eor $t3,$t3,$t3 691e1051a39Sopenharmony_ci adcs @acc[7],@acc[7],$t0 692e1051a39Sopenharmony_ci adcs @acc[8],@acc[8],$t1 693e1051a39Sopenharmony_ci adcs @acc[0],$acc[0],$t2 694e1051a39Sopenharmony_ci adc $t3,$t3,#0 @ new overflow bit 695e1051a39Sopenharmony_ci___ 696e1051a39Sopenharmony_ci push(@acc,shift(@acc)); # rotate registers, so that 697e1051a39Sopenharmony_ci # "r[i]" becomes r[i] 698e1051a39Sopenharmony_ci} 699e1051a39Sopenharmony_ci$code.=<<___; 700e1051a39Sopenharmony_ci @ last multiplication-less reduction 701e1051a39Sopenharmony_ci adds @acc[3],@acc[3],@acc[0] 702e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32] @ restore r_ptr 703e1051a39Sopenharmony_ci adcs @acc[4],@acc[4],#0 704e1051a39Sopenharmony_ci adcs @acc[5],@acc[5],#0 705e1051a39Sopenharmony_ci adcs @acc[6],@acc[6],@acc[0] 706e1051a39Sopenharmony_ci adcs @acc[7],@acc[7],#0 707e1051a39Sopenharmony_ci adcs @acc[8],@acc[8],@acc[0] 708e1051a39Sopenharmony_ci adc $t3,$t3,#0 709e1051a39Sopenharmony_ci subs @acc[7],@acc[7],@acc[0] 710e1051a39Sopenharmony_ci sbcs @acc[8],@acc[8],#0 711e1051a39Sopenharmony_ci sbc @acc[0],$t3,#0 @ overflow bit 712e1051a39Sopenharmony_ci 713e1051a39Sopenharmony_ci @ Final step is "if result > mod, subtract mod", but we do it 714e1051a39Sopenharmony_ci @ "other way around", namely 
subtract modulus from result 715e1051a39Sopenharmony_ci @ and if it borrowed, add modulus back. 716e1051a39Sopenharmony_ci 717e1051a39Sopenharmony_ci adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1 718e1051a39Sopenharmony_ci adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1 719e1051a39Sopenharmony_ci adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1 720e1051a39Sopenharmony_ci sbcs @acc[4],@acc[4],#0 721e1051a39Sopenharmony_ci sbcs @acc[5],@acc[5],#0 722e1051a39Sopenharmony_ci sbcs @acc[6],@acc[6],#0 723e1051a39Sopenharmony_ci sbcs @acc[7],@acc[7],#1 724e1051a39Sopenharmony_ci adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1 725e1051a39Sopenharmony_ci ldr lr,[sp,#44] @ restore lr 726e1051a39Sopenharmony_ci sbc @acc[0],@acc[0],#0 @ broadcast borrow bit 727e1051a39Sopenharmony_ci add sp,sp,#48 728e1051a39Sopenharmony_ci 729e1051a39Sopenharmony_ci @ Note that because mod has special form, i.e. consists of 730e1051a39Sopenharmony_ci @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 731e1051a39Sopenharmony_ci @ broadcasting borrow bit to a register, @acc[0], and using it as 732e1051a39Sopenharmony_ci @ a whole or extracting single bit. 
733e1051a39Sopenharmony_ci 734e1051a39Sopenharmony_ci adds @acc[1],@acc[1],@acc[0] @ add modulus or zero 735e1051a39Sopenharmony_ci adcs @acc[2],@acc[2],@acc[0] 736e1051a39Sopenharmony_ci str @acc[1],[$r_ptr,#0] 737e1051a39Sopenharmony_ci adcs @acc[3],@acc[3],@acc[0] 738e1051a39Sopenharmony_ci str @acc[2],[$r_ptr,#4] 739e1051a39Sopenharmony_ci adcs @acc[4],@acc[4],#0 740e1051a39Sopenharmony_ci str @acc[3],[$r_ptr,#8] 741e1051a39Sopenharmony_ci adcs @acc[5],@acc[5],#0 742e1051a39Sopenharmony_ci str @acc[4],[$r_ptr,#12] 743e1051a39Sopenharmony_ci adcs @acc[6],@acc[6],#0 744e1051a39Sopenharmony_ci str @acc[5],[$r_ptr,#16] 745e1051a39Sopenharmony_ci adcs @acc[7],@acc[7],@acc[0],lsr#31 746e1051a39Sopenharmony_ci str @acc[6],[$r_ptr,#20] 747e1051a39Sopenharmony_ci adc @acc[8],@acc[8],@acc[0] 748e1051a39Sopenharmony_ci str @acc[7],[$r_ptr,#24] 749e1051a39Sopenharmony_ci str @acc[8],[$r_ptr,#28] 750e1051a39Sopenharmony_ci 751e1051a39Sopenharmony_ci mov pc,lr 752e1051a39Sopenharmony_ci.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 753e1051a39Sopenharmony_ci___ 754e1051a39Sopenharmony_ci} 755e1051a39Sopenharmony_ci 756e1051a39Sopenharmony_ci{ 757e1051a39Sopenharmony_cimy ($out,$inp,$index,$mask)=map("r$_",(0..3)); 758e1051a39Sopenharmony_ci$code.=<<___; 759e1051a39Sopenharmony_ci@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, 760e1051a39Sopenharmony_ci@ int r2); 761e1051a39Sopenharmony_ci.globl ecp_nistz256_scatter_w5 762e1051a39Sopenharmony_ci.type ecp_nistz256_scatter_w5,%function 763e1051a39Sopenharmony_ci.align 5 764e1051a39Sopenharmony_ciecp_nistz256_scatter_w5: 765e1051a39Sopenharmony_ci stmdb sp!,{r4-r11} 766e1051a39Sopenharmony_ci 767e1051a39Sopenharmony_ci add $out,$out,$index,lsl#2 768e1051a39Sopenharmony_ci 769e1051a39Sopenharmony_ci ldmia $inp!,{r4-r11} @ X 770e1051a39Sopenharmony_ci str r4,[$out,#64*0-4] 771e1051a39Sopenharmony_ci str r5,[$out,#64*1-4] 772e1051a39Sopenharmony_ci str r6,[$out,#64*2-4] 773e1051a39Sopenharmony_ci str 
r7,[$out,#64*3-4] 774e1051a39Sopenharmony_ci str r8,[$out,#64*4-4] 775e1051a39Sopenharmony_ci str r9,[$out,#64*5-4] 776e1051a39Sopenharmony_ci str r10,[$out,#64*6-4] 777e1051a39Sopenharmony_ci str r11,[$out,#64*7-4] 778e1051a39Sopenharmony_ci add $out,$out,#64*8 779e1051a39Sopenharmony_ci 780e1051a39Sopenharmony_ci ldmia $inp!,{r4-r11} @ Y 781e1051a39Sopenharmony_ci str r4,[$out,#64*0-4] 782e1051a39Sopenharmony_ci str r5,[$out,#64*1-4] 783e1051a39Sopenharmony_ci str r6,[$out,#64*2-4] 784e1051a39Sopenharmony_ci str r7,[$out,#64*3-4] 785e1051a39Sopenharmony_ci str r8,[$out,#64*4-4] 786e1051a39Sopenharmony_ci str r9,[$out,#64*5-4] 787e1051a39Sopenharmony_ci str r10,[$out,#64*6-4] 788e1051a39Sopenharmony_ci str r11,[$out,#64*7-4] 789e1051a39Sopenharmony_ci add $out,$out,#64*8 790e1051a39Sopenharmony_ci 791e1051a39Sopenharmony_ci ldmia $inp,{r4-r11} @ Z 792e1051a39Sopenharmony_ci str r4,[$out,#64*0-4] 793e1051a39Sopenharmony_ci str r5,[$out,#64*1-4] 794e1051a39Sopenharmony_ci str r6,[$out,#64*2-4] 795e1051a39Sopenharmony_ci str r7,[$out,#64*3-4] 796e1051a39Sopenharmony_ci str r8,[$out,#64*4-4] 797e1051a39Sopenharmony_ci str r9,[$out,#64*5-4] 798e1051a39Sopenharmony_ci str r10,[$out,#64*6-4] 799e1051a39Sopenharmony_ci str r11,[$out,#64*7-4] 800e1051a39Sopenharmony_ci 801e1051a39Sopenharmony_ci ldmia sp!,{r4-r11} 802e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__) 803e1051a39Sopenharmony_ci bx lr 804e1051a39Sopenharmony_ci#else 805e1051a39Sopenharmony_ci mov pc,lr 806e1051a39Sopenharmony_ci#endif 807e1051a39Sopenharmony_ci.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 808e1051a39Sopenharmony_ci 809e1051a39Sopenharmony_ci@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, 810e1051a39Sopenharmony_ci@ int r2); 811e1051a39Sopenharmony_ci.globl ecp_nistz256_gather_w5 812e1051a39Sopenharmony_ci.type ecp_nistz256_gather_w5,%function 813e1051a39Sopenharmony_ci.align 5 814e1051a39Sopenharmony_ciecp_nistz256_gather_w5: 815e1051a39Sopenharmony_ci 
stmdb sp!,{r4-r11} 816e1051a39Sopenharmony_ci 817e1051a39Sopenharmony_ci cmp $index,#0 818e1051a39Sopenharmony_ci mov $mask,#0 819e1051a39Sopenharmony_ci#ifdef __thumb2__ 820e1051a39Sopenharmony_ci itt ne 821e1051a39Sopenharmony_ci#endif 822e1051a39Sopenharmony_ci subne $index,$index,#1 823e1051a39Sopenharmony_ci movne $mask,#-1 824e1051a39Sopenharmony_ci add $inp,$inp,$index,lsl#2 825e1051a39Sopenharmony_ci 826e1051a39Sopenharmony_ci ldr r4,[$inp,#64*0] 827e1051a39Sopenharmony_ci ldr r5,[$inp,#64*1] 828e1051a39Sopenharmony_ci ldr r6,[$inp,#64*2] 829e1051a39Sopenharmony_ci and r4,r4,$mask 830e1051a39Sopenharmony_ci ldr r7,[$inp,#64*3] 831e1051a39Sopenharmony_ci and r5,r5,$mask 832e1051a39Sopenharmony_ci ldr r8,[$inp,#64*4] 833e1051a39Sopenharmony_ci and r6,r6,$mask 834e1051a39Sopenharmony_ci ldr r9,[$inp,#64*5] 835e1051a39Sopenharmony_ci and r7,r7,$mask 836e1051a39Sopenharmony_ci ldr r10,[$inp,#64*6] 837e1051a39Sopenharmony_ci and r8,r8,$mask 838e1051a39Sopenharmony_ci ldr r11,[$inp,#64*7] 839e1051a39Sopenharmony_ci add $inp,$inp,#64*8 840e1051a39Sopenharmony_ci and r9,r9,$mask 841e1051a39Sopenharmony_ci and r10,r10,$mask 842e1051a39Sopenharmony_ci and r11,r11,$mask 843e1051a39Sopenharmony_ci stmia $out!,{r4-r11} @ X 844e1051a39Sopenharmony_ci 845e1051a39Sopenharmony_ci ldr r4,[$inp,#64*0] 846e1051a39Sopenharmony_ci ldr r5,[$inp,#64*1] 847e1051a39Sopenharmony_ci ldr r6,[$inp,#64*2] 848e1051a39Sopenharmony_ci and r4,r4,$mask 849e1051a39Sopenharmony_ci ldr r7,[$inp,#64*3] 850e1051a39Sopenharmony_ci and r5,r5,$mask 851e1051a39Sopenharmony_ci ldr r8,[$inp,#64*4] 852e1051a39Sopenharmony_ci and r6,r6,$mask 853e1051a39Sopenharmony_ci ldr r9,[$inp,#64*5] 854e1051a39Sopenharmony_ci and r7,r7,$mask 855e1051a39Sopenharmony_ci ldr r10,[$inp,#64*6] 856e1051a39Sopenharmony_ci and r8,r8,$mask 857e1051a39Sopenharmony_ci ldr r11,[$inp,#64*7] 858e1051a39Sopenharmony_ci add $inp,$inp,#64*8 859e1051a39Sopenharmony_ci and r9,r9,$mask 860e1051a39Sopenharmony_ci and r10,r10,$mask 
861e1051a39Sopenharmony_ci and r11,r11,$mask 862e1051a39Sopenharmony_ci stmia $out!,{r4-r11} @ Y 863e1051a39Sopenharmony_ci 864e1051a39Sopenharmony_ci ldr r4,[$inp,#64*0] 865e1051a39Sopenharmony_ci ldr r5,[$inp,#64*1] 866e1051a39Sopenharmony_ci ldr r6,[$inp,#64*2] 867e1051a39Sopenharmony_ci and r4,r4,$mask 868e1051a39Sopenharmony_ci ldr r7,[$inp,#64*3] 869e1051a39Sopenharmony_ci and r5,r5,$mask 870e1051a39Sopenharmony_ci ldr r8,[$inp,#64*4] 871e1051a39Sopenharmony_ci and r6,r6,$mask 872e1051a39Sopenharmony_ci ldr r9,[$inp,#64*5] 873e1051a39Sopenharmony_ci and r7,r7,$mask 874e1051a39Sopenharmony_ci ldr r10,[$inp,#64*6] 875e1051a39Sopenharmony_ci and r8,r8,$mask 876e1051a39Sopenharmony_ci ldr r11,[$inp,#64*7] 877e1051a39Sopenharmony_ci and r9,r9,$mask 878e1051a39Sopenharmony_ci and r10,r10,$mask 879e1051a39Sopenharmony_ci and r11,r11,$mask 880e1051a39Sopenharmony_ci stmia $out,{r4-r11} @ Z 881e1051a39Sopenharmony_ci 882e1051a39Sopenharmony_ci ldmia sp!,{r4-r11} 883e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__) 884e1051a39Sopenharmony_ci bx lr 885e1051a39Sopenharmony_ci#else 886e1051a39Sopenharmony_ci mov pc,lr 887e1051a39Sopenharmony_ci#endif 888e1051a39Sopenharmony_ci.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 889e1051a39Sopenharmony_ci 890e1051a39Sopenharmony_ci@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, 891e1051a39Sopenharmony_ci@ int r2); 892e1051a39Sopenharmony_ci.globl ecp_nistz256_scatter_w7 893e1051a39Sopenharmony_ci.type ecp_nistz256_scatter_w7,%function 894e1051a39Sopenharmony_ci.align 5 895e1051a39Sopenharmony_ciecp_nistz256_scatter_w7: 896e1051a39Sopenharmony_ci add $out,$out,$index 897e1051a39Sopenharmony_ci mov $index,#64/4 898e1051a39Sopenharmony_ci.Loop_scatter_w7: 899e1051a39Sopenharmony_ci ldr $mask,[$inp],#4 900e1051a39Sopenharmony_ci subs $index,$index,#1 901e1051a39Sopenharmony_ci strb $mask,[$out,#64*0] 902e1051a39Sopenharmony_ci mov $mask,$mask,lsr#8 903e1051a39Sopenharmony_ci strb 
$mask,[$out,#64*1] 904e1051a39Sopenharmony_ci mov $mask,$mask,lsr#8 905e1051a39Sopenharmony_ci strb $mask,[$out,#64*2] 906e1051a39Sopenharmony_ci mov $mask,$mask,lsr#8 907e1051a39Sopenharmony_ci strb $mask,[$out,#64*3] 908e1051a39Sopenharmony_ci add $out,$out,#64*4 909e1051a39Sopenharmony_ci bne .Loop_scatter_w7 910e1051a39Sopenharmony_ci 911e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__) 912e1051a39Sopenharmony_ci bx lr 913e1051a39Sopenharmony_ci#else 914e1051a39Sopenharmony_ci mov pc,lr 915e1051a39Sopenharmony_ci#endif 916e1051a39Sopenharmony_ci.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 917e1051a39Sopenharmony_ci 918e1051a39Sopenharmony_ci@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, 919e1051a39Sopenharmony_ci@ int r2); 920e1051a39Sopenharmony_ci.globl ecp_nistz256_gather_w7 921e1051a39Sopenharmony_ci.type ecp_nistz256_gather_w7,%function 922e1051a39Sopenharmony_ci.align 5 923e1051a39Sopenharmony_ciecp_nistz256_gather_w7: 924e1051a39Sopenharmony_ci stmdb sp!,{r4-r7} 925e1051a39Sopenharmony_ci 926e1051a39Sopenharmony_ci cmp $index,#0 927e1051a39Sopenharmony_ci mov $mask,#0 928e1051a39Sopenharmony_ci#ifdef __thumb2__ 929e1051a39Sopenharmony_ci itt ne 930e1051a39Sopenharmony_ci#endif 931e1051a39Sopenharmony_ci subne $index,$index,#1 932e1051a39Sopenharmony_ci movne $mask,#-1 933e1051a39Sopenharmony_ci add $inp,$inp,$index 934e1051a39Sopenharmony_ci mov $index,#64/4 935e1051a39Sopenharmony_ci nop 936e1051a39Sopenharmony_ci.Loop_gather_w7: 937e1051a39Sopenharmony_ci ldrb r4,[$inp,#64*0] 938e1051a39Sopenharmony_ci subs $index,$index,#1 939e1051a39Sopenharmony_ci ldrb r5,[$inp,#64*1] 940e1051a39Sopenharmony_ci ldrb r6,[$inp,#64*2] 941e1051a39Sopenharmony_ci ldrb r7,[$inp,#64*3] 942e1051a39Sopenharmony_ci add $inp,$inp,#64*4 943e1051a39Sopenharmony_ci orr r4,r4,r5,lsl#8 944e1051a39Sopenharmony_ci orr r4,r4,r6,lsl#16 945e1051a39Sopenharmony_ci orr r4,r4,r7,lsl#24 946e1051a39Sopenharmony_ci and r4,r4,$mask 
947e1051a39Sopenharmony_ci str r4,[$out],#4 948e1051a39Sopenharmony_ci bne .Loop_gather_w7 949e1051a39Sopenharmony_ci 950e1051a39Sopenharmony_ci ldmia sp!,{r4-r7} 951e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__) 952e1051a39Sopenharmony_ci bx lr 953e1051a39Sopenharmony_ci#else 954e1051a39Sopenharmony_ci mov pc,lr 955e1051a39Sopenharmony_ci#endif 956e1051a39Sopenharmony_ci.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 957e1051a39Sopenharmony_ci___ 958e1051a39Sopenharmony_ci} 959e1051a39Sopenharmony_ciif (0) { 960e1051a39Sopenharmony_ci# In comparison to integer-only equivalent of below subroutine: 961e1051a39Sopenharmony_ci# 962e1051a39Sopenharmony_ci# Cortex-A8 +10% 963e1051a39Sopenharmony_ci# Cortex-A9 -10% 964e1051a39Sopenharmony_ci# Snapdragon S4 +5% 965e1051a39Sopenharmony_ci# 966e1051a39Sopenharmony_ci# As not all time is spent in multiplication, overall impact is deemed 967e1051a39Sopenharmony_ci# too low to care about. 968e1051a39Sopenharmony_ci 969e1051a39Sopenharmony_cimy ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7)); 970e1051a39Sopenharmony_cimy $mask="q4"; 971e1051a39Sopenharmony_cimy $mult="q5"; 972e1051a39Sopenharmony_cimy @AxB=map("q$_",(8..15)); 973e1051a39Sopenharmony_ci 974e1051a39Sopenharmony_cimy ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3)); 975e1051a39Sopenharmony_ci 976e1051a39Sopenharmony_ci$code.=<<___; 977e1051a39Sopenharmony_ci#if __ARM_ARCH__>=7 978e1051a39Sopenharmony_ci.fpu neon 979e1051a39Sopenharmony_ci 980e1051a39Sopenharmony_ci.globl ecp_nistz256_mul_mont_neon 981e1051a39Sopenharmony_ci.type ecp_nistz256_mul_mont_neon,%function 982e1051a39Sopenharmony_ci.align 5 983e1051a39Sopenharmony_ciecp_nistz256_mul_mont_neon: 984e1051a39Sopenharmony_ci mov ip,sp 985e1051a39Sopenharmony_ci stmdb sp!,{r4-r9} 986e1051a39Sopenharmony_ci vstmdb sp!,{q4-q5} @ ABI specification says so 987e1051a39Sopenharmony_ci 988e1051a39Sopenharmony_ci sub $toutptr,sp,#40 989e1051a39Sopenharmony_ci vld1.32 {${Bi}[0]},[$bptr,:32]! 
990e1051a39Sopenharmony_ci veor $zero,$zero,$zero 991e1051a39Sopenharmony_ci vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-( 992e1051a39Sopenharmony_ci vzip.16 $Bi,$zero 993e1051a39Sopenharmony_ci mov sp,$toutptr @ alloca 994e1051a39Sopenharmony_ci vmov.i64 $mask,#0xffff 995e1051a39Sopenharmony_ci 996e1051a39Sopenharmony_ci vmull.u32 @AxB[0],$Bi,${A0}[0] 997e1051a39Sopenharmony_ci vmull.u32 @AxB[1],$Bi,${A0}[1] 998e1051a39Sopenharmony_ci vmull.u32 @AxB[2],$Bi,${A1}[0] 999e1051a39Sopenharmony_ci vmull.u32 @AxB[3],$Bi,${A1}[1] 1000e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[0]#lo,#16 1001e1051a39Sopenharmony_ci vmull.u32 @AxB[4],$Bi,${A2}[0] 1002e1051a39Sopenharmony_ci vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 1003e1051a39Sopenharmony_ci vmull.u32 @AxB[5],$Bi,${A2}[1] 1004e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0] 1005e1051a39Sopenharmony_ci vmull.u32 @AxB[6],$Bi,${A3}[0] 1006e1051a39Sopenharmony_ci vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1007e1051a39Sopenharmony_ci vmull.u32 @AxB[7],$Bi,${A3}[1] 1008e1051a39Sopenharmony_ci___ 1009e1051a39Sopenharmony_cifor($i=1;$i<8;$i++) { 1010e1051a39Sopenharmony_ci$code.=<<___; 1011e1051a39Sopenharmony_ci vld1.32 {${Bi}[0]},[$bptr,:32]! 
1012e1051a39Sopenharmony_ci veor $zero,$zero,$zero 1013e1051a39Sopenharmony_ci vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction 1014e1051a39Sopenharmony_ci vshl.u64 $mult,@AxB[0],#32 1015e1051a39Sopenharmony_ci vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1016e1051a39Sopenharmony_ci vsub.u64 $mult,$mult,@AxB[0] 1017e1051a39Sopenharmony_ci vzip.16 $Bi,$zero 1018e1051a39Sopenharmony_ci vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1019e1051a39Sopenharmony_ci vadd.u64 @AxB[7],@AxB[7],$mult 1020e1051a39Sopenharmony_ci___ 1021e1051a39Sopenharmony_ci push(@AxB,shift(@AxB)); 1022e1051a39Sopenharmony_ci$code.=<<___; 1023e1051a39Sopenharmony_ci vmlal.u32 @AxB[0],$Bi,${A0}[0] 1024e1051a39Sopenharmony_ci vmlal.u32 @AxB[1],$Bi,${A0}[1] 1025e1051a39Sopenharmony_ci vmlal.u32 @AxB[2],$Bi,${A1}[0] 1026e1051a39Sopenharmony_ci vmlal.u32 @AxB[3],$Bi,${A1}[1] 1027e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[0]#lo,#16 1028e1051a39Sopenharmony_ci vmlal.u32 @AxB[4],$Bi,${A2}[0] 1029e1051a39Sopenharmony_ci vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 1030e1051a39Sopenharmony_ci vmlal.u32 @AxB[5],$Bi,${A2}[1] 1031e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0] 1032e1051a39Sopenharmony_ci vmlal.u32 @AxB[6],$Bi,${A3}[0] 1033e1051a39Sopenharmony_ci vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1034e1051a39Sopenharmony_ci vmull.u32 @AxB[7],$Bi,${A3}[1] 1035e1051a39Sopenharmony_ci___ 1036e1051a39Sopenharmony_ci} 1037e1051a39Sopenharmony_ci$code.=<<___; 1038e1051a39Sopenharmony_ci vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction 1039e1051a39Sopenharmony_ci vshl.u64 $mult,@AxB[0],#32 1040e1051a39Sopenharmony_ci vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1041e1051a39Sopenharmony_ci vsub.u64 $mult,$mult,@AxB[0] 1042e1051a39Sopenharmony_ci vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1043e1051a39Sopenharmony_ci vadd.u64 @AxB[7],@AxB[7],$mult 1044e1051a39Sopenharmony_ci 1045e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[1]#lo,#16 @ convert 1046e1051a39Sopenharmony_ci vadd.u64 
@AxB[1]#hi,@AxB[1]#hi,$temp 1047e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[1]#hi,#16 1048e1051a39Sopenharmony_ci vzip.16 @AxB[1]#lo,@AxB[1]#hi 1049e1051a39Sopenharmony_ci___ 1050e1051a39Sopenharmony_ciforeach (2..7) { 1051e1051a39Sopenharmony_ci$code.=<<___; 1052e1051a39Sopenharmony_ci vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp 1053e1051a39Sopenharmony_ci vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]! 1054e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[$_]#lo,#16 1055e1051a39Sopenharmony_ci vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp 1056e1051a39Sopenharmony_ci vshr.u64 $temp,@AxB[$_]#hi,#16 1057e1051a39Sopenharmony_ci vzip.16 @AxB[$_]#lo,@AxB[$_]#hi 1058e1051a39Sopenharmony_ci___ 1059e1051a39Sopenharmony_ci} 1060e1051a39Sopenharmony_ci$code.=<<___; 1061e1051a39Sopenharmony_ci vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]! 1062e1051a39Sopenharmony_ci vst1.32 {$temp},[$toutptr] @ upper 33 bits 1063e1051a39Sopenharmony_ci 1064e1051a39Sopenharmony_ci ldr r1,[sp,#0] 1065e1051a39Sopenharmony_ci ldr r2,[sp,#4] 1066e1051a39Sopenharmony_ci ldr r3,[sp,#8] 1067e1051a39Sopenharmony_ci subs r1,r1,#-1 1068e1051a39Sopenharmony_ci ldr r4,[sp,#12] 1069e1051a39Sopenharmony_ci sbcs r2,r2,#-1 1070e1051a39Sopenharmony_ci ldr r5,[sp,#16] 1071e1051a39Sopenharmony_ci sbcs r3,r3,#-1 1072e1051a39Sopenharmony_ci ldr r6,[sp,#20] 1073e1051a39Sopenharmony_ci sbcs r4,r4,#0 1074e1051a39Sopenharmony_ci ldr r7,[sp,#24] 1075e1051a39Sopenharmony_ci sbcs r5,r5,#0 1076e1051a39Sopenharmony_ci ldr r8,[sp,#28] 1077e1051a39Sopenharmony_ci sbcs r6,r6,#0 1078e1051a39Sopenharmony_ci ldr r9,[sp,#32] @ top-most bit 1079e1051a39Sopenharmony_ci sbcs r7,r7,#1 1080e1051a39Sopenharmony_ci sub sp,ip,#40+16 1081e1051a39Sopenharmony_ci sbcs r8,r8,#-1 1082e1051a39Sopenharmony_ci sbc r9,r9,#0 1083e1051a39Sopenharmony_ci vldmia sp!,{q4-q5} 1084e1051a39Sopenharmony_ci 1085e1051a39Sopenharmony_ci adds r1,r1,r9 1086e1051a39Sopenharmony_ci adcs r2,r2,r9 1087e1051a39Sopenharmony_ci str r1,[$rptr,#0] 1088e1051a39Sopenharmony_ci adcs r3,r3,r9 
1089e1051a39Sopenharmony_ci str r2,[$rptr,#4] 1090e1051a39Sopenharmony_ci adcs r4,r4,#0 1091e1051a39Sopenharmony_ci str r3,[$rptr,#8] 1092e1051a39Sopenharmony_ci adcs r5,r5,#0 1093e1051a39Sopenharmony_ci str r4,[$rptr,#12] 1094e1051a39Sopenharmony_ci adcs r6,r6,#0 1095e1051a39Sopenharmony_ci str r5,[$rptr,#16] 1096e1051a39Sopenharmony_ci adcs r7,r7,r9,lsr#31 1097e1051a39Sopenharmony_ci str r6,[$rptr,#20] 1098e1051a39Sopenharmony_ci adcs r8,r8,r9 1099e1051a39Sopenharmony_ci str r7,[$rptr,#24] 1100e1051a39Sopenharmony_ci str r8,[$rptr,#28] 1101e1051a39Sopenharmony_ci 1102e1051a39Sopenharmony_ci ldmia sp!,{r4-r9} 1103e1051a39Sopenharmony_ci bx lr 1104e1051a39Sopenharmony_ci.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon 1105e1051a39Sopenharmony_ci#endif 1106e1051a39Sopenharmony_ci___ 1107e1051a39Sopenharmony_ci} 1108e1051a39Sopenharmony_ci 1109e1051a39Sopenharmony_ci{{{ 1110e1051a39Sopenharmony_ci######################################################################## 1111e1051a39Sopenharmony_ci# Below $aN assignment matches order in which 256-bit result appears in 1112e1051a39Sopenharmony_ci# register bank at return from __ecp_nistz256_mul_mont, so that we can 1113e1051a39Sopenharmony_ci# skip over reloading it from memory. This means that below functions 1114e1051a39Sopenharmony_ci# use custom calling sequence accepting 256-bit input in registers, 1115e1051a39Sopenharmony_ci# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. 1116e1051a39Sopenharmony_ci# 1117e1051a39Sopenharmony_ci# See their "normal" counterparts for insights on calculations. 
# Register aliases used by the three internal helpers emitted below:
#   $a0 = r11, $a1..$a7 = r3..r9       256-bit operand/result; the carry
#                                      chains start at $a0, i.e. it holds
#                                      the least significant word
#   $t0 = r10, $t1 = r12, $t2 = r14, $t3 = r1    scratch words
#   $ff                                same register as $b_ptr
#
#   __ecp_nistz256_sub_from:  a = a - [$b_ptr]
#   __ecp_nistz256_sub_morf:  a = [$b_ptr] - a
#   __ecp_nistz256_add_self:  a = a + a, then the modulus is subtracted
#
# In all three the final borrow is turned into $ff (0 or all ones) and a
# modulus synthesized from it is added back: words 0-2 and 7 add $ff,
# words 3-5 add 0, word 6 adds $ff>>31.  The eight result words are
# stored to [$r_ptr] and are also still in $a0-$a7 on return.
#
# r14 is lr, and sub_from/sub_morf load data into $t2, so they push lr on
# entry and pop it after the last use of $t2.  add_self uses no $t
# register and does not touch the stack.  All three overwrite $ff, so the
# value passed in $b_ptr is not preserved across a call.
1118e1051a39Sopenharmony_ci 1119e1051a39Sopenharmony_cimy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, 1120e1051a39Sopenharmony_ci $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); 1121e1051a39Sopenharmony_cimy $ff=$b_ptr; 1122e1051a39Sopenharmony_ci 1123e1051a39Sopenharmony_ci$code.=<<___; 1124e1051a39Sopenharmony_ci.type __ecp_nistz256_sub_from,%function 1125e1051a39Sopenharmony_ci.align 5 1126e1051a39Sopenharmony_ci__ecp_nistz256_sub_from: 1127e1051a39Sopenharmony_ci str lr,[sp,#-4]! @ push lr 1128e1051a39Sopenharmony_ci 1129e1051a39Sopenharmony_ci ldr $t0,[$b_ptr,#0] 1130e1051a39Sopenharmony_ci ldr $t1,[$b_ptr,#4] 1131e1051a39Sopenharmony_ci ldr $t2,[$b_ptr,#8] 1132e1051a39Sopenharmony_ci ldr $t3,[$b_ptr,#12] 1133e1051a39Sopenharmony_ci subs $a0,$a0,$t0 1134e1051a39Sopenharmony_ci ldr $t0,[$b_ptr,#16] 1135e1051a39Sopenharmony_ci sbcs $a1,$a1,$t1 1136e1051a39Sopenharmony_ci ldr $t1,[$b_ptr,#20] 1137e1051a39Sopenharmony_ci sbcs $a2,$a2,$t2 1138e1051a39Sopenharmony_ci ldr $t2,[$b_ptr,#24] 1139e1051a39Sopenharmony_ci sbcs $a3,$a3,$t3 1140e1051a39Sopenharmony_ci ldr $t3,[$b_ptr,#28] 1141e1051a39Sopenharmony_ci sbcs $a4,$a4,$t0 1142e1051a39Sopenharmony_ci sbcs $a5,$a5,$t1 1143e1051a39Sopenharmony_ci sbcs $a6,$a6,$t2 1144e1051a39Sopenharmony_ci sbcs $a7,$a7,$t3 1145e1051a39Sopenharmony_ci sbc $ff,$ff,$ff @ broadcast borrow bit 1146e1051a39Sopenharmony_ci ldr lr,[sp],#4 @ pop lr 1147e1051a39Sopenharmony_ci 1148e1051a39Sopenharmony_ci adds $a0,$a0,$ff @ add synthesized modulus 1149e1051a39Sopenharmony_ci adcs $a1,$a1,$ff 1150e1051a39Sopenharmony_ci str $a0,[$r_ptr,#0] 1151e1051a39Sopenharmony_ci adcs $a2,$a2,$ff 1152e1051a39Sopenharmony_ci str $a1,[$r_ptr,#4] 1153e1051a39Sopenharmony_ci adcs $a3,$a3,#0 1154e1051a39Sopenharmony_ci str $a2,[$r_ptr,#8] 1155e1051a39Sopenharmony_ci adcs $a4,$a4,#0 1156e1051a39Sopenharmony_ci str $a3,[$r_ptr,#12] 1157e1051a39Sopenharmony_ci adcs $a5,$a5,#0 1158e1051a39Sopenharmony_ci str $a4,[$r_ptr,#16] 1159e1051a39Sopenharmony_ci adcs $a6,$a6,$ff,lsr#31
1160e1051a39Sopenharmony_ci str $a5,[$r_ptr,#20] 1161e1051a39Sopenharmony_ci adcs $a7,$a7,$ff 1162e1051a39Sopenharmony_ci str $a6,[$r_ptr,#24] 1163e1051a39Sopenharmony_ci str $a7,[$r_ptr,#28] 1164e1051a39Sopenharmony_ci 1165e1051a39Sopenharmony_ci mov pc,lr 1166e1051a39Sopenharmony_ci.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 1167e1051a39Sopenharmony_ci 1168e1051a39Sopenharmony_ci.type __ecp_nistz256_sub_morf,%function 1169e1051a39Sopenharmony_ci.align 5 1170e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf: 1171e1051a39Sopenharmony_ci str lr,[sp,#-4]! @ push lr 1172e1051a39Sopenharmony_ci 1173e1051a39Sopenharmony_ci ldr $t0,[$b_ptr,#0] 1174e1051a39Sopenharmony_ci ldr $t1,[$b_ptr,#4] 1175e1051a39Sopenharmony_ci ldr $t2,[$b_ptr,#8] 1176e1051a39Sopenharmony_ci ldr $t3,[$b_ptr,#12] 1177e1051a39Sopenharmony_ci subs $a0,$t0,$a0 1178e1051a39Sopenharmony_ci ldr $t0,[$b_ptr,#16] 1179e1051a39Sopenharmony_ci sbcs $a1,$t1,$a1 1180e1051a39Sopenharmony_ci ldr $t1,[$b_ptr,#20] 1181e1051a39Sopenharmony_ci sbcs $a2,$t2,$a2 1182e1051a39Sopenharmony_ci ldr $t2,[$b_ptr,#24] 1183e1051a39Sopenharmony_ci sbcs $a3,$t3,$a3 1184e1051a39Sopenharmony_ci ldr $t3,[$b_ptr,#28] 1185e1051a39Sopenharmony_ci sbcs $a4,$t0,$a4 1186e1051a39Sopenharmony_ci sbcs $a5,$t1,$a5 1187e1051a39Sopenharmony_ci sbcs $a6,$t2,$a6 1188e1051a39Sopenharmony_ci sbcs $a7,$t3,$a7 1189e1051a39Sopenharmony_ci sbc $ff,$ff,$ff @ broadcast borrow bit 1190e1051a39Sopenharmony_ci ldr lr,[sp],#4 @ pop lr 1191e1051a39Sopenharmony_ci 1192e1051a39Sopenharmony_ci adds $a0,$a0,$ff @ add synthesized modulus 1193e1051a39Sopenharmony_ci adcs $a1,$a1,$ff 1194e1051a39Sopenharmony_ci str $a0,[$r_ptr,#0] 1195e1051a39Sopenharmony_ci adcs $a2,$a2,$ff 1196e1051a39Sopenharmony_ci str $a1,[$r_ptr,#4] 1197e1051a39Sopenharmony_ci adcs $a3,$a3,#0 1198e1051a39Sopenharmony_ci str $a2,[$r_ptr,#8] 1199e1051a39Sopenharmony_ci adcs $a4,$a4,#0 1200e1051a39Sopenharmony_ci str $a3,[$r_ptr,#12] 1201e1051a39Sopenharmony_ci adcs $a5,$a5,#0
1202e1051a39Sopenharmony_ci str $a4,[$r_ptr,#16] 1203e1051a39Sopenharmony_ci adcs $a6,$a6,$ff,lsr#31 1204e1051a39Sopenharmony_ci str $a5,[$r_ptr,#20] 1205e1051a39Sopenharmony_ci adcs $a7,$a7,$ff 1206e1051a39Sopenharmony_ci str $a6,[$r_ptr,#24] 1207e1051a39Sopenharmony_ci str $a7,[$r_ptr,#28] 1208e1051a39Sopenharmony_ci 1209e1051a39Sopenharmony_ci mov pc,lr 1210e1051a39Sopenharmony_ci.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 1211e1051a39Sopenharmony_ci 1212e1051a39Sopenharmony_ci.type __ecp_nistz256_add_self,%function 1213e1051a39Sopenharmony_ci.align 4 1214e1051a39Sopenharmony_ci__ecp_nistz256_add_self: 1215e1051a39Sopenharmony_ci adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] 1216e1051a39Sopenharmony_ci adcs $a1,$a1,$a1 1217e1051a39Sopenharmony_ci adcs $a2,$a2,$a2 1218e1051a39Sopenharmony_ci adcs $a3,$a3,$a3 1219e1051a39Sopenharmony_ci adcs $a4,$a4,$a4 1220e1051a39Sopenharmony_ci adcs $a5,$a5,$a5 1221e1051a39Sopenharmony_ci adcs $a6,$a6,$a6 1222e1051a39Sopenharmony_ci mov $ff,#0 1223e1051a39Sopenharmony_ci adcs $a7,$a7,$a7 1224e1051a39Sopenharmony_ci adc $ff,$ff,#0 1225e1051a39Sopenharmony_ci 1226e1051a39Sopenharmony_ci @ if a+b >= modulus, subtract modulus. 1227e1051a39Sopenharmony_ci @ 1228e1051a39Sopenharmony_ci @ But since comparison implies subtraction, we subtract 1229e1051a39Sopenharmony_ci @ modulus and then add it back if subtraction borrowed. 1230e1051a39Sopenharmony_ci 1231e1051a39Sopenharmony_ci subs $a0,$a0,#-1 1232e1051a39Sopenharmony_ci sbcs $a1,$a1,#-1 1233e1051a39Sopenharmony_ci sbcs $a2,$a2,#-1 1234e1051a39Sopenharmony_ci sbcs $a3,$a3,#0 1235e1051a39Sopenharmony_ci sbcs $a4,$a4,#0 1236e1051a39Sopenharmony_ci sbcs $a5,$a5,#0 1237e1051a39Sopenharmony_ci sbcs $a6,$a6,#1 1238e1051a39Sopenharmony_ci sbcs $a7,$a7,#-1 1239e1051a39Sopenharmony_ci sbc $ff,$ff,#0 1240e1051a39Sopenharmony_ci 1241e1051a39Sopenharmony_ci @ Note that because mod has special form, i.e.
consists of 1242e1051a39Sopenharmony_ci @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1243e1051a39Sopenharmony_ci @ using value of borrow as a whole or extracting single bit. 1244e1051a39Sopenharmony_ci @ Follow $ff register... 1245e1051a39Sopenharmony_ci 1246e1051a39Sopenharmony_ci adds $a0,$a0,$ff @ add synthesized modulus 1247e1051a39Sopenharmony_ci adcs $a1,$a1,$ff 1248e1051a39Sopenharmony_ci str $a0,[$r_ptr,#0] 1249e1051a39Sopenharmony_ci adcs $a2,$a2,$ff 1250e1051a39Sopenharmony_ci str $a1,[$r_ptr,#4] 1251e1051a39Sopenharmony_ci adcs $a3,$a3,#0 1252e1051a39Sopenharmony_ci str $a2,[$r_ptr,#8] 1253e1051a39Sopenharmony_ci adcs $a4,$a4,#0 1254e1051a39Sopenharmony_ci str $a3,[$r_ptr,#12] 1255e1051a39Sopenharmony_ci adcs $a5,$a5,#0 1256e1051a39Sopenharmony_ci str $a4,[$r_ptr,#16] 1257e1051a39Sopenharmony_ci adcs $a6,$a6,$ff,lsr#31 1258e1051a39Sopenharmony_ci str $a5,[$r_ptr,#20] 1259e1051a39Sopenharmony_ci adcs $a7,$a7,$ff 1260e1051a39Sopenharmony_ci str $a6,[$r_ptr,#24] 1261e1051a39Sopenharmony_ci str $a7,[$r_ptr,#28] 1262e1051a39Sopenharmony_ci 1263e1051a39Sopenharmony_ci mov pc,lr 1264e1051a39Sopenharmony_ci.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self 1265e1051a39Sopenharmony_ci 1266e1051a39Sopenharmony_ci___ 1267e1051a39Sopenharmony_ci 1268e1051a39Sopenharmony_ci######################################################################## 1269e1051a39Sopenharmony_ci# following subroutines are "literal" implementation of those found in 1270e1051a39Sopenharmony_ci# ecp_nistz256.c 1271e1051a39Sopenharmony_ci# 1272e1051a39Sopenharmony_ci######################################################################## 1273e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 1274e1051a39Sopenharmony_ci# 1275e1051a39Sopenharmony_ci{ 1276e1051a39Sopenharmony_cimy ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1277e1051a39Sopenharmony_ci# above map() describes stack layout with 5 temporary
1278e1051a39Sopenharmony_ci# 256-bit vectors on top. Then note that we push 1279e1051a39Sopenharmony_ci# starting from r0, which means that we have copy of 1280e1051a39Sopenharmony_ci# input arguments just below these temporary vectors. 1281e1051a39Sopenharmony_ci 1282e1051a39Sopenharmony_ci$code.=<<___; 1283e1051a39Sopenharmony_ci.globl ecp_nistz256_point_double 1284e1051a39Sopenharmony_ci.type ecp_nistz256_point_double,%function 1285e1051a39Sopenharmony_ci.align 5 1286e1051a39Sopenharmony_ciecp_nistz256_point_double: 1287e1051a39Sopenharmony_ci stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1288e1051a39Sopenharmony_ci sub sp,sp,#32*5 1289e1051a39Sopenharmony_ci 1290e1051a39Sopenharmony_ci.Lpoint_double_shortcut: 1291e1051a39Sopenharmony_ci add r3,sp,#$in_x 1292e1051a39Sopenharmony_ci ldmia $a_ptr!,{r4-r11} @ copy in_x 1293e1051a39Sopenharmony_ci stmia r3,{r4-r11} 1294e1051a39Sopenharmony_ci 1295e1051a39Sopenharmony_ci add $r_ptr,sp,#$S 1296e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); 1297e1051a39Sopenharmony_ci 1298e1051a39Sopenharmony_ci add $b_ptr,$a_ptr,#32 1299e1051a39Sopenharmony_ci add $a_ptr,$a_ptr,#32 1300e1051a39Sopenharmony_ci add $r_ptr,sp,#$Zsqr 1301e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); 1302e1051a39Sopenharmony_ci 1303e1051a39Sopenharmony_ci add $a_ptr,sp,#$S 1304e1051a39Sopenharmony_ci add $b_ptr,sp,#$S 1305e1051a39Sopenharmony_ci add $r_ptr,sp,#$S 1306e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); 1307e1051a39Sopenharmony_ci 1308e1051a39Sopenharmony_ci ldr $b_ptr,[sp,#32*5+4] 1309e1051a39Sopenharmony_ci add $a_ptr,$b_ptr,#32 1310e1051a39Sopenharmony_ci add $b_ptr,$b_ptr,#64 1311e1051a39Sopenharmony_ci add $r_ptr,sp,#$tmp0 1312e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); 1313e1051a39Sopenharmony_ci 1314e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*5] 1315e1051a39Sopenharmony_ci add 
$r_ptr,$r_ptr,#64 1316e1051a39Sopenharmony_ci bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); 1317e1051a39Sopenharmony_ci 1318e1051a39Sopenharmony_ci add $a_ptr,sp,#$in_x 1319e1051a39Sopenharmony_ci add $b_ptr,sp,#$Zsqr 1320e1051a39Sopenharmony_ci add $r_ptr,sp,#$M 1321e1051a39Sopenharmony_ci bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); 1322e1051a39Sopenharmony_ci 1323e1051a39Sopenharmony_ci add $a_ptr,sp,#$in_x 1324e1051a39Sopenharmony_ci add $b_ptr,sp,#$Zsqr 1325e1051a39Sopenharmony_ci add $r_ptr,sp,#$Zsqr 1326e1051a39Sopenharmony_ci bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); 1327e1051a39Sopenharmony_ci 1328e1051a39Sopenharmony_ci add $a_ptr,sp,#$S 1329e1051a39Sopenharmony_ci add $b_ptr,sp,#$S 1330e1051a39Sopenharmony_ci add $r_ptr,sp,#$tmp0 1331e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); 1332e1051a39Sopenharmony_ci 1333e1051a39Sopenharmony_ci add $a_ptr,sp,#$Zsqr 1334e1051a39Sopenharmony_ci add $b_ptr,sp,#$M 1335e1051a39Sopenharmony_ci add $r_ptr,sp,#$M 1336e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); 1337e1051a39Sopenharmony_ci 1338e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*5] 1339e1051a39Sopenharmony_ci add $a_ptr,sp,#$tmp0 1340e1051a39Sopenharmony_ci add $r_ptr,$r_ptr,#32 1341e1051a39Sopenharmony_ci bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); 1342e1051a39Sopenharmony_ci 1343e1051a39Sopenharmony_ci add $a_ptr,sp,#$M 1344e1051a39Sopenharmony_ci add $r_ptr,sp,#$M 1345e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); 1346e1051a39Sopenharmony_ci 1347e1051a39Sopenharmony_ci add $a_ptr,sp,#$in_x 1348e1051a39Sopenharmony_ci add $b_ptr,sp,#$S 1349e1051a39Sopenharmony_ci add $r_ptr,sp,#$S 1350e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); 1351e1051a39Sopenharmony_ci 1352e1051a39Sopenharmony_ci add $r_ptr,sp,#$tmp0 1353e1051a39Sopenharmony_ci bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); 
1354e1051a39Sopenharmony_ci 1355e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*5] 1356e1051a39Sopenharmony_ci add $a_ptr,sp,#$M 1357e1051a39Sopenharmony_ci add $b_ptr,sp,#$M 1358e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); 1359e1051a39Sopenharmony_ci 1360e1051a39Sopenharmony_ci add $b_ptr,sp,#$tmp0 1361e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); 1362e1051a39Sopenharmony_ci 1363e1051a39Sopenharmony_ci add $b_ptr,sp,#$S 1364e1051a39Sopenharmony_ci add $r_ptr,sp,#$S 1365e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); 1366e1051a39Sopenharmony_ci 1367e1051a39Sopenharmony_ci add $a_ptr,sp,#$M 1368e1051a39Sopenharmony_ci add $b_ptr,sp,#$S 1369e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); 1370e1051a39Sopenharmony_ci 1371e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*5] 1372e1051a39Sopenharmony_ci add $b_ptr,$r_ptr,#32 1373e1051a39Sopenharmony_ci add $r_ptr,$r_ptr,#32 1374e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); 1375e1051a39Sopenharmony_ci 1376e1051a39Sopenharmony_ci add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" 1377e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__) 1378e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,pc} 1379e1051a39Sopenharmony_ci#else 1380e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,lr} 1381e1051a39Sopenharmony_ci bx lr @ interoperable with Thumb ISA:-) 1382e1051a39Sopenharmony_ci#endif 1383e1051a39Sopenharmony_ci.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 1384e1051a39Sopenharmony_ci___ 1385e1051a39Sopenharmony_ci} 1386e1051a39Sopenharmony_ci 1387e1051a39Sopenharmony_ci######################################################################## 1388e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 1389e1051a39Sopenharmony_ci# const P256_POINT *in2); 1390e1051a39Sopenharmony_ci{ 1391e1051a39Sopenharmony_cimy 
($res_x,$res_y,$res_z, 1392e1051a39Sopenharmony_ci $in1_x,$in1_y,$in1_z, 1393e1051a39Sopenharmony_ci $in2_x,$in2_y,$in2_z, 1394e1051a39Sopenharmony_ci $H,$Hsqr,$R,$Rsqr,$Hcub, 1395e1051a39Sopenharmony_ci $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 1396e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1397e1051a39Sopenharmony_ci# above map() describes stack layout with 18 temporary 1398e1051a39Sopenharmony_ci# 256-bit vectors on top. Then note that we push 1399e1051a39Sopenharmony_ci# starting from r0, which means that we have copy of 1400e1051a39Sopenharmony_ci# input arguments just below these temporary vectors. 1401e1051a39Sopenharmony_ci# We use three of them for ~in1infty, ~in2infty and 1402e1051a39Sopenharmony_ci# result of check for zero. 1403e1051a39Sopenharmony_ci 1404e1051a39Sopenharmony_ci$code.=<<___; 1405e1051a39Sopenharmony_ci.globl ecp_nistz256_point_add 1406e1051a39Sopenharmony_ci.type ecp_nistz256_point_add,%function 1407e1051a39Sopenharmony_ci.align 5 1408e1051a39Sopenharmony_ciecp_nistz256_point_add: 1409e1051a39Sopenharmony_ci stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1410e1051a39Sopenharmony_ci sub sp,sp,#32*18+16 1411e1051a39Sopenharmony_ci 1412e1051a39Sopenharmony_ci ldmia $b_ptr!,{r4-r11} @ copy in2_x 1413e1051a39Sopenharmony_ci add r3,sp,#$in2_x 1414e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1415e1051a39Sopenharmony_ci ldmia $b_ptr!,{r4-r11} @ copy in2_y 1416e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1417e1051a39Sopenharmony_ci ldmia $b_ptr,{r4-r11} @ copy in2_z 1418e1051a39Sopenharmony_ci orr r12,r4,r5 1419e1051a39Sopenharmony_ci orr r12,r12,r6 1420e1051a39Sopenharmony_ci orr r12,r12,r7 1421e1051a39Sopenharmony_ci orr r12,r12,r8 1422e1051a39Sopenharmony_ci orr r12,r12,r9 1423e1051a39Sopenharmony_ci orr r12,r12,r10 1424e1051a39Sopenharmony_ci orr r12,r12,r11 1425e1051a39Sopenharmony_ci cmp r12,#0 1426e1051a39Sopenharmony_ci#ifdef __thumb2__ 1427e1051a39Sopenharmony_ci it ne 1428e1051a39Sopenharmony_ci#endif 
1429e1051a39Sopenharmony_ci movne r12,#-1 1430e1051a39Sopenharmony_ci stmia r3,{r4-r11} 1431e1051a39Sopenharmony_ci str r12,[sp,#32*18+8] @ ~in2infty 1432e1051a39Sopenharmony_ci 1433e1051a39Sopenharmony_ci ldmia $a_ptr!,{r4-r11} @ copy in1_x 1434e1051a39Sopenharmony_ci add r3,sp,#$in1_x 1435e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1436e1051a39Sopenharmony_ci ldmia $a_ptr!,{r4-r11} @ copy in1_y 1437e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1438e1051a39Sopenharmony_ci ldmia $a_ptr,{r4-r11} @ copy in1_z 1439e1051a39Sopenharmony_ci orr r12,r4,r5 1440e1051a39Sopenharmony_ci orr r12,r12,r6 1441e1051a39Sopenharmony_ci orr r12,r12,r7 1442e1051a39Sopenharmony_ci orr r12,r12,r8 1443e1051a39Sopenharmony_ci orr r12,r12,r9 1444e1051a39Sopenharmony_ci orr r12,r12,r10 1445e1051a39Sopenharmony_ci orr r12,r12,r11 1446e1051a39Sopenharmony_ci cmp r12,#0 1447e1051a39Sopenharmony_ci#ifdef __thumb2__ 1448e1051a39Sopenharmony_ci it ne 1449e1051a39Sopenharmony_ci#endif 1450e1051a39Sopenharmony_ci movne r12,#-1 1451e1051a39Sopenharmony_ci stmia r3,{r4-r11} 1452e1051a39Sopenharmony_ci str r12,[sp,#32*18+4] @ ~in1infty 1453e1051a39Sopenharmony_ci 1454e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_z 1455e1051a39Sopenharmony_ci add $b_ptr,sp,#$in2_z 1456e1051a39Sopenharmony_ci add $r_ptr,sp,#$Z2sqr 1457e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); 1458e1051a39Sopenharmony_ci 1459e1051a39Sopenharmony_ci add $a_ptr,sp,#$in1_z 1460e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_z 1461e1051a39Sopenharmony_ci add $r_ptr,sp,#$Z1sqr 1462e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1463e1051a39Sopenharmony_ci 1464e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_z 1465e1051a39Sopenharmony_ci add $b_ptr,sp,#$Z2sqr 1466e1051a39Sopenharmony_ci add $r_ptr,sp,#$S1 1467e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); 1468e1051a39Sopenharmony_ci 1469e1051a39Sopenharmony_ci add $a_ptr,sp,#$in1_z 
1470e1051a39Sopenharmony_ci add $b_ptr,sp,#$Z1sqr 1471e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1472e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1473e1051a39Sopenharmony_ci 1474e1051a39Sopenharmony_ci add $a_ptr,sp,#$in1_y 1475e1051a39Sopenharmony_ci add $b_ptr,sp,#$S1 1476e1051a39Sopenharmony_ci add $r_ptr,sp,#$S1 1477e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); 1478e1051a39Sopenharmony_ci 1479e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_y 1480e1051a39Sopenharmony_ci add $b_ptr,sp,#$S2 1481e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1482e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1483e1051a39Sopenharmony_ci 1484e1051a39Sopenharmony_ci add $b_ptr,sp,#$S1 1485e1051a39Sopenharmony_ci add $r_ptr,sp,#$R 1486e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); 1487e1051a39Sopenharmony_ci 1488e1051a39Sopenharmony_ci orr $a0,$a0,$a1 @ see if result is zero 1489e1051a39Sopenharmony_ci orr $a2,$a2,$a3 1490e1051a39Sopenharmony_ci orr $a4,$a4,$a5 1491e1051a39Sopenharmony_ci orr $a0,$a0,$a2 1492e1051a39Sopenharmony_ci orr $a4,$a4,$a6 1493e1051a39Sopenharmony_ci orr $a0,$a0,$a7 1494e1051a39Sopenharmony_ci add $a_ptr,sp,#$in1_x 1495e1051a39Sopenharmony_ci orr $a0,$a0,$a4 1496e1051a39Sopenharmony_ci add $b_ptr,sp,#$Z2sqr 1497e1051a39Sopenharmony_ci str $a0,[sp,#32*18+12] 1498e1051a39Sopenharmony_ci 1499e1051a39Sopenharmony_ci add $r_ptr,sp,#$U1 1500e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); 1501e1051a39Sopenharmony_ci 1502e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_x 1503e1051a39Sopenharmony_ci add $b_ptr,sp,#$Z1sqr 1504e1051a39Sopenharmony_ci add $r_ptr,sp,#$U2 1505e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); 1506e1051a39Sopenharmony_ci 1507e1051a39Sopenharmony_ci add $b_ptr,sp,#$U1 1508e1051a39Sopenharmony_ci add $r_ptr,sp,#$H 1509e1051a39Sopenharmony_ci bl 
__ecp_nistz256_sub_from @ p256_sub(H, U2, U1); 1510e1051a39Sopenharmony_ci 1511e1051a39Sopenharmony_ci orr $a0,$a0,$a1 @ see if result is zero 1512e1051a39Sopenharmony_ci orr $a2,$a2,$a3 1513e1051a39Sopenharmony_ci orr $a4,$a4,$a5 1514e1051a39Sopenharmony_ci orr $a0,$a0,$a2 1515e1051a39Sopenharmony_ci orr $a4,$a4,$a6 1516e1051a39Sopenharmony_ci orr $a0,$a0,$a7 1517e1051a39Sopenharmony_ci orr $a0,$a0,$a4 @ ~is_equal(U1,U2) 1518e1051a39Sopenharmony_ci 1519e1051a39Sopenharmony_ci ldr $t0,[sp,#32*18+4] @ ~in1infty 1520e1051a39Sopenharmony_ci ldr $t1,[sp,#32*18+8] @ ~in2infty 1521e1051a39Sopenharmony_ci ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2) 1522e1051a39Sopenharmony_ci mvn $t0,$t0 @ -1/0 -> 0/-1 1523e1051a39Sopenharmony_ci mvn $t1,$t1 @ -1/0 -> 0/-1 1524e1051a39Sopenharmony_ci orr $a0,$a0,$t0 1525e1051a39Sopenharmony_ci orr $a0,$a0,$t1 1526e1051a39Sopenharmony_ci orrs $a0,$a0,$t2 @ set flags 1527e1051a39Sopenharmony_ci 1528e1051a39Sopenharmony_ci @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 1529e1051a39Sopenharmony_ci bne .Ladd_proceed 1530e1051a39Sopenharmony_ci 1531e1051a39Sopenharmony_ci.Ladd_double: 1532e1051a39Sopenharmony_ci ldr $a_ptr,[sp,#32*18+20] 1533e1051a39Sopenharmony_ci add sp,sp,#32*(18-5)+16 @ difference in frame sizes 1534e1051a39Sopenharmony_ci b .Lpoint_double_shortcut 1535e1051a39Sopenharmony_ci 1536e1051a39Sopenharmony_ci.align 4 1537e1051a39Sopenharmony_ci.Ladd_proceed: 1538e1051a39Sopenharmony_ci add $a_ptr,sp,#$R 1539e1051a39Sopenharmony_ci add $b_ptr,sp,#$R 1540e1051a39Sopenharmony_ci add $r_ptr,sp,#$Rsqr 1541e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1542e1051a39Sopenharmony_ci 1543e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 1544e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_z 1545e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_z 1546e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 1547e1051a39Sopenharmony_ci 1548e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 
1549e1051a39Sopenharmony_ci add $b_ptr,sp,#$H 1550e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hsqr 1551e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1552e1051a39Sopenharmony_ci 1553e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_z 1554e1051a39Sopenharmony_ci add $b_ptr,sp,#$res_z 1555e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_z 1556e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); 1557e1051a39Sopenharmony_ci 1558e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 1559e1051a39Sopenharmony_ci add $b_ptr,sp,#$Hsqr 1560e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hcub 1561e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1562e1051a39Sopenharmony_ci 1563e1051a39Sopenharmony_ci add $a_ptr,sp,#$Hsqr 1564e1051a39Sopenharmony_ci add $b_ptr,sp,#$U1 1565e1051a39Sopenharmony_ci add $r_ptr,sp,#$U2 1566e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); 1567e1051a39Sopenharmony_ci 1568e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hsqr 1569e1051a39Sopenharmony_ci bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1570e1051a39Sopenharmony_ci 1571e1051a39Sopenharmony_ci add $b_ptr,sp,#$Rsqr 1572e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_x 1573e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1574e1051a39Sopenharmony_ci 1575e1051a39Sopenharmony_ci add $b_ptr,sp,#$Hcub 1576e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); 1577e1051a39Sopenharmony_ci 1578e1051a39Sopenharmony_ci add $b_ptr,sp,#$U2 1579e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_y 1580e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1581e1051a39Sopenharmony_ci 1582e1051a39Sopenharmony_ci add $a_ptr,sp,#$Hcub 1583e1051a39Sopenharmony_ci add $b_ptr,sp,#$S1 1584e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1585e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); 1586e1051a39Sopenharmony_ci 
1587e1051a39Sopenharmony_ci add $a_ptr,sp,#$R 1588e1051a39Sopenharmony_ci add $b_ptr,sp,#$res_y 1589e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_y 1590e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1591e1051a39Sopenharmony_ci 1592e1051a39Sopenharmony_ci add $b_ptr,sp,#$S2 1593e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1594e1051a39Sopenharmony_ci 1595e1051a39Sopenharmony_ci ldr r11,[sp,#32*18+4] @ ~in1infty 1596e1051a39Sopenharmony_ci ldr r12,[sp,#32*18+8] @ ~in2infty 1597e1051a39Sopenharmony_ci add r1,sp,#$res_x 1598e1051a39Sopenharmony_ci add r2,sp,#$in2_x 1599e1051a39Sopenharmony_ci and r10,r11,r12 @ ~in1infty & ~in2infty 1600e1051a39Sopenharmony_ci mvn r11,r11 1601e1051a39Sopenharmony_ci add r3,sp,#$in1_x 1602e1051a39Sopenharmony_ci and r11,r11,r12 @ in1infty & ~in2infty 1603e1051a39Sopenharmony_ci mvn r12,r12 @ in2infty 1604e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*18+16] 1605e1051a39Sopenharmony_ci___ 1606e1051a39Sopenharmony_cifor($i=0;$i<96;$i+=8) { # conditional moves 1607e1051a39Sopenharmony_ci$code.=<<___; 1608e1051a39Sopenharmony_ci ldmia r1!,{r4-r5} @ res_x 1609e1051a39Sopenharmony_ci ldmia r2!,{r6-r7} @ in2_x 1610e1051a39Sopenharmony_ci ldmia r3!,{r8-r9} @ in1_x 1611e1051a39Sopenharmony_ci and r4,r4,r10 @ ~in1infty & ~in2infty 1612e1051a39Sopenharmony_ci and r5,r5,r10 1613e1051a39Sopenharmony_ci and r6,r6,r11 @ in1infty & ~in2infty 1614e1051a39Sopenharmony_ci and r7,r7,r11 1615e1051a39Sopenharmony_ci and r8,r8,r12 @ in2infty 1616e1051a39Sopenharmony_ci and r9,r9,r12 1617e1051a39Sopenharmony_ci orr r4,r4,r6 1618e1051a39Sopenharmony_ci orr r5,r5,r7 1619e1051a39Sopenharmony_ci orr r4,r4,r8 1620e1051a39Sopenharmony_ci orr r5,r5,r9 1621e1051a39Sopenharmony_ci stmia $r_ptr!,{r4-r5} 1622e1051a39Sopenharmony_ci___ 1623e1051a39Sopenharmony_ci} 1624e1051a39Sopenharmony_ci$code.=<<___; 1625e1051a39Sopenharmony_ci.Ladd_done: 1626e1051a39Sopenharmony_ci add sp,sp,#32*18+16+16 @ +16 means 
"skip even over saved r0-r3" 1627e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__) 1628e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,pc} 1629e1051a39Sopenharmony_ci#else 1630e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,lr} 1631e1051a39Sopenharmony_ci bx lr @ interoperable with Thumb ISA:-) 1632e1051a39Sopenharmony_ci#endif 1633e1051a39Sopenharmony_ci.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1634e1051a39Sopenharmony_ci___ 1635e1051a39Sopenharmony_ci} 1636e1051a39Sopenharmony_ci 1637e1051a39Sopenharmony_ci######################################################################## 1638e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1639e1051a39Sopenharmony_ci# const P256_POINT_AFFINE *in2); 1640e1051a39Sopenharmony_ci{ 1641e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z, 1642e1051a39Sopenharmony_ci $in1_x,$in1_y,$in1_z, 1643e1051a39Sopenharmony_ci $in2_x,$in2_y, 1644e1051a39Sopenharmony_ci $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1645e1051a39Sopenharmony_cimy $Z1sqr = $S2; 1646e1051a39Sopenharmony_ci# above map() describes stack layout with 18 temporary 1647e1051a39Sopenharmony_ci# 256-bit vectors on top. Then note that we push 1648e1051a39Sopenharmony_ci# starting from r0, which means that we have copy of 1649e1051a39Sopenharmony_ci# input arguments just below these temporary vectors. 1650e1051a39Sopenharmony_ci# We use two of them for ~in1infty, ~in2infty. 
1651e1051a39Sopenharmony_ci 1652e1051a39Sopenharmony_cimy @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1653e1051a39Sopenharmony_ci 1654e1051a39Sopenharmony_ci$code.=<<___; 1655e1051a39Sopenharmony_ci.globl ecp_nistz256_point_add_affine 1656e1051a39Sopenharmony_ci.type ecp_nistz256_point_add_affine,%function 1657e1051a39Sopenharmony_ci.align 5 1658e1051a39Sopenharmony_ciecp_nistz256_point_add_affine: 1659e1051a39Sopenharmony_ci stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1660e1051a39Sopenharmony_ci sub sp,sp,#32*15 1661e1051a39Sopenharmony_ci 1662e1051a39Sopenharmony_ci ldmia $a_ptr!,{r4-r11} @ copy in1_x 1663e1051a39Sopenharmony_ci add r3,sp,#$in1_x 1664e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1665e1051a39Sopenharmony_ci ldmia $a_ptr!,{r4-r11} @ copy in1_y 1666e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1667e1051a39Sopenharmony_ci ldmia $a_ptr,{r4-r11} @ copy in1_z 1668e1051a39Sopenharmony_ci orr r12,r4,r5 1669e1051a39Sopenharmony_ci orr r12,r12,r6 1670e1051a39Sopenharmony_ci orr r12,r12,r7 1671e1051a39Sopenharmony_ci orr r12,r12,r8 1672e1051a39Sopenharmony_ci orr r12,r12,r9 1673e1051a39Sopenharmony_ci orr r12,r12,r10 1674e1051a39Sopenharmony_ci orr r12,r12,r11 1675e1051a39Sopenharmony_ci cmp r12,#0 1676e1051a39Sopenharmony_ci#ifdef __thumb2__ 1677e1051a39Sopenharmony_ci it ne 1678e1051a39Sopenharmony_ci#endif 1679e1051a39Sopenharmony_ci movne r12,#-1 1680e1051a39Sopenharmony_ci stmia r3,{r4-r11} 1681e1051a39Sopenharmony_ci str r12,[sp,#32*15+4] @ ~in1infty 1682e1051a39Sopenharmony_ci 1683e1051a39Sopenharmony_ci ldmia $b_ptr!,{r4-r11} @ copy in2_x 1684e1051a39Sopenharmony_ci add r3,sp,#$in2_x 1685e1051a39Sopenharmony_ci orr r12,r4,r5 1686e1051a39Sopenharmony_ci orr r12,r12,r6 1687e1051a39Sopenharmony_ci orr r12,r12,r7 1688e1051a39Sopenharmony_ci orr r12,r12,r8 1689e1051a39Sopenharmony_ci orr r12,r12,r9 1690e1051a39Sopenharmony_ci orr r12,r12,r10 1691e1051a39Sopenharmony_ci orr r12,r12,r11 1692e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 
1693e1051a39Sopenharmony_ci ldmia $b_ptr!,{r4-r11} @ copy in2_y 1694e1051a39Sopenharmony_ci orr r12,r12,r4 1695e1051a39Sopenharmony_ci orr r12,r12,r5 1696e1051a39Sopenharmony_ci orr r12,r12,r6 1697e1051a39Sopenharmony_ci orr r12,r12,r7 1698e1051a39Sopenharmony_ci orr r12,r12,r8 1699e1051a39Sopenharmony_ci orr r12,r12,r9 1700e1051a39Sopenharmony_ci orr r12,r12,r10 1701e1051a39Sopenharmony_ci orr r12,r12,r11 1702e1051a39Sopenharmony_ci stmia r3!,{r4-r11} 1703e1051a39Sopenharmony_ci cmp r12,#0 1704e1051a39Sopenharmony_ci#ifdef __thumb2__ 1705e1051a39Sopenharmony_ci it ne 1706e1051a39Sopenharmony_ci#endif 1707e1051a39Sopenharmony_ci movne r12,#-1 1708e1051a39Sopenharmony_ci str r12,[sp,#32*15+8] @ ~in2infty 1709e1051a39Sopenharmony_ci 1710e1051a39Sopenharmony_ci add $a_ptr,sp,#$in1_z 1711e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_z 1712e1051a39Sopenharmony_ci add $r_ptr,sp,#$Z1sqr 1713e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1714e1051a39Sopenharmony_ci 1715e1051a39Sopenharmony_ci add $a_ptr,sp,#$Z1sqr 1716e1051a39Sopenharmony_ci add $b_ptr,sp,#$in2_x 1717e1051a39Sopenharmony_ci add $r_ptr,sp,#$U2 1718e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); 1719e1051a39Sopenharmony_ci 1720e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_x 1721e1051a39Sopenharmony_ci add $r_ptr,sp,#$H 1722e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); 1723e1051a39Sopenharmony_ci 1724e1051a39Sopenharmony_ci add $a_ptr,sp,#$Z1sqr 1725e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_z 1726e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1727e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1728e1051a39Sopenharmony_ci 1729e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 1730e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_z 1731e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_z 1732e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 
1733e1051a39Sopenharmony_ci 1734e1051a39Sopenharmony_ci add $a_ptr,sp,#$in2_y 1735e1051a39Sopenharmony_ci add $b_ptr,sp,#$S2 1736e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1737e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1738e1051a39Sopenharmony_ci 1739e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_y 1740e1051a39Sopenharmony_ci add $r_ptr,sp,#$R 1741e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); 1742e1051a39Sopenharmony_ci 1743e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 1744e1051a39Sopenharmony_ci add $b_ptr,sp,#$H 1745e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hsqr 1746e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1747e1051a39Sopenharmony_ci 1748e1051a39Sopenharmony_ci add $a_ptr,sp,#$R 1749e1051a39Sopenharmony_ci add $b_ptr,sp,#$R 1750e1051a39Sopenharmony_ci add $r_ptr,sp,#$Rsqr 1751e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1752e1051a39Sopenharmony_ci 1753e1051a39Sopenharmony_ci add $a_ptr,sp,#$H 1754e1051a39Sopenharmony_ci add $b_ptr,sp,#$Hsqr 1755e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hcub 1756e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1757e1051a39Sopenharmony_ci 1758e1051a39Sopenharmony_ci add $a_ptr,sp,#$Hsqr 1759e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_x 1760e1051a39Sopenharmony_ci add $r_ptr,sp,#$U2 1761e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); 1762e1051a39Sopenharmony_ci 1763e1051a39Sopenharmony_ci add $r_ptr,sp,#$Hsqr 1764e1051a39Sopenharmony_ci bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1765e1051a39Sopenharmony_ci 1766e1051a39Sopenharmony_ci add $b_ptr,sp,#$Rsqr 1767e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_x 1768e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1769e1051a39Sopenharmony_ci 1770e1051a39Sopenharmony_ci add $b_ptr,sp,#$Hcub 1771e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ 
p256_sub(res_x, res_x, Hcub); 1772e1051a39Sopenharmony_ci 1773e1051a39Sopenharmony_ci add $b_ptr,sp,#$U2 1774e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_y 1775e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1776e1051a39Sopenharmony_ci 1777e1051a39Sopenharmony_ci add $a_ptr,sp,#$Hcub 1778e1051a39Sopenharmony_ci add $b_ptr,sp,#$in1_y 1779e1051a39Sopenharmony_ci add $r_ptr,sp,#$S2 1780e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); 1781e1051a39Sopenharmony_ci 1782e1051a39Sopenharmony_ci add $a_ptr,sp,#$R 1783e1051a39Sopenharmony_ci add $b_ptr,sp,#$res_y 1784e1051a39Sopenharmony_ci add $r_ptr,sp,#$res_y 1785e1051a39Sopenharmony_ci bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1786e1051a39Sopenharmony_ci 1787e1051a39Sopenharmony_ci add $b_ptr,sp,#$S2 1788e1051a39Sopenharmony_ci bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1789e1051a39Sopenharmony_ci 1790e1051a39Sopenharmony_ci ldr r11,[sp,#32*15+4] @ ~in1infty 1791e1051a39Sopenharmony_ci ldr r12,[sp,#32*15+8] @ ~in2infty 1792e1051a39Sopenharmony_ci add r1,sp,#$res_x 1793e1051a39Sopenharmony_ci add r2,sp,#$in2_x 1794e1051a39Sopenharmony_ci and r10,r11,r12 @ ~in1infty & ~in2infty 1795e1051a39Sopenharmony_ci mvn r11,r11 1796e1051a39Sopenharmony_ci add r3,sp,#$in1_x 1797e1051a39Sopenharmony_ci and r11,r11,r12 @ in1infty & ~in2infty 1798e1051a39Sopenharmony_ci mvn r12,r12 @ in2infty 1799e1051a39Sopenharmony_ci ldr $r_ptr,[sp,#32*15] 1800e1051a39Sopenharmony_ci___ 1801e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=8) { # conditional moves 1802e1051a39Sopenharmony_ci$code.=<<___; 1803e1051a39Sopenharmony_ci ldmia r1!,{r4-r5} @ res_x 1804e1051a39Sopenharmony_ci ldmia r2!,{r6-r7} @ in2_x 1805e1051a39Sopenharmony_ci ldmia r3!,{r8-r9} @ in1_x 1806e1051a39Sopenharmony_ci and r4,r4,r10 @ ~in1infty & ~in2infty 1807e1051a39Sopenharmony_ci and r5,r5,r10 1808e1051a39Sopenharmony_ci and r6,r6,r11 @ in1infty & ~in2infty 1809e1051a39Sopenharmony_ci 
and r7,r7,r11 1810e1051a39Sopenharmony_ci and r8,r8,r12 @ in2infty 1811e1051a39Sopenharmony_ci and r9,r9,r12 1812e1051a39Sopenharmony_ci orr r4,r4,r6 1813e1051a39Sopenharmony_ci orr r5,r5,r7 1814e1051a39Sopenharmony_ci orr r4,r4,r8 1815e1051a39Sopenharmony_ci orr r5,r5,r9 1816e1051a39Sopenharmony_ci stmia $r_ptr!,{r4-r5} 1817e1051a39Sopenharmony_ci___ 1818e1051a39Sopenharmony_ci} 1819e1051a39Sopenharmony_cifor(;$i<96;$i+=8) { 1820e1051a39Sopenharmony_cimy $j=($i-64)/4; 1821e1051a39Sopenharmony_ci$code.=<<___; 1822e1051a39Sopenharmony_ci ldmia r1!,{r4-r5} @ res_z 1823e1051a39Sopenharmony_ci ldmia r3!,{r8-r9} @ in1_z 1824e1051a39Sopenharmony_ci and r4,r4,r10 1825e1051a39Sopenharmony_ci and r5,r5,r10 1826e1051a39Sopenharmony_ci and r6,r11,#@ONE_mont[$j] 1827e1051a39Sopenharmony_ci and r7,r11,#@ONE_mont[$j+1] 1828e1051a39Sopenharmony_ci and r8,r8,r12 1829e1051a39Sopenharmony_ci and r9,r9,r12 1830e1051a39Sopenharmony_ci orr r4,r4,r6 1831e1051a39Sopenharmony_ci orr r5,r5,r7 1832e1051a39Sopenharmony_ci orr r4,r4,r8 1833e1051a39Sopenharmony_ci orr r5,r5,r9 1834e1051a39Sopenharmony_ci stmia $r_ptr!,{r4-r5} 1835e1051a39Sopenharmony_ci___ 1836e1051a39Sopenharmony_ci} 1837e1051a39Sopenharmony_ci$code.=<<___; 1838e1051a39Sopenharmony_ci add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" 1839e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__) 1840e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,pc} 1841e1051a39Sopenharmony_ci#else 1842e1051a39Sopenharmony_ci ldmia sp!,{r4-r12,lr} 1843e1051a39Sopenharmony_ci bx lr @ interoperable with Thumb ISA:-) 1844e1051a39Sopenharmony_ci#endif 1845e1051a39Sopenharmony_ci.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1846e1051a39Sopenharmony_ci___ 1847e1051a39Sopenharmony_ci} }}} 1848e1051a39Sopenharmony_ci 1849e1051a39Sopenharmony_ciforeach (split("\n",$code)) { 1850e1051a39Sopenharmony_ci s/\`([^\`]*)\`/eval $1/geo; 1851e1051a39Sopenharmony_ci 1852e1051a39Sopenharmony_ci s/\bq([0-9]+)#(lo|hi)/sprintf 
"d%d",2*$1+($2 eq "hi")/geo; 1853e1051a39Sopenharmony_ci 1854e1051a39Sopenharmony_ci print $_,"\n"; 1855e1051a39Sopenharmony_ci} 1856e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; # enforce flush 1857