1e1051a39Sopenharmony_ci#! /usr/bin/env perl 2e1051a39Sopenharmony_ci# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci# 4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License"). You may not use 5e1051a39Sopenharmony_ci# this file except in compliance with the License. You can obtain a copy 6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at 7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html 8e1051a39Sopenharmony_ci 9e1051a39Sopenharmony_ci 10e1051a39Sopenharmony_ci# ==================================================================== 11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and 13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further 14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/. 15e1051a39Sopenharmony_ci# ==================================================================== 16e1051a39Sopenharmony_ci# 17e1051a39Sopenharmony_ci# ECP_NISTZ256 module for SPARCv9. 18e1051a39Sopenharmony_ci# 19e1051a39Sopenharmony_ci# February 2015. 20e1051a39Sopenharmony_ci# 21e1051a39Sopenharmony_ci# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 22e1051a39Sopenharmony_ci# http://eprint.iacr.org/2013/816. In the process of adaptation 23e1051a39Sopenharmony_ci# original .c module was made 32-bit savvy in order to make this 24e1051a39Sopenharmony_ci# implementation possible. 25e1051a39Sopenharmony_ci# 26e1051a39Sopenharmony_ci# with/without -DECP_NISTZ256_ASM 27e1051a39Sopenharmony_ci# UltraSPARC III +12-18% 28e1051a39Sopenharmony_ci# SPARC T4 +99-550% (+66-150% on 32-bit Solaris) 29e1051a39Sopenharmony_ci# 30e1051a39Sopenharmony_ci# Ranges denote minimum and maximum improvement coefficients depending 31e1051a39Sopenharmony_ci# on benchmark. 
Lower coefficients are for ECDSA sign, server-side 32e1051a39Sopenharmony_ci# operation. Keep in mind that +200% means 3x improvement. 33e1051a39Sopenharmony_ci 34e1051a39Sopenharmony_ci$output = pop and open STDOUT,">$output"; 35e1051a39Sopenharmony_ci 36e1051a39Sopenharmony_ci$code.=<<___; 37e1051a39Sopenharmony_ci#ifndef __ASSEMBLER__ 38e1051a39Sopenharmony_ci# define __ASSEMBLER__ 1 39e1051a39Sopenharmony_ci#endif 40e1051a39Sopenharmony_ci#include "crypto/sparc_arch.h" 41e1051a39Sopenharmony_ci 42e1051a39Sopenharmony_ci#define LOCALS (STACK_BIAS+STACK_FRAME) 43e1051a39Sopenharmony_ci#ifdef __arch64__ 44e1051a39Sopenharmony_ci.register %g2,#scratch 45e1051a39Sopenharmony_ci.register %g3,#scratch 46e1051a39Sopenharmony_ci# define STACK64_FRAME STACK_FRAME 47e1051a39Sopenharmony_ci# define LOCALS64 LOCALS 48e1051a39Sopenharmony_ci#else 49e1051a39Sopenharmony_ci# define STACK64_FRAME (2047+192) 50e1051a39Sopenharmony_ci# define LOCALS64 STACK64_FRAME 51e1051a39Sopenharmony_ci#endif 52e1051a39Sopenharmony_ci 53e1051a39Sopenharmony_ci.section ".text",#alloc,#execinstr 54e1051a39Sopenharmony_ci___ 55e1051a39Sopenharmony_ci######################################################################## 56e1051a39Sopenharmony_ci# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 57e1051a39Sopenharmony_ci# 58e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 59e1051a39Sopenharmony_ciopen TABLE,"<ecp_nistz256_table.c" or 60e1051a39Sopenharmony_ciopen TABLE,"<${dir}../ecp_nistz256_table.c" or 61e1051a39Sopenharmony_cidie "failed to open ecp_nistz256_table.c:",$!; 62e1051a39Sopenharmony_ci 63e1051a39Sopenharmony_ciuse integer; 64e1051a39Sopenharmony_ci 65e1051a39Sopenharmony_ciforeach(<TABLE>) { 66e1051a39Sopenharmony_ci s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; 67e1051a39Sopenharmony_ci} 68e1051a39Sopenharmony_ciclose TABLE; 69e1051a39Sopenharmony_ci 70e1051a39Sopenharmony_ci# See ecp_nistz256_table.c for 
explanation for why it's 64*16*37. 71e1051a39Sopenharmony_ci# 64*16*37-1 is because $#arr returns last valid index or @arr, not 72e1051a39Sopenharmony_ci# amount of elements. 73e1051a39Sopenharmony_cidie "insane number of elements" if ($#arr != 64*16*37-1); 74e1051a39Sopenharmony_ci 75e1051a39Sopenharmony_ci$code.=<<___; 76e1051a39Sopenharmony_ci.globl ecp_nistz256_precomputed 77e1051a39Sopenharmony_ci.align 4096 78e1051a39Sopenharmony_ciecp_nistz256_precomputed: 79e1051a39Sopenharmony_ci___ 80e1051a39Sopenharmony_ci######################################################################## 81e1051a39Sopenharmony_ci# this conversion smashes P256_POINT_AFFINE by individual bytes with 82e1051a39Sopenharmony_ci# 64 byte interval, similar to 83e1051a39Sopenharmony_ci# 1111222233334444 84e1051a39Sopenharmony_ci# 1234123412341234 85e1051a39Sopenharmony_cifor(1..37) { 86e1051a39Sopenharmony_ci @tbl = splice(@arr,0,64*16); 87e1051a39Sopenharmony_ci for($i=0;$i<64;$i++) { 88e1051a39Sopenharmony_ci undef @line; 89e1051a39Sopenharmony_ci for($j=0;$j<64;$j++) { 90e1051a39Sopenharmony_ci push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; 91e1051a39Sopenharmony_ci } 92e1051a39Sopenharmony_ci $code.=".byte\t"; 93e1051a39Sopenharmony_ci $code.=join(',',map { sprintf "0x%02x",$_} @line); 94e1051a39Sopenharmony_ci $code.="\n"; 95e1051a39Sopenharmony_ci } 96e1051a39Sopenharmony_ci} 97e1051a39Sopenharmony_ci 98e1051a39Sopenharmony_ci{{{ 99e1051a39Sopenharmony_cimy ($rp,$ap,$bp)=map("%i$_",(0..2)); 100e1051a39Sopenharmony_cimy @acc=map("%l$_",(0..7)); 101e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); 102e1051a39Sopenharmony_cimy ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); 103e1051a39Sopenharmony_cimy ($rp_real,$ap_real)=("%g2","%g3"); 104e1051a39Sopenharmony_ci 105e1051a39Sopenharmony_ci$code.=<<___; 106e1051a39Sopenharmony_ci.type ecp_nistz256_precomputed,#object 107e1051a39Sopenharmony_ci.size 
ecp_nistz256_precomputed,.-ecp_nistz256_precomputed 108e1051a39Sopenharmony_ci.align 64 109e1051a39Sopenharmony_ci.LRR: ! 2^512 mod P precomputed for NIST P256 polynomial 110e1051a39Sopenharmony_ci.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb 111e1051a39Sopenharmony_ci.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 112e1051a39Sopenharmony_ci.Lone: 113e1051a39Sopenharmony_ci.long 1,0,0,0,0,0,0,0 114e1051a39Sopenharmony_ci.asciz "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 115e1051a39Sopenharmony_ci 116e1051a39Sopenharmony_ci! void ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 117e1051a39Sopenharmony_ci.globl ecp_nistz256_to_mont 118e1051a39Sopenharmony_ci.align 64 119e1051a39Sopenharmony_ciecp_nistz256_to_mont: 120e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 121e1051a39Sopenharmony_ci nop 122e1051a39Sopenharmony_ci1: call .+8 123e1051a39Sopenharmony_ci add %o7,.LRR-1b,$bp 124e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont 125e1051a39Sopenharmony_ci nop 126e1051a39Sopenharmony_ci ret 127e1051a39Sopenharmony_ci restore 128e1051a39Sopenharmony_ci.type ecp_nistz256_to_mont,#function 129e1051a39Sopenharmony_ci.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 130e1051a39Sopenharmony_ci 131e1051a39Sopenharmony_ci! 
void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 132e1051a39Sopenharmony_ci.globl ecp_nistz256_from_mont 133e1051a39Sopenharmony_ci.align 32 134e1051a39Sopenharmony_ciecp_nistz256_from_mont: 135e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 136e1051a39Sopenharmony_ci nop 137e1051a39Sopenharmony_ci1: call .+8 138e1051a39Sopenharmony_ci add %o7,.Lone-1b,$bp 139e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont 140e1051a39Sopenharmony_ci nop 141e1051a39Sopenharmony_ci ret 142e1051a39Sopenharmony_ci restore 143e1051a39Sopenharmony_ci.type ecp_nistz256_from_mont,#function 144e1051a39Sopenharmony_ci.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 145e1051a39Sopenharmony_ci 146e1051a39Sopenharmony_ci! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], 147e1051a39Sopenharmony_ci! const BN_ULONG %i2[8]); 148e1051a39Sopenharmony_ci.globl ecp_nistz256_mul_mont 149e1051a39Sopenharmony_ci.align 32 150e1051a39Sopenharmony_ciecp_nistz256_mul_mont: 151e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 152e1051a39Sopenharmony_ci nop 153e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont 154e1051a39Sopenharmony_ci nop 155e1051a39Sopenharmony_ci ret 156e1051a39Sopenharmony_ci restore 157e1051a39Sopenharmony_ci.type ecp_nistz256_mul_mont,#function 158e1051a39Sopenharmony_ci.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 159e1051a39Sopenharmony_ci 160e1051a39Sopenharmony_ci! 
void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); 161e1051a39Sopenharmony_ci.globl ecp_nistz256_sqr_mont 162e1051a39Sopenharmony_ci.align 32 163e1051a39Sopenharmony_ciecp_nistz256_sqr_mont: 164e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 165e1051a39Sopenharmony_ci mov $ap,$bp 166e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont 167e1051a39Sopenharmony_ci nop 168e1051a39Sopenharmony_ci ret 169e1051a39Sopenharmony_ci restore 170e1051a39Sopenharmony_ci.type ecp_nistz256_sqr_mont,#function 171e1051a39Sopenharmony_ci.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 172e1051a39Sopenharmony_ci___ 173e1051a39Sopenharmony_ci 174e1051a39Sopenharmony_ci######################################################################## 175e1051a39Sopenharmony_ci# Special thing to keep in mind is that $t0-$t7 hold 64-bit values, 176e1051a39Sopenharmony_ci# while all others are meant to keep 32. "Meant to" means that additions 177e1051a39Sopenharmony_ci# to @acc[0-7] do "contaminate" upper bits, but they are cleared before 178e1051a39Sopenharmony_ci# they can affect outcome (follow 'and' with $mask). Also keep in mind 179e1051a39Sopenharmony_ci# that addition with carry is addition with 32-bit carry, even though 180e1051a39Sopenharmony_ci# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see 181e1051a39Sopenharmony_ci# below for VIS3 code paths.] 182e1051a39Sopenharmony_ci 183e1051a39Sopenharmony_ci$code.=<<___; 184e1051a39Sopenharmony_ci.align 32 185e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont: 186e1051a39Sopenharmony_ci ld [$bp+0],$bi ! b[0] 187e1051a39Sopenharmony_ci mov -1,$mask 188e1051a39Sopenharmony_ci ld [$ap+0],$a0 189e1051a39Sopenharmony_ci srl $mask,0,$mask ! 
0xffffffff 190e1051a39Sopenharmony_ci ld [$ap+4],$t1 191e1051a39Sopenharmony_ci ld [$ap+8],$t2 192e1051a39Sopenharmony_ci ld [$ap+12],$t3 193e1051a39Sopenharmony_ci ld [$ap+16],$t4 194e1051a39Sopenharmony_ci ld [$ap+20],$t5 195e1051a39Sopenharmony_ci ld [$ap+24],$t6 196e1051a39Sopenharmony_ci ld [$ap+28],$t7 197e1051a39Sopenharmony_ci mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results 198e1051a39Sopenharmony_ci mulx $t1,$bi,$t1 199e1051a39Sopenharmony_ci mulx $t2,$bi,$t2 200e1051a39Sopenharmony_ci mulx $t3,$bi,$t3 201e1051a39Sopenharmony_ci mulx $t4,$bi,$t4 202e1051a39Sopenharmony_ci mulx $t5,$bi,$t5 203e1051a39Sopenharmony_ci mulx $t6,$bi,$t6 204e1051a39Sopenharmony_ci mulx $t7,$bi,$t7 205e1051a39Sopenharmony_ci srlx $t0,32,@acc[1] ! extract high parts 206e1051a39Sopenharmony_ci srlx $t1,32,@acc[2] 207e1051a39Sopenharmony_ci srlx $t2,32,@acc[3] 208e1051a39Sopenharmony_ci srlx $t3,32,@acc[4] 209e1051a39Sopenharmony_ci srlx $t4,32,@acc[5] 210e1051a39Sopenharmony_ci srlx $t5,32,@acc[6] 211e1051a39Sopenharmony_ci srlx $t6,32,@acc[7] 212e1051a39Sopenharmony_ci srlx $t7,32,@acc[0] ! "@acc[8]" 213e1051a39Sopenharmony_ci mov 0,$carry 214e1051a39Sopenharmony_ci___ 215e1051a39Sopenharmony_cifor($i=1;$i<8;$i++) { 216e1051a39Sopenharmony_ci$code.=<<___; 217e1051a39Sopenharmony_ci addcc @acc[1],$t1,@acc[1] ! accumulate high parts 218e1051a39Sopenharmony_ci ld [$bp+4*$i],$bi ! b[$i] 219e1051a39Sopenharmony_ci ld [$ap+4],$t1 ! 
re-load a[1-7] 220e1051a39Sopenharmony_ci addccc @acc[2],$t2,@acc[2] 221e1051a39Sopenharmony_ci addccc @acc[3],$t3,@acc[3] 222e1051a39Sopenharmony_ci ld [$ap+8],$t2 223e1051a39Sopenharmony_ci ld [$ap+12],$t3 224e1051a39Sopenharmony_ci addccc @acc[4],$t4,@acc[4] 225e1051a39Sopenharmony_ci addccc @acc[5],$t5,@acc[5] 226e1051a39Sopenharmony_ci ld [$ap+16],$t4 227e1051a39Sopenharmony_ci ld [$ap+20],$t5 228e1051a39Sopenharmony_ci addccc @acc[6],$t6,@acc[6] 229e1051a39Sopenharmony_ci addccc @acc[7],$t7,@acc[7] 230e1051a39Sopenharmony_ci ld [$ap+24],$t6 231e1051a39Sopenharmony_ci ld [$ap+28],$t7 232e1051a39Sopenharmony_ci addccc @acc[0],$carry,@acc[0] ! "@acc[8]" 233e1051a39Sopenharmony_ci addc %g0,%g0,$carry 234e1051a39Sopenharmony_ci___ 235e1051a39Sopenharmony_ci # Reduction iteration is normally performed by accumulating 236e1051a39Sopenharmony_ci # result of multiplication of modulus by "magic" digit [and 237e1051a39Sopenharmony_ci # omitting least significant word, which is guaranteed to 238e1051a39Sopenharmony_ci # be 0], but thanks to special form of modulus and "magic" 239e1051a39Sopenharmony_ci # digit being equal to least significant word, it can be 240e1051a39Sopenharmony_ci # performed with additions and subtractions alone. 
Indeed: 241e1051a39Sopenharmony_ci # 242e1051a39Sopenharmony_ci # ffff.0001.0000.0000.0000.ffff.ffff.ffff 243e1051a39Sopenharmony_ci # * abcd 244e1051a39Sopenharmony_ci # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 245e1051a39Sopenharmony_ci # 246e1051a39Sopenharmony_ci # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 247e1051a39Sopenharmony_ci # rewrite above as: 248e1051a39Sopenharmony_ci # 249e1051a39Sopenharmony_ci # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 250e1051a39Sopenharmony_ci # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 251e1051a39Sopenharmony_ci # - abcd.0000.0000.0000.0000.0000.0000.abcd 252e1051a39Sopenharmony_ci # 253e1051a39Sopenharmony_ci # or marking redundant operations: 254e1051a39Sopenharmony_ci # 255e1051a39Sopenharmony_ci # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 256e1051a39Sopenharmony_ci # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 257e1051a39Sopenharmony_ci # - abcd.----.----.----.----.----.----.---- 258e1051a39Sopenharmony_ci 259e1051a39Sopenharmony_ci$code.=<<___; 260e1051a39Sopenharmony_ci ! multiplication-less reduction 261e1051a39Sopenharmony_ci addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] 262e1051a39Sopenharmony_ci addccc @acc[4],%g0,@acc[4] ! r[4]+=0 263e1051a39Sopenharmony_ci and @acc[1],$mask,@acc[1] 264e1051a39Sopenharmony_ci and @acc[2],$mask,@acc[2] 265e1051a39Sopenharmony_ci addccc @acc[5],%g0,@acc[5] ! r[5]+=0 266e1051a39Sopenharmony_ci addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] 267e1051a39Sopenharmony_ci and @acc[3],$mask,@acc[3] 268e1051a39Sopenharmony_ci and @acc[4],$mask,@acc[4] 269e1051a39Sopenharmony_ci addccc @acc[7],%g0,@acc[7] ! r[7]+=0 270e1051a39Sopenharmony_ci addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" 271e1051a39Sopenharmony_ci and @acc[5],$mask,@acc[5] 272e1051a39Sopenharmony_ci and @acc[6],$mask,@acc[6] 273e1051a39Sopenharmony_ci addc $carry,%g0,$carry ! top-most carry 274e1051a39Sopenharmony_ci subcc @acc[7],$t0,@acc[7] ! 
r[7]-=r[0] 275e1051a39Sopenharmony_ci subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" 276e1051a39Sopenharmony_ci subc $carry,%g0,$carry ! top-most carry 277e1051a39Sopenharmony_ci and @acc[7],$mask,@acc[7] 278e1051a39Sopenharmony_ci and @acc[0],$mask,@acc[0] ! "@acc[8]" 279e1051a39Sopenharmony_ci___ 280e1051a39Sopenharmony_ci push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] 281e1051a39Sopenharmony_ci$code.=<<___; 282e1051a39Sopenharmony_ci mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results 283e1051a39Sopenharmony_ci mulx $t1,$bi,$t1 284e1051a39Sopenharmony_ci mulx $t2,$bi,$t2 285e1051a39Sopenharmony_ci mulx $t3,$bi,$t3 286e1051a39Sopenharmony_ci mulx $t4,$bi,$t4 287e1051a39Sopenharmony_ci mulx $t5,$bi,$t5 288e1051a39Sopenharmony_ci mulx $t6,$bi,$t6 289e1051a39Sopenharmony_ci mulx $t7,$bi,$t7 290e1051a39Sopenharmony_ci add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow 291e1051a39Sopenharmony_ci add @acc[1],$t1,$t1 292e1051a39Sopenharmony_ci srlx $t0,32,@acc[1] ! extract high parts 293e1051a39Sopenharmony_ci add @acc[2],$t2,$t2 294e1051a39Sopenharmony_ci srlx $t1,32,@acc[2] 295e1051a39Sopenharmony_ci add @acc[3],$t3,$t3 296e1051a39Sopenharmony_ci srlx $t2,32,@acc[3] 297e1051a39Sopenharmony_ci add @acc[4],$t4,$t4 298e1051a39Sopenharmony_ci srlx $t3,32,@acc[4] 299e1051a39Sopenharmony_ci add @acc[5],$t5,$t5 300e1051a39Sopenharmony_ci srlx $t4,32,@acc[5] 301e1051a39Sopenharmony_ci add @acc[6],$t6,$t6 302e1051a39Sopenharmony_ci srlx $t5,32,@acc[6] 303e1051a39Sopenharmony_ci add @acc[7],$t7,$t7 304e1051a39Sopenharmony_ci srlx $t6,32,@acc[7] 305e1051a39Sopenharmony_ci srlx $t7,32,@acc[0] ! "@acc[8]" 306e1051a39Sopenharmony_ci___ 307e1051a39Sopenharmony_ci} 308e1051a39Sopenharmony_ci$code.=<<___; 309e1051a39Sopenharmony_ci addcc @acc[1],$t1,@acc[1] ! 
accumulate high parts 310e1051a39Sopenharmony_ci addccc @acc[2],$t2,@acc[2] 311e1051a39Sopenharmony_ci addccc @acc[3],$t3,@acc[3] 312e1051a39Sopenharmony_ci addccc @acc[4],$t4,@acc[4] 313e1051a39Sopenharmony_ci addccc @acc[5],$t5,@acc[5] 314e1051a39Sopenharmony_ci addccc @acc[6],$t6,@acc[6] 315e1051a39Sopenharmony_ci addccc @acc[7],$t7,@acc[7] 316e1051a39Sopenharmony_ci addccc @acc[0],$carry,@acc[0] ! "@acc[8]" 317e1051a39Sopenharmony_ci addc %g0,%g0,$carry 318e1051a39Sopenharmony_ci 319e1051a39Sopenharmony_ci addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction 320e1051a39Sopenharmony_ci addccc @acc[4],%g0,@acc[4] 321e1051a39Sopenharmony_ci addccc @acc[5],%g0,@acc[5] 322e1051a39Sopenharmony_ci addccc @acc[6],$t0,@acc[6] 323e1051a39Sopenharmony_ci addccc @acc[7],%g0,@acc[7] 324e1051a39Sopenharmony_ci addccc @acc[0],$t0,@acc[0] ! "@acc[8]" 325e1051a39Sopenharmony_ci addc $carry,%g0,$carry 326e1051a39Sopenharmony_ci subcc @acc[7],$t0,@acc[7] 327e1051a39Sopenharmony_ci subccc @acc[0],%g0,@acc[0] ! "@acc[8]" 328e1051a39Sopenharmony_ci subc $carry,%g0,$carry ! top-most carry 329e1051a39Sopenharmony_ci___ 330e1051a39Sopenharmony_ci push(@acc,shift(@acc)); # rotate registers to omit acc[0] 331e1051a39Sopenharmony_ci$code.=<<___; 332e1051a39Sopenharmony_ci ! Final step is "if result > mod, subtract mod", but we do it 333e1051a39Sopenharmony_ci ! "other way around", namely subtract modulus from result 334e1051a39Sopenharmony_ci ! and if it borrowed, add modulus back. 335e1051a39Sopenharmony_ci 336e1051a39Sopenharmony_ci subcc @acc[0],-1,@acc[0] ! 
subtract modulus 337e1051a39Sopenharmony_ci subccc @acc[1],-1,@acc[1] 338e1051a39Sopenharmony_ci subccc @acc[2],-1,@acc[2] 339e1051a39Sopenharmony_ci subccc @acc[3],0,@acc[3] 340e1051a39Sopenharmony_ci subccc @acc[4],0,@acc[4] 341e1051a39Sopenharmony_ci subccc @acc[5],0,@acc[5] 342e1051a39Sopenharmony_ci subccc @acc[6],1,@acc[6] 343e1051a39Sopenharmony_ci subccc @acc[7],-1,@acc[7] 344e1051a39Sopenharmony_ci subc $carry,0,$carry ! broadcast borrow bit 345e1051a39Sopenharmony_ci 346e1051a39Sopenharmony_ci ! Note that because mod has special form, i.e. consists of 347e1051a39Sopenharmony_ci ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by 348e1051a39Sopenharmony_ci ! using value of broadcasted borrow and the borrow bit itself. 349e1051a39Sopenharmony_ci ! To minimize dependency chain we first broadcast and then 350e1051a39Sopenharmony_ci ! extract the bit by negating (follow $bi). 351e1051a39Sopenharmony_ci 352e1051a39Sopenharmony_ci addcc @acc[0],$carry,@acc[0] ! add modulus or zero 353e1051a39Sopenharmony_ci addccc @acc[1],$carry,@acc[1] 354e1051a39Sopenharmony_ci neg $carry,$bi 355e1051a39Sopenharmony_ci st @acc[0],[$rp] 356e1051a39Sopenharmony_ci addccc @acc[2],$carry,@acc[2] 357e1051a39Sopenharmony_ci st @acc[1],[$rp+4] 358e1051a39Sopenharmony_ci addccc @acc[3],0,@acc[3] 359e1051a39Sopenharmony_ci st @acc[2],[$rp+8] 360e1051a39Sopenharmony_ci addccc @acc[4],0,@acc[4] 361e1051a39Sopenharmony_ci st @acc[3],[$rp+12] 362e1051a39Sopenharmony_ci addccc @acc[5],0,@acc[5] 363e1051a39Sopenharmony_ci st @acc[4],[$rp+16] 364e1051a39Sopenharmony_ci addccc @acc[6],$bi,@acc[6] 365e1051a39Sopenharmony_ci st @acc[5],[$rp+20] 366e1051a39Sopenharmony_ci addc @acc[7],$carry,@acc[7] 367e1051a39Sopenharmony_ci st @acc[6],[$rp+24] 368e1051a39Sopenharmony_ci retl 369e1051a39Sopenharmony_ci st @acc[7],[$rp+28] 370e1051a39Sopenharmony_ci.type __ecp_nistz256_mul_mont,#function 371e1051a39Sopenharmony_ci.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 
372e1051a39Sopenharmony_ci 373e1051a39Sopenharmony_ci! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], 374e1051a39Sopenharmony_ci! const BN_ULONG %i2[8]); 375e1051a39Sopenharmony_ci.globl ecp_nistz256_add 376e1051a39Sopenharmony_ci.align 32 377e1051a39Sopenharmony_ciecp_nistz256_add: 378e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 379e1051a39Sopenharmony_ci ld [$ap],@acc[0] 380e1051a39Sopenharmony_ci ld [$ap+4],@acc[1] 381e1051a39Sopenharmony_ci ld [$ap+8],@acc[2] 382e1051a39Sopenharmony_ci ld [$ap+12],@acc[3] 383e1051a39Sopenharmony_ci ld [$ap+16],@acc[4] 384e1051a39Sopenharmony_ci ld [$ap+20],@acc[5] 385e1051a39Sopenharmony_ci ld [$ap+24],@acc[6] 386e1051a39Sopenharmony_ci call __ecp_nistz256_add 387e1051a39Sopenharmony_ci ld [$ap+28],@acc[7] 388e1051a39Sopenharmony_ci ret 389e1051a39Sopenharmony_ci restore 390e1051a39Sopenharmony_ci.type ecp_nistz256_add,#function 391e1051a39Sopenharmony_ci.size ecp_nistz256_add,.-ecp_nistz256_add 392e1051a39Sopenharmony_ci 393e1051a39Sopenharmony_ci.align 32 394e1051a39Sopenharmony_ci__ecp_nistz256_add: 395e1051a39Sopenharmony_ci ld [$bp+0],$t0 ! 
b[0] 396e1051a39Sopenharmony_ci ld [$bp+4],$t1 397e1051a39Sopenharmony_ci ld [$bp+8],$t2 398e1051a39Sopenharmony_ci ld [$bp+12],$t3 399e1051a39Sopenharmony_ci addcc @acc[0],$t0,@acc[0] 400e1051a39Sopenharmony_ci ld [$bp+16],$t4 401e1051a39Sopenharmony_ci ld [$bp+20],$t5 402e1051a39Sopenharmony_ci addccc @acc[1],$t1,@acc[1] 403e1051a39Sopenharmony_ci ld [$bp+24],$t6 404e1051a39Sopenharmony_ci ld [$bp+28],$t7 405e1051a39Sopenharmony_ci addccc @acc[2],$t2,@acc[2] 406e1051a39Sopenharmony_ci addccc @acc[3],$t3,@acc[3] 407e1051a39Sopenharmony_ci addccc @acc[4],$t4,@acc[4] 408e1051a39Sopenharmony_ci addccc @acc[5],$t5,@acc[5] 409e1051a39Sopenharmony_ci addccc @acc[6],$t6,@acc[6] 410e1051a39Sopenharmony_ci addccc @acc[7],$t7,@acc[7] 411e1051a39Sopenharmony_ci addc %g0,%g0,$carry 412e1051a39Sopenharmony_ci 413e1051a39Sopenharmony_ci.Lreduce_by_sub: 414e1051a39Sopenharmony_ci 415e1051a39Sopenharmony_ci ! if a+b >= modulus, subtract modulus. 416e1051a39Sopenharmony_ci ! 417e1051a39Sopenharmony_ci ! But since comparison implies subtraction, we subtract 418e1051a39Sopenharmony_ci ! modulus and then add it back if subtraction borrowed. 419e1051a39Sopenharmony_ci 420e1051a39Sopenharmony_ci subcc @acc[0],-1,@acc[0] 421e1051a39Sopenharmony_ci subccc @acc[1],-1,@acc[1] 422e1051a39Sopenharmony_ci subccc @acc[2],-1,@acc[2] 423e1051a39Sopenharmony_ci subccc @acc[3], 0,@acc[3] 424e1051a39Sopenharmony_ci subccc @acc[4], 0,@acc[4] 425e1051a39Sopenharmony_ci subccc @acc[5], 0,@acc[5] 426e1051a39Sopenharmony_ci subccc @acc[6], 1,@acc[6] 427e1051a39Sopenharmony_ci subccc @acc[7],-1,@acc[7] 428e1051a39Sopenharmony_ci subc $carry,0,$carry 429e1051a39Sopenharmony_ci 430e1051a39Sopenharmony_ci ! Note that because mod has special form, i.e. consists of 431e1051a39Sopenharmony_ci ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by 432e1051a39Sopenharmony_ci ! using value of borrow and its negative. 433e1051a39Sopenharmony_ci 434e1051a39Sopenharmony_ci addcc @acc[0],$carry,@acc[0] ! 
add synthesized modulus 435e1051a39Sopenharmony_ci addccc @acc[1],$carry,@acc[1] 436e1051a39Sopenharmony_ci neg $carry,$bi 437e1051a39Sopenharmony_ci st @acc[0],[$rp] 438e1051a39Sopenharmony_ci addccc @acc[2],$carry,@acc[2] 439e1051a39Sopenharmony_ci st @acc[1],[$rp+4] 440e1051a39Sopenharmony_ci addccc @acc[3],0,@acc[3] 441e1051a39Sopenharmony_ci st @acc[2],[$rp+8] 442e1051a39Sopenharmony_ci addccc @acc[4],0,@acc[4] 443e1051a39Sopenharmony_ci st @acc[3],[$rp+12] 444e1051a39Sopenharmony_ci addccc @acc[5],0,@acc[5] 445e1051a39Sopenharmony_ci st @acc[4],[$rp+16] 446e1051a39Sopenharmony_ci addccc @acc[6],$bi,@acc[6] 447e1051a39Sopenharmony_ci st @acc[5],[$rp+20] 448e1051a39Sopenharmony_ci addc @acc[7],$carry,@acc[7] 449e1051a39Sopenharmony_ci st @acc[6],[$rp+24] 450e1051a39Sopenharmony_ci retl 451e1051a39Sopenharmony_ci st @acc[7],[$rp+28] 452e1051a39Sopenharmony_ci.type __ecp_nistz256_add,#function 453e1051a39Sopenharmony_ci.size __ecp_nistz256_add,.-__ecp_nistz256_add 454e1051a39Sopenharmony_ci 455e1051a39Sopenharmony_ci! 
void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 456e1051a39Sopenharmony_ci.globl ecp_nistz256_mul_by_2 457e1051a39Sopenharmony_ci.align 32 458e1051a39Sopenharmony_ciecp_nistz256_mul_by_2: 459e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 460e1051a39Sopenharmony_ci ld [$ap],@acc[0] 461e1051a39Sopenharmony_ci ld [$ap+4],@acc[1] 462e1051a39Sopenharmony_ci ld [$ap+8],@acc[2] 463e1051a39Sopenharmony_ci ld [$ap+12],@acc[3] 464e1051a39Sopenharmony_ci ld [$ap+16],@acc[4] 465e1051a39Sopenharmony_ci ld [$ap+20],@acc[5] 466e1051a39Sopenharmony_ci ld [$ap+24],@acc[6] 467e1051a39Sopenharmony_ci call __ecp_nistz256_mul_by_2 468e1051a39Sopenharmony_ci ld [$ap+28],@acc[7] 469e1051a39Sopenharmony_ci ret 470e1051a39Sopenharmony_ci restore 471e1051a39Sopenharmony_ci.type ecp_nistz256_mul_by_2,#function 472e1051a39Sopenharmony_ci.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 473e1051a39Sopenharmony_ci 474e1051a39Sopenharmony_ci.align 32 475e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_2: 476e1051a39Sopenharmony_ci addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a 477e1051a39Sopenharmony_ci addccc @acc[1],@acc[1],@acc[1] 478e1051a39Sopenharmony_ci addccc @acc[2],@acc[2],@acc[2] 479e1051a39Sopenharmony_ci addccc @acc[3],@acc[3],@acc[3] 480e1051a39Sopenharmony_ci addccc @acc[4],@acc[4],@acc[4] 481e1051a39Sopenharmony_ci addccc @acc[5],@acc[5],@acc[5] 482e1051a39Sopenharmony_ci addccc @acc[6],@acc[6],@acc[6] 483e1051a39Sopenharmony_ci addccc @acc[7],@acc[7],@acc[7] 484e1051a39Sopenharmony_ci b .Lreduce_by_sub 485e1051a39Sopenharmony_ci addc %g0,%g0,$carry 486e1051a39Sopenharmony_ci.type __ecp_nistz256_mul_by_2,#function 487e1051a39Sopenharmony_ci.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 488e1051a39Sopenharmony_ci 489e1051a39Sopenharmony_ci! 
void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); 490e1051a39Sopenharmony_ci.globl ecp_nistz256_mul_by_3 491e1051a39Sopenharmony_ci.align 32 492e1051a39Sopenharmony_ciecp_nistz256_mul_by_3: 493e1051a39Sopenharmony_ci save %sp,-STACK_FRAME,%sp 494e1051a39Sopenharmony_ci ld [$ap],@acc[0] 495e1051a39Sopenharmony_ci ld [$ap+4],@acc[1] 496e1051a39Sopenharmony_ci ld [$ap+8],@acc[2] 497e1051a39Sopenharmony_ci ld [$ap+12],@acc[3] 498e1051a39Sopenharmony_ci ld [$ap+16],@acc[4] 499e1051a39Sopenharmony_ci ld [$ap+20],@acc[5] 500e1051a39Sopenharmony_ci ld [$ap+24],@acc[6] 501e1051a39Sopenharmony_ci call __ecp_nistz256_mul_by_3 502e1051a39Sopenharmony_ci ld [$ap+28],@acc[7] 503e1051a39Sopenharmony_ci ret 504e1051a39Sopenharmony_ci restore 505e1051a39Sopenharmony_ci.type ecp_nistz256_mul_by_3,#function 506e1051a39Sopenharmony_ci.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 507e1051a39Sopenharmony_ci 508e1051a39Sopenharmony_ci.align 32 509e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_3: 510e1051a39Sopenharmony_ci addcc @acc[0],@acc[0],$t0 ! a+a=2*a 511e1051a39Sopenharmony_ci addccc @acc[1],@acc[1],$t1 512e1051a39Sopenharmony_ci addccc @acc[2],@acc[2],$t2 513e1051a39Sopenharmony_ci addccc @acc[3],@acc[3],$t3 514e1051a39Sopenharmony_ci addccc @acc[4],@acc[4],$t4 515e1051a39Sopenharmony_ci addccc @acc[5],@acc[5],$t5 516e1051a39Sopenharmony_ci addccc @acc[6],@acc[6],$t6 517e1051a39Sopenharmony_ci addccc @acc[7],@acc[7],$t7 518e1051a39Sopenharmony_ci addc %g0,%g0,$carry 519e1051a39Sopenharmony_ci 520e1051a39Sopenharmony_ci subcc $t0,-1,$t0 ! 
.Lreduce_by_sub but without stores 521e1051a39Sopenharmony_ci subccc $t1,-1,$t1 522e1051a39Sopenharmony_ci subccc $t2,-1,$t2 523e1051a39Sopenharmony_ci subccc $t3, 0,$t3 524e1051a39Sopenharmony_ci subccc $t4, 0,$t4 525e1051a39Sopenharmony_ci subccc $t5, 0,$t5 526e1051a39Sopenharmony_ci subccc $t6, 1,$t6 527e1051a39Sopenharmony_ci subccc $t7,-1,$t7 528e1051a39Sopenharmony_ci subc $carry,0,$carry 529e1051a39Sopenharmony_ci 530e1051a39Sopenharmony_ci addcc $t0,$carry,$t0 ! add synthesized modulus 531e1051a39Sopenharmony_ci addccc $t1,$carry,$t1 532e1051a39Sopenharmony_ci neg $carry,$bi 533e1051a39Sopenharmony_ci addccc $t2,$carry,$t2 534e1051a39Sopenharmony_ci addccc $t3,0,$t3 535e1051a39Sopenharmony_ci addccc $t4,0,$t4 536e1051a39Sopenharmony_ci addccc $t5,0,$t5 537e1051a39Sopenharmony_ci addccc $t6,$bi,$t6 538e1051a39Sopenharmony_ci addc $t7,$carry,$t7 539e1051a39Sopenharmony_ci 540e1051a39Sopenharmony_ci addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a 541e1051a39Sopenharmony_ci addccc $t1,@acc[1],@acc[1] 542e1051a39Sopenharmony_ci addccc $t2,@acc[2],@acc[2] 543e1051a39Sopenharmony_ci addccc $t3,@acc[3],@acc[3] 544e1051a39Sopenharmony_ci addccc $t4,@acc[4],@acc[4] 545e1051a39Sopenharmony_ci addccc $t5,@acc[5],@acc[5] 546e1051a39Sopenharmony_ci addccc $t6,@acc[6],@acc[6] 547e1051a39Sopenharmony_ci addccc $t7,@acc[7],@acc[7] 548e1051a39Sopenharmony_ci b .Lreduce_by_sub 549e1051a39Sopenharmony_ci addc %g0,%g0,$carry 550e1051a39Sopenharmony_ci.type __ecp_nistz256_mul_by_3,#function 551e1051a39Sopenharmony_ci.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 552e1051a39Sopenharmony_ci 553e1051a39Sopenharmony_ci! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8], 554e1051a39Sopenharmony_ci! 
!                               const BN_ULONG %i2[8]);
!
! Exported wrapper: loads the first operand into the accumulator
! registers and lets __ecp_nistz256_sub_from subtract the second
! operand and store the reduced difference.
.globl  ecp_nistz256_sub
.align  32
ecp_nistz256_sub:
        save    %sp,-STACK_FRAME,%sp
        ld      [$ap],@acc[0]
        ld      [$ap+4],@acc[1]
        ld      [$ap+8],@acc[2]
        ld      [$ap+12],@acc[3]
        ld      [$ap+16],@acc[4]
        ld      [$ap+20],@acc[5]
        ld      [$ap+24],@acc[6]
        call    __ecp_nistz256_sub_from
        ld      [$ap+28],@acc[7]
        ret
        restore
.type   ecp_nistz256_sub,#function
.size   ecp_nistz256_sub,.-ecp_nistz256_sub

! void  ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
!
! Computes 0 - a: the accumulator is zeroed and the input pointer is
! handed over as the subtrahend of __ecp_nistz256_sub_from.
.globl  ecp_nistz256_neg
.align  32
ecp_nistz256_neg:
        save    %sp,-STACK_FRAME,%sp
        mov     $ap,$bp
        mov     0,@acc[0]
        mov     0,@acc[1]
        mov     0,@acc[2]
        mov     0,@acc[3]
        mov     0,@acc[4]
        mov     0,@acc[5]
        mov     0,@acc[6]
        call    __ecp_nistz256_sub_from
        mov     0,@acc[7]
        ret
        restore
.type   ecp_nistz256_neg,#function
.size   ecp_nistz256_neg,.-ecp_nistz256_neg

! Internal helper: accumulator minus the eight words read through the
! second source pointer. Loads are interleaved with the borrow chain.
! Falls through into .Lreduce_by_add, which stores the result.
.align  32
__ecp_nistz256_sub_from:
        ld      [$bp+0],$t0             ! b[0]
        ld      [$bp+4],$t1
        ld      [$bp+8],$t2
        ld      [$bp+12],$t3
        subcc   @acc[0],$t0,@acc[0]
        ld      [$bp+16],$t4
        ld      [$bp+20],$t5
        subccc  @acc[1],$t1,@acc[1]
        subccc  @acc[2],$t2,@acc[2]
        ld      [$bp+24],$t6
        ld      [$bp+28],$t7
        subccc  @acc[3],$t3,@acc[3]
        subccc  @acc[4],$t4,@acc[4]
        subccc  @acc[5],$t5,@acc[5]
        subccc  @acc[6],$t6,@acc[6]
        subccc  @acc[7],$t7,@acc[7]
        subc    %g0,%g0,$carry          ! broadcast borrow bit

.Lreduce_by_add:

        ! if a-b borrows, add modulus.
        !
        ! Note that because mod has special form, i.e. consists of
        ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
        ! using value of broadcasted borrow and the borrow bit itself.
        ! To minimize dependency chain we first broadcast and then
        ! extract the bit by negating (follow $bi).

        addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
        addccc  @acc[1],$carry,@acc[1]
        neg     $carry,$bi
        st      @acc[0],[$rp]
        addccc  @acc[2],$carry,@acc[2]
        st      @acc[1],[$rp+4]
        addccc  @acc[3],0,@acc[3]
        st      @acc[2],[$rp+8]
        addccc  @acc[4],0,@acc[4]
        st      @acc[3],[$rp+12]
        addccc  @acc[5],0,@acc[5]
        st      @acc[4],[$rp+16]
        addccc  @acc[6],$bi,@acc[6]
        st      @acc[5],[$rp+20]
        addc    @acc[7],$carry,@acc[7]
        st      @acc[6],[$rp+24]
        retl
        st      @acc[7],[$rp+28]
.type   __ecp_nistz256_sub_from,#function
.size   __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

! Internal helper: same as __ecp_nistz256_sub_from with the operands
! swapped, i.e. loaded words minus accumulator. Shares the tail at
! .Lreduce_by_add; the borrow is broadcast in the branch delay slot.
.align  32
__ecp_nistz256_sub_morf:
        ld      [$bp+0],$t0             ! b[0]
        ld      [$bp+4],$t1
        ld      [$bp+8],$t2
        ld      [$bp+12],$t3
        subcc   $t0,@acc[0],@acc[0]
        ld      [$bp+16],$t4
        ld      [$bp+20],$t5
        subccc  $t1,@acc[1],@acc[1]
        subccc  $t2,@acc[2],@acc[2]
        ld      [$bp+24],$t6
        ld      [$bp+28],$t7
        subccc  $t3,@acc[3],@acc[3]
        subccc  $t4,@acc[4],@acc[4]
        subccc  $t5,@acc[5],@acc[5]
        subccc  $t6,@acc[6],@acc[6]
        subccc  $t7,@acc[7],@acc[7]
        b       .Lreduce_by_add
        subc    %g0,%g0,$carry          ! broadcast borrow bit
.type   __ecp_nistz256_sub_morf,#function
.size   __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

! void  ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
!
! Exported wrapper: loads the operand and calls the internal routine,
! which also stores the result.
.globl  ecp_nistz256_div_by_2
.align  32
ecp_nistz256_div_by_2:
        save    %sp,-STACK_FRAME,%sp
        ld      [$ap],@acc[0]
        ld      [$ap+4],@acc[1]
        ld      [$ap+8],@acc[2]
        ld      [$ap+12],@acc[3]
        ld      [$ap+16],@acc[4]
        ld      [$ap+20],@acc[5]
        ld      [$ap+24],@acc[6]
        call    __ecp_nistz256_div_by_2
        ld      [$ap+28],@acc[7]
        ret
        restore
.type   ecp_nistz256_div_by_2,#function
.size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

! Internal helper: the low bit of the operand selects whether the
! modulus is added (mask and bit are derived from it without a
! branch); the carry out of that addition becomes the top bit of the
! shifted result.
.align  32
__ecp_nistz256_div_by_2:
        ! ret = (a is odd ? a+mod : a) >> 1

        and     @acc[0],1,$bi
        neg     $bi,$carry
        addcc   @acc[0],$carry,@acc[0]
        addccc  @acc[1],$carry,@acc[1]
        addccc  @acc[2],$carry,@acc[2]
        addccc  @acc[3],0,@acc[3]
        addccc  @acc[4],0,@acc[4]
        addccc  @acc[5],0,@acc[5]
        addccc  @acc[6],$bi,@acc[6]
        addccc  @acc[7],$carry,@acc[7]
        addc    %g0,%g0,$carry

        ! ret >>= 1

        srl     @acc[0],1,@acc[0]
        sll     @acc[1],31,$t0
        srl     @acc[1],1,@acc[1]
        or      @acc[0],$t0,@acc[0]
        sll     @acc[2],31,$t1
        srl     @acc[2],1,@acc[2]
        or      @acc[1],$t1,@acc[1]
        sll     @acc[3],31,$t2
        st      @acc[0],[$rp]
        srl     @acc[3],1,@acc[3]
        or      @acc[2],$t2,@acc[2]
        sll     @acc[4],31,$t3
        st      @acc[1],[$rp+4]
        srl     @acc[4],1,@acc[4]
        or      @acc[3],$t3,@acc[3]
        sll     @acc[5],31,$t4
        st      @acc[2],[$rp+8]
        srl     @acc[5],1,@acc[5]
        or      @acc[4],$t4,@acc[4]
        sll     @acc[6],31,$t5
        st      @acc[3],[$rp+12]
        srl     @acc[6],1,@acc[6]
        or      @acc[5],$t5,@acc[5]
        sll     @acc[7],31,$t6
        st      @acc[4],[$rp+16]
        srl     @acc[7],1,@acc[7]
        or      @acc[6],$t6,@acc[6]
        sll     $carry,31,$t7
        st      @acc[5],[$rp+20]
        or      @acc[7],$t7,@acc[7]
        st      @acc[6],[$rp+24]
        retl
        st      @acc[7],[$rp+28]
.type   __ecp_nistz256_div_by_2,#function
.size   __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
# The entry code tests OPENSSL_sparcv9cap_P[0] and jumps to
# ecp_nistz256_point_double_vis3 when both SPARCV9_VIS3 and
# SPARCV9_64BIT_STACK are set; otherwise the 32-bit helper sequence
# below runs. Each call is annotated with the C statement it mirrors,
# and the instruction after every call executes in its delay slot.
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.

$code.=<<___;
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl  ecp_nistz256_point_double
.align  32
ecp_nistz256_point_double:
        SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
        ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
        and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
        cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
        be      ecp_nistz256_point_double_vis3
        nop

        save    %sp,-STACK_FRAME-32*4,%sp

        mov     $rp,$rp_real
        mov     $ap,$ap_real

        ! also entered from .Ladd_double in ecp_nistz256_point_add
.Lpoint_double_shortcut:
        ld      [$ap+32],@acc[0]
        ld      [$ap+32+4],@acc[1]
        ld      [$ap+32+8],@acc[2]
        ld      [$ap+32+12],@acc[3]
        ld      [$ap+32+16],@acc[4]
        ld      [$ap+32+20],@acc[5]
        ld      [$ap+32+24],@acc[6]
        ld      [$ap+32+28],@acc[7]
        call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
        add     %sp,LOCALS+$S,$rp

        add     $ap_real,64,$bp
        add     $ap_real,64,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
        add     %sp,LOCALS+$Zsqr,$rp

        add     $ap_real,0,$bp
        call    __ecp_nistz256_add      ! p256_add(M, Zsqr, in_x);
        add     %sp,LOCALS+$M,$rp

        add     %sp,LOCALS+$S,$bp
        add     %sp,LOCALS+$S,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
        add     %sp,LOCALS+$S,$rp

        ld      [$ap_real],@acc[0]
        add     %sp,LOCALS+$Zsqr,$bp
        ld      [$ap_real+4],@acc[1]
        ld      [$ap_real+8],@acc[2]
        ld      [$ap_real+12],@acc[3]
        ld      [$ap_real+16],@acc[4]
        ld      [$ap_real+20],@acc[5]
        ld      [$ap_real+24],@acc[6]
        ld      [$ap_real+28],@acc[7]
        call    __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
        add     %sp,LOCALS+$Zsqr,$rp

        add     $ap_real,32,$bp
        add     $ap_real,64,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
        add     %sp,LOCALS+$tmp0,$rp

        call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
        add     $rp_real,64,$rp

        add     %sp,LOCALS+$Zsqr,$bp
        add     %sp,LOCALS+$M,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
        add     %sp,LOCALS+$M,$rp

        call    __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
        add     %sp,LOCALS+$M,$rp

        add     %sp,LOCALS+$S,$bp
        add     %sp,LOCALS+$S,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
        add     %sp,LOCALS+$tmp0,$rp

        call    __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
        add     $rp_real,32,$rp

        add     $ap_real,0,$bp
        add     %sp,LOCALS+$S,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
        add     %sp,LOCALS+$S,$rp

        call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
        add     %sp,LOCALS+$tmp0,$rp

        add     %sp,LOCALS+$M,$bp
        add     %sp,LOCALS+$M,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
        add     $rp_real,0,$rp

        add     %sp,LOCALS+$tmp0,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
        add     $rp_real,0,$rp

        add     %sp,LOCALS+$S,$bp
        call    __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
        add     %sp,LOCALS+$S,$rp

        add     %sp,LOCALS+$M,$bp
        add     %sp,LOCALS+$S,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
        add     %sp,LOCALS+$S,$rp

        add     $rp_real,32,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
        add     $rp_real,32,$rp

        ret
        restore
.type   ecp_nistz256_point_double,#function
.size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#                             const P256_POINT *in2);
#
# Same capability dispatch as point_double (to the _vis3 variant).
# The non-VIS3 path keeps four values just below %fp+STACK_BIAS:
#   -8  : the caller's result pointer (64-bit store)
#   -12 : !in2infty mask
#   -16 : !in1infty mask
#   -20 : OR of all words of R = S2 - S1 (zero iff S1 == S2)
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

# above map() describes stack layout with 12 temporary
# 256-bit vectors on top. Then we reserve some space for
# !in1infty, !in2infty, result of check for zero and return pointer.

my $bp_real=$rp_real;

$code.=<<___;
.globl  ecp_nistz256_point_add
.align  32
ecp_nistz256_point_add:
        SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
        ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
        and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
        cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
        be      ecp_nistz256_point_add_vis3
        nop

        save    %sp,-STACK_FRAME-32*12-32,%sp

        stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
        mov     $ap,$ap_real
        mov     $bp,$bp_real

        ld      [$bp+64],$t0            ! in2_z
        ld      [$bp+64+4],$t1
        ld      [$bp+64+8],$t2
        ld      [$bp+64+12],$t3
        ld      [$bp+64+16],$t4
        ld      [$bp+64+20],$t5
        ld      [$bp+64+24],$t6
        ld      [$bp+64+28],$t7
        or      $t1,$t0,$t0
        or      $t3,$t2,$t2
        or      $t5,$t4,$t4
        or      $t7,$t6,$t6
        or      $t2,$t0,$t0
        or      $t6,$t4,$t4
        or      $t4,$t0,$t0             ! !in2infty
        movrnz  $t0,-1,$t0
        st      $t0,[%fp+STACK_BIAS-12]

        ld      [$ap+64],$t0            ! in1_z
        ld      [$ap+64+4],$t1
        ld      [$ap+64+8],$t2
        ld      [$ap+64+12],$t3
        ld      [$ap+64+16],$t4
        ld      [$ap+64+20],$t5
        ld      [$ap+64+24],$t6
        ld      [$ap+64+28],$t7
        or      $t1,$t0,$t0
        or      $t3,$t2,$t2
        or      $t5,$t4,$t4
        or      $t7,$t6,$t6
        or      $t2,$t0,$t0
        or      $t6,$t4,$t4
        or      $t4,$t0,$t0             ! !in1infty
        movrnz  $t0,-1,$t0
        st      $t0,[%fp+STACK_BIAS-16]

        add     $bp_real,64,$bp
        add     $bp_real,64,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
        add     %sp,LOCALS+$Z2sqr,$rp

        add     $ap_real,64,$bp
        add     $ap_real,64,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
        add     %sp,LOCALS+$Z1sqr,$rp

        add     $bp_real,64,$bp
        add     %sp,LOCALS+$Z2sqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
        add     %sp,LOCALS+$S1,$rp

        add     $ap_real,64,$bp
        add     %sp,LOCALS+$Z1sqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
        add     %sp,LOCALS+$S2,$rp

        add     $ap_real,32,$bp
        add     %sp,LOCALS+$S1,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
        add     %sp,LOCALS+$S1,$rp

        add     $bp_real,32,$bp
        add     %sp,LOCALS+$S2,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
        add     %sp,LOCALS+$S2,$rp

        add     %sp,LOCALS+$S1,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
        add     %sp,LOCALS+$R,$rp

        or      @acc[1],@acc[0],@acc[0] ! see if result is zero
        or      @acc[3],@acc[2],@acc[2]
        or      @acc[5],@acc[4],@acc[4]
        or      @acc[7],@acc[6],@acc[6]
        or      @acc[2],@acc[0],@acc[0]
        or      @acc[6],@acc[4],@acc[4]
        or      @acc[4],@acc[0],@acc[0]
        st      @acc[0],[%fp+STACK_BIAS-20]

        add     $ap_real,0,$bp
        add     %sp,LOCALS+$Z2sqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
        add     %sp,LOCALS+$U1,$rp

        add     $bp_real,0,$bp
        add     %sp,LOCALS+$Z1sqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
        add     %sp,LOCALS+$U2,$rp

        add     %sp,LOCALS+$U1,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
        add     %sp,LOCALS+$H,$rp

        or      @acc[1],@acc[0],@acc[0] ! see if result is zero
        or      @acc[3],@acc[2],@acc[2]
        or      @acc[5],@acc[4],@acc[4]
        or      @acc[7],@acc[6],@acc[6]
        or      @acc[2],@acc[0],@acc[0]
        or      @acc[6],@acc[4],@acc[4]
        orcc    @acc[4],@acc[0],@acc[0]

        bne,pt  %icc,.Ladd_proceed      ! is_equal(U1,U2)?
        nop

        ld      [%fp+STACK_BIAS-12],$t0
        ld      [%fp+STACK_BIAS-16],$t1
        ld      [%fp+STACK_BIAS-20],$t2
        andcc   $t0,$t1,%g0
        be,pt   %icc,.Ladd_proceed      ! (in1infty || in2infty)?
        nop
        andcc   $t2,$t2,%g0
        be,pt   %icc,.Ladd_double       ! is_equal(S1,S2)?
        nop

        ! U1 equals U2 and S1 differs from S2 with neither input at
        ! infinity: store 96 zero bytes as the result and return.
        ldx     [%fp+STACK_BIAS-8],$rp
        st      %g0,[$rp]
        st      %g0,[$rp+4]
        st      %g0,[$rp+8]
        st      %g0,[$rp+12]
        st      %g0,[$rp+16]
        st      %g0,[$rp+20]
        st      %g0,[$rp+24]
        st      %g0,[$rp+28]
        st      %g0,[$rp+32]
        st      %g0,[$rp+32+4]
        st      %g0,[$rp+32+8]
        st      %g0,[$rp+32+12]
        st      %g0,[$rp+32+16]
        st      %g0,[$rp+32+20]
        st      %g0,[$rp+32+24]
        st      %g0,[$rp+32+28]
        st      %g0,[$rp+64]
        st      %g0,[$rp+64+4]
        st      %g0,[$rp+64+8]
        st      %g0,[$rp+64+12]
        st      %g0,[$rp+64+16]
        st      %g0,[$rp+64+20]
        st      %g0,[$rp+64+24]
        st      %g0,[$rp+64+28]
        b       .Ladd_done
        nop

        ! Both inputs are the same point: reuse the doubling code with
        ! in1 as its input after shrinking this frame to its size.
.align  16
.Ladd_double:
        ldx     [%fp+STACK_BIAS-8],$rp_real
        mov     $ap_real,$ap
        b       .Lpoint_double_shortcut
        add     %sp,32*(12-4)+32,%sp    ! difference in frame sizes

.align  16
.Ladd_proceed:
        add     %sp,LOCALS+$R,$bp
        add     %sp,LOCALS+$R,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
        add     %sp,LOCALS+$Rsqr,$rp

        add     $ap_real,64,$bp
        add     %sp,LOCALS+$H,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
        add     %sp,LOCALS+$res_z,$rp

        add     %sp,LOCALS+$H,$bp
        add     %sp,LOCALS+$H,$ap
        call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
        add     %sp,LOCALS+$Hsqr,$rp

        add     $bp_real,64,$bp
        add     %sp,LOCALS+$res_z,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
        add     %sp,LOCALS+$res_z,$rp

        add     %sp,LOCALS+$H,$bp
        add     %sp,LOCALS+$Hsqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
        add     %sp,LOCALS+$Hcub,$rp

        add     %sp,LOCALS+$U1,$bp
        add     %sp,LOCALS+$Hsqr,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
        add     %sp,LOCALS+$U2,$rp

        call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
        add     %sp,LOCALS+$Hsqr,$rp

        add     %sp,LOCALS+$Rsqr,$bp
        call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
        add     %sp,LOCALS+$res_x,$rp

        add     %sp,LOCALS+$Hcub,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub);
        add     %sp,LOCALS+$res_x,$rp

        add     %sp,LOCALS+$U2,$bp
        call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
        add     %sp,LOCALS+$res_y,$rp

        add     %sp,LOCALS+$Hcub,$bp
        add     %sp,LOCALS+$S1,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
        add     %sp,LOCALS+$S2,$rp

        add     %sp,LOCALS+$R,$bp
        add     %sp,LOCALS+$res_y,$ap
        call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
        add     %sp,LOCALS+$res_y,$rp

        add     %sp,LOCALS+$S2,$bp
        call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
        add     %sp,LOCALS+$res_y,$rp

        ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
        ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
        ldx     [%fp+STACK_BIAS-8],$rp
___
# Emit the output selection 8 bytes at a time over all 96 bytes of the
# point: start from the computed result, replace it with in2 when the
# !in1infty mask is zero, then with in1 when the !in2infty mask is zero.
for($i=0;$i<96;$i+=8) {                 # conditional moves
$code.=<<___;
        ld      [%sp+LOCALS+$i],@acc[0]         ! res
        ld      [%sp+LOCALS+$i+4],@acc[1]
        ld      [$bp_real+$i],@acc[2]           ! in2
        ld      [$bp_real+$i+4],@acc[3]
        ld      [$ap_real+$i],@acc[4]           ! in1
        ld      [$ap_real+$i+4],@acc[5]
        movrz   $t1,@acc[2],@acc[0]
        movrz   $t1,@acc[3],@acc[1]
        movrz   $t2,@acc[4],@acc[0]
        movrz   $t2,@acc[5],@acc[1]
        st      @acc[0],[$rp+$i]
        st      @acc[1],[$rp+$i+4]
___
}
$code.=<<___;
.Ladd_done:
        ret
        restore
.type   ecp_nistz256_point_add,#function
.size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#                                    const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my
$Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top. Then we reserve some space for
# !in1infty, !in2infty, result of check for zero and return pointer.

# 32-bit words written to bytes 64..95 of the result (where in1 keeps
# in1_z) when in1 is the point at infinity. Presumably this is 1 in
# Montgomery form, least significant word first, as the name suggests.
my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
# $bp_real reuses the register named by $rp_real; the result pointer
# itself is off-loaded to the stack in the code below.
my $bp_real=$rp_real;

# ecp_nistz256_point_add_affine, generic code path.
#
# The entry sequence loads OPENSSL_sparcv9cap_P[0] and branches to
# ecp_nistz256_point_add_affine_vis3 when both SPARCV9_VIS3 and
# SPARCV9_64BIT_STACK are set. Otherwise a frame with room for the ten
# 256-bit temporaries plus 32 bytes is opened; of those 32 bytes the
# code uses [%fp+STACK_BIAS-8] for the saved result pointer,
# [%fp+STACK_BIAS-16] for !in1infty and [%fp+STACK_BIAS-12] for
# !in2infty.
#
# !in1infty is the OR of all eight words of in1_z, !in2infty the OR of
# all sixteen words of in2_x and in2_y; movrnz turns any non-zero value
# into -1. Every call below has the store of the destination pointer in
# its delay slot.
$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	32
ecp_nistz256_point_add_affine:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
	be	ecp_nistz256_point_add_affine_vis3
	nop

	save	%sp,-STACK_FRAME-32*10-32,%sp

	stx	$rp,[%fp+STACK_BIAS-8]	! off-load $rp
	mov	$ap,$ap_real
	mov	$bp,$bp_real

	ld	[$ap+64],$t0		! in1_z
	ld	[$ap+64+4],$t1
	ld	[$ap+64+8],$t2
	ld	[$ap+64+12],$t3
	ld	[$ap+64+16],$t4
	ld	[$ap+64+20],$t5
	ld	[$ap+64+24],$t6
	ld	[$ap+64+28],$t7
	or	$t1,$t0,$t0
	or	$t3,$t2,$t2
	or	$t5,$t4,$t4
	or	$t7,$t6,$t6
	or	$t2,$t0,$t0
	or	$t6,$t4,$t4
	or	$t4,$t0,$t0		! !in1infty
	movrnz	$t0,-1,$t0
	st	$t0,[%fp+STACK_BIAS-16]

	ld	[$bp],@acc[0]		! in2_x
	ld	[$bp+4],@acc[1]
	ld	[$bp+8],@acc[2]
	ld	[$bp+12],@acc[3]
	ld	[$bp+16],@acc[4]
	ld	[$bp+20],@acc[5]
	ld	[$bp+24],@acc[6]
	ld	[$bp+28],@acc[7]
	ld	[$bp+32],$t0		! in2_y
	ld	[$bp+32+4],$t1
	ld	[$bp+32+8],$t2
	ld	[$bp+32+12],$t3
	ld	[$bp+32+16],$t4
	ld	[$bp+32+20],$t5
	ld	[$bp+32+24],$t6
	ld	[$bp+32+28],$t7
	or	@acc[1],@acc[0],@acc[0]
	or	@acc[3],@acc[2],@acc[2]
	or	@acc[5],@acc[4],@acc[4]
	or	@acc[7],@acc[6],@acc[6]
	or	@acc[2],@acc[0],@acc[0]
	or	@acc[6],@acc[4],@acc[4]
	or	@acc[4],@acc[0],@acc[0]
	or	$t1,$t0,$t0
	or	$t3,$t2,$t2
	or	$t5,$t4,$t4
	or	$t7,$t6,$t6
	or	$t2,$t0,$t0
	or	$t6,$t4,$t4
	or	$t4,$t0,$t0
	or	@acc[0],$t0,$t0		! !in2infty
	movrnz	$t0,-1,$t0
	st	$t0,[%fp+STACK_BIAS-12]

	add	$ap_real,64,$bp
	add	$ap_real,64,$ap
	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z1sqr, in1_z);
	add	%sp,LOCALS+$Z1sqr,$rp

	add	$bp_real,0,$bp
	add	%sp,LOCALS+$Z1sqr,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, Z1sqr, in2_x);
	add	%sp,LOCALS+$U2,$rp

	add	$ap_real,0,$bp
	call	__ecp_nistz256_sub_from	! p256_sub(H, U2, in1_x);
	add	%sp,LOCALS+$H,$rp

	add	$ap_real,64,$bp
	add	%sp,LOCALS+$Z1sqr,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, Z1sqr, in1_z);
	add	%sp,LOCALS+$S2,$rp

	add	$ap_real,64,$bp
	add	%sp,LOCALS+$H,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, H, in1_z);
	add	%sp,LOCALS+$res_z,$rp

	add	$bp_real,32,$bp
	add	%sp,LOCALS+$S2,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S2, in2_y);
	add	%sp,LOCALS+$S2,$rp

	add	$ap_real,32,$bp
	call	__ecp_nistz256_sub_from	! p256_sub(R, S2, in1_y);
	add	%sp,LOCALS+$R,$rp

	add	%sp,LOCALS+$H,$bp
	add	%sp,LOCALS+$H,$ap
	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Hsqr, H);
	add	%sp,LOCALS+$Hsqr,$rp

	add	%sp,LOCALS+$R,$bp
	add	%sp,LOCALS+$R,$ap
	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Rsqr, R);
	add	%sp,LOCALS+$Rsqr,$rp

	add	%sp,LOCALS+$H,$bp
	add	%sp,LOCALS+$Hsqr,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(Hcub, Hsqr, H);
	add	%sp,LOCALS+$Hcub,$rp

	add	$ap_real,0,$bp
	add	%sp,LOCALS+$Hsqr,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, in1_x, Hsqr);
	add	%sp,LOCALS+$U2,$rp

	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(Hsqr, U2);
	add	%sp,LOCALS+$Hsqr,$rp

	add	%sp,LOCALS+$Rsqr,$bp
	call	__ecp_nistz256_sub_morf	! p256_sub(res_x, Rsqr, Hsqr);
	add	%sp,LOCALS+$res_x,$rp

	add	%sp,LOCALS+$Hcub,$bp
	call	__ecp_nistz256_sub_from	! p256_sub(res_x, res_x, Hcub);
	add	%sp,LOCALS+$res_x,$rp

	add	%sp,LOCALS+$U2,$bp
	call	__ecp_nistz256_sub_morf	! p256_sub(res_y, U2, res_x);
	add	%sp,LOCALS+$res_y,$rp

	add	$ap_real,32,$bp
	add	%sp,LOCALS+$Hcub,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, in1_y, Hcub);
	add	%sp,LOCALS+$S2,$rp

	add	%sp,LOCALS+$R,$bp
	add	%sp,LOCALS+$res_y,$ap
	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_y, res_y, R);
	add	%sp,LOCALS+$res_y,$rp

	add	%sp,LOCALS+$S2,$bp
	call	__ecp_nistz256_sub_from	! p256_sub(res_y, res_y, S2);
	add	%sp,LOCALS+$res_y,$rp

	ld	[%fp+STACK_BIAS-16],$t1	! !in1infty
	ld	[%fp+STACK_BIAS-12],$t2	! !in2infty
	ldx	[%fp+STACK_BIAS-8],$rp
___
# Bytes 0..63 of the locals (labelled "res" below) are copied to the
# caller's buffer eight bytes per iteration. movrz replaces the computed
# words with in2 when !in1infty is zero and with in1 when !in2infty is
# zero; the in1 substitution comes last and therefore wins if both are.
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ld	[%sp+LOCALS+$i],@acc[0]		! res
	ld	[%sp+LOCALS+$i+4],@acc[1]
	ld	[$bp_real+$i],@acc[2]		! in2
	ld	[$bp_real+$i+4],@acc[3]
	ld	[$ap_real+$i],@acc[4]		! in1
	ld	[$ap_real+$i+4],@acc[5]
	movrz	$t1,@acc[2],@acc[0]
	movrz	$t1,@acc[3],@acc[1]
	movrz	$t2,@acc[4],@acc[0]
	movrz	$t2,@acc[5],@acc[1]
	st	@acc[0],[$rp+$i]
	st	@acc[1],[$rp+$i+4]
___
}
# Bytes 64..95: in2 has no third coordinate to copy, so the words of
# \@ONE_mont are substituted as immediates when in1 is infinity.
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ld	[%sp+LOCALS+$i],@acc[0]		! res
	ld	[%sp+LOCALS+$i+4],@acc[1]
	ld	[$ap_real+$i],@acc[4]		! in1
	ld	[$ap_real+$i+4],@acc[5]
	movrz	$t1,@ONE_mont[$j],@acc[0]
	movrz	$t1,@ONE_mont[$j+1],@acc[1]
	movrz	$t2,@acc[4],@acc[0]
	movrz	$t2,@acc[5],@acc[1]
	st	@acc[0],[$rp+$i]
	st	@acc[1],[$rp+$i+4]
___
}
$code.=<<___;
	ret
	restore
.type	ecp_nistz256_point_add_affine,#function
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}						}}}
{{{
# Argument registers shared by the four scatter/gather routines below;
# $mask is the scratch register the gather routines use for the
# "index is non-zero" mask.
my ($out,$inp,$index)=map("%i$_",(0..2));
my $mask="%o0";

$code.=<<___;
! void	ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
!					  int %i2);
! Word k of each coordinate is stored at out + 64*k + 4*(index-1), and
! the three coordinates are placed 64*8 bytes apart.
.globl	ecp_nistz256_scatter_w5
.align	32
ecp_nistz256_scatter_w5:
	save	%sp,-STACK_FRAME,%sp

	sll	$index,2,$index
	add	$out,$index,$out

	ld	[$inp],%l0		! X
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
	add	$inp,32,$inp
	st	%l0,[$out+64*0-4]
	st	%l1,[$out+64*1-4]
	st	%l2,[$out+64*2-4]
	st	%l3,[$out+64*3-4]
	st	%l4,[$out+64*4-4]
	st	%l5,[$out+64*5-4]
	st	%l6,[$out+64*6-4]
	st	%l7,[$out+64*7-4]
	add	$out,64*8,$out

	ld	[$inp],%l0		! Y
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
	add	$inp,32,$inp
	st	%l0,[$out+64*0-4]
	st	%l1,[$out+64*1-4]
	st	%l2,[$out+64*2-4]
	st	%l3,[$out+64*3-4]
	st	%l4,[$out+64*4-4]
	st	%l5,[$out+64*5-4]
	st	%l6,[$out+64*6-4]
	st	%l7,[$out+64*7-4]
	add	$out,64*8,$out

	ld	[$inp],%l0		! Z
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
	st	%l0,[$out+64*0-4]
	st	%l1,[$out+64*1-4]
	st	%l2,[$out+64*2-4]
	st	%l3,[$out+64*3-4]
	st	%l4,[$out+64*4-4]
	st	%l5,[$out+64*5-4]
	st	%l6,[$out+64*6-4]
	st	%l7,[$out+64*7-4]

	ret
	restore
.type	ecp_nistz256_scatter_w5,#function
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

! void	ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
!					 int %i2);
! mask becomes all ones for a positive index and 0 for index 0, and the
! index is decremented only when it is non-zero. Index 0 therefore
! yields an all-zero point, while index i>0 reads the words that
! ecp_nistz256_scatter_w5 stored for the same index.
.globl	ecp_nistz256_gather_w5
.align	32
ecp_nistz256_gather_w5:
	save	%sp,-STACK_FRAME,%sp

	neg	$index,$mask
	srax	$mask,63,$mask

	add	$index,$mask,$index
	sll	$index,2,$index
	add	$inp,$index,$inp

	ld	[$inp+64*0],%l0
	ld	[$inp+64*1],%l1
	ld	[$inp+64*2],%l2
	ld	[$inp+64*3],%l3
	ld	[$inp+64*4],%l4
	ld	[$inp+64*5],%l5
	ld	[$inp+64*6],%l6
	ld	[$inp+64*7],%l7
	add	$inp,64*8,$inp
	and	%l0,$mask,%l0
	and	%l1,$mask,%l1
	st	%l0,[$out]		! X
	and	%l2,$mask,%l2
	st	%l1,[$out+4]
	and	%l3,$mask,%l3
	st	%l2,[$out+8]
	and	%l4,$mask,%l4
	st	%l3,[$out+12]
	and	%l5,$mask,%l5
	st	%l4,[$out+16]
	and	%l6,$mask,%l6
	st	%l5,[$out+20]
	and	%l7,$mask,%l7
	st	%l6,[$out+24]
	st	%l7,[$out+28]
	add	$out,32,$out

	ld	[$inp+64*0],%l0
	ld	[$inp+64*1],%l1
	ld	[$inp+64*2],%l2
	ld	[$inp+64*3],%l3
	ld	[$inp+64*4],%l4
	ld	[$inp+64*5],%l5
	ld	[$inp+64*6],%l6
	ld	[$inp+64*7],%l7
	add	$inp,64*8,$inp
	and	%l0,$mask,%l0
	and	%l1,$mask,%l1
	st	%l0,[$out]		! Y
	and	%l2,$mask,%l2
	st	%l1,[$out+4]
	and	%l3,$mask,%l3
	st	%l2,[$out+8]
	and	%l4,$mask,%l4
	st	%l3,[$out+12]
	and	%l5,$mask,%l5
	st	%l4,[$out+16]
	and	%l6,$mask,%l6
	st	%l5,[$out+20]
	and	%l7,$mask,%l7
	st	%l6,[$out+24]
	st	%l7,[$out+28]
	add	$out,32,$out

	ld	[$inp+64*0],%l0
	ld	[$inp+64*1],%l1
	ld	[$inp+64*2],%l2
	ld	[$inp+64*3],%l3
	ld	[$inp+64*4],%l4
	ld	[$inp+64*5],%l5
	ld	[$inp+64*6],%l6
	ld	[$inp+64*7],%l7
	and	%l0,$mask,%l0
	and	%l1,$mask,%l1
	st	%l0,[$out]		! Z
	and	%l2,$mask,%l2
	st	%l1,[$out+4]
	and	%l3,$mask,%l3
	st	%l2,[$out+8]
	and	%l4,$mask,%l4
	st	%l3,[$out+12]
	and	%l5,$mask,%l5
	st	%l4,[$out+16]
	and	%l6,$mask,%l6
	st	%l5,[$out+20]
	and	%l7,$mask,%l7
	st	%l6,[$out+24]
	st	%l7,[$out+28]

	ret
	restore
.type	ecp_nistz256_gather_w5,#function
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

! void	ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
!					  int %i2);
! Each of the 64/4 input words is split into four bytes, least
! significant first, which are stored 64 bytes apart starting at
! out+index. The index register is then reused as the loop counter.
.globl	ecp_nistz256_scatter_w7
.align	32
ecp_nistz256_scatter_w7:
	save	%sp,-STACK_FRAME,%sp
	nop
	add	$out,$index,$out
	mov	64/4,$index
.Loop_scatter_w7:
	ld	[$inp],%l0
	add	$inp,4,$inp
	subcc	$index,1,$index
	stb	%l0,[$out+64*0]
	srl	%l0,8,%l1
	stb	%l1,[$out+64*1]
	srl	%l0,16,%l2
	stb	%l2,[$out+64*2]
	srl	%l0,24,%l3
	stb	%l3,[$out+64*3]
	bne	.Loop_scatter_w7
	add	$out,64*4,$out

	ret
	restore
.type	ecp_nistz256_scatter_w7,#function
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

! void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
!					 int %i2);
! Same masking as in ecp_nistz256_gather_w5: index 0 produces all-zero
! output, and index i>0 reads the bytes at offset i-1, i.e. the ones
! ecp_nistz256_scatter_w7 stored with index i-1.
.globl	ecp_nistz256_gather_w7
.align	32
ecp_nistz256_gather_w7:
	save	%sp,-STACK_FRAME,%sp

	neg	$index,$mask
	srax	$mask,63,$mask

	add	$index,$mask,$index
	add	$inp,$index,$inp
	mov	64/4,$index

.Loop_gather_w7:
	ldub	[$inp+64*0],%l0
	prefetch	[$inp+3840+64*0],1
	subcc	$index,1,$index
	ldub	[$inp+64*1],%l1
	prefetch	[$inp+3840+64*1],1
	ldub	[$inp+64*2],%l2
	prefetch	[$inp+3840+64*2],1
	ldub	[$inp+64*3],%l3
	prefetch	[$inp+3840+64*3],1
	add	$inp,64*4,$inp
	sll	%l1,8,%l1
	sll	%l2,16,%l2
	or	%l0,%l1,%l0
	sll	%l3,24,%l3
	or	%l0,%l2,%l0
	or	%l0,%l3,%l0
	and	%l0,$mask,%l0
	st	%l0,[$out]
	bne	.Loop_gather_w7
	add	$out,4,$out

	ret
	restore
.type	ecp_nistz256_gather_w7,#function
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}}}
{{{
########################################################################
# Following subroutines are VIS3 counterparts of those above that
# implement ones found in ecp_nistz256.c. Key difference is that they
# use 128-bit multiplication and addition with 64-bit carry, and in order
# to do that they perform conversion from uint32_t[8] to uint64_t[4] upon
# entry and vice versa on return.
#
# Register map for the VIS3 helpers. The helpers keep the 256-bit
# working value in $acc0-$acc3, use $acc4/$acc5 for carry, borrow and
# scratch, and store their result through $rp.
my ($rp,$ap,$bp)=map("%i$_",(0..2));
my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
my ($rp_real,$ap_real)=("%g2","%g3");
my ($acc6,$acc7)=($bp,$bi);	# used in squaring

$code.=<<___;
! Doubles the accumulator, then shares the conditional subtraction at
! .Lreduce_by_sub_vis3; the carry is captured in the branch delay slot.
.align	32
__ecp_nistz256_mul_by_2_vis3:
	addcc	$acc0,$acc0,$acc0
	addxccc	$acc1,$acc1,$acc1
	addxccc	$acc2,$acc2,$acc2
	addxccc	$acc3,$acc3,$acc3
	b	.Lreduce_by_sub_vis3
	addxc	%g0,%g0,$acc4		! did it carry?
.type	__ecp_nistz256_mul_by_2_vis3,#function
.size	__ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3

! Adds the four 64-bit words at the second-operand pointer to the
! accumulator. The noload entry point expects them already loaded.
.align	32
__ecp_nistz256_add_vis3:
	ldx	[$bp+0],$t0
	ldx	[$bp+8],$t1
	ldx	[$bp+16],$t2
	ldx	[$bp+24],$t3

__ecp_nistz256_add_noload_vis3:

	addcc	$t0,$acc0,$acc0
	addxccc	$t1,$acc1,$acc1
	addxccc	$t2,$acc2,$acc2
	addxccc	$t3,$acc3,$acc3
	addxc	%g0,%g0,$acc4		! did it carry?

.Lreduce_by_sub_vis3:

	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
	addxccc	$acc1,$poly1,$t1
	addxccc	$acc2,$minus1,$t2
	addxccc	$acc3,$poly3,$t3
	addxc	$acc4,$minus1,$acc4

	movrz	$acc4,$t0,$acc0		! ret = borrow ? ret : ret-modulus
	movrz	$acc4,$t1,$acc1
	stx	$acc0,[$rp]
	movrz	$acc4,$t2,$acc2
	stx	$acc1,[$rp+8]
	movrz	$acc4,$t3,$acc3
	stx	$acc2,[$rp+16]
	retl
	stx	$acc3,[$rp+24]
.type	__ecp_nistz256_add_vis3,#function
.size	__ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3

! Trouble with subtraction is that there is no subtraction with 64-bit
! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
! recall that SPARC is big-endian, which is why you'll observe that
! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we
! "collect" result back to 64-bit $acc0-$acc3.
! Computes accumulator minus b. The poly1 register is inverted here to
! serve as the low-32-bit mask and is restored at .Lreduce_by_add_vis3.
.align	32
__ecp_nistz256_sub_from_vis3:
	ld	[$bp+4],$t0
	ld	[$bp+0],$t1
	ld	[$bp+12],$t2
	ld	[$bp+8],$t3

	srlx	$acc0,32,$acc4
	not	$poly1,$poly1
	srlx	$acc1,32,$acc5
	subcc	$acc0,$t0,$acc0
	ld	[$bp+20],$t0
	subccc	$acc4,$t1,$acc4
	ld	[$bp+16],$t1
	subccc	$acc1,$t2,$acc1
	ld	[$bp+28],$t2
	and	$acc0,$poly1,$acc0
	subccc	$acc5,$t3,$acc5
	ld	[$bp+24],$t3
	sllx	$acc4,32,$acc4
	and	$acc1,$poly1,$acc1
	sllx	$acc5,32,$acc5
	or	$acc0,$acc4,$acc0
	srlx	$acc2,32,$acc4
	or	$acc1,$acc5,$acc1
	srlx	$acc3,32,$acc5
	subccc	$acc2,$t0,$acc2
	subccc	$acc4,$t1,$acc4
	subccc	$acc3,$t2,$acc3
	and	$acc2,$poly1,$acc2
	subccc	$acc5,$t3,$acc5
	sllx	$acc4,32,$acc4
	and	$acc3,$poly1,$acc3
	sllx	$acc5,32,$acc5
	or	$acc2,$acc4,$acc2
	subc	%g0,%g0,$acc4		! did it borrow?
	b	.Lreduce_by_add_vis3
	or	$acc3,$acc5,$acc3
.type	__ecp_nistz256_sub_from_vis3,#function
.size	__ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3

! Same as __ecp_nistz256_sub_from_vis3 with the operands of every
! subtraction swapped, i.e. computes b minus accumulator, and falls
! through to the shared conditional addition of the modulus.
.align	32
__ecp_nistz256_sub_morf_vis3:
	ld	[$bp+4],$t0
	ld	[$bp+0],$t1
	ld	[$bp+12],$t2
	ld	[$bp+8],$t3

	srlx	$acc0,32,$acc4
	not	$poly1,$poly1
	srlx	$acc1,32,$acc5
	subcc	$t0,$acc0,$acc0
	ld	[$bp+20],$t0
	subccc	$t1,$acc4,$acc4
	ld	[$bp+16],$t1
	subccc	$t2,$acc1,$acc1
	ld	[$bp+28],$t2
	and	$acc0,$poly1,$acc0
	subccc	$t3,$acc5,$acc5
	ld	[$bp+24],$t3
	sllx	$acc4,32,$acc4
	and	$acc1,$poly1,$acc1
	sllx	$acc5,32,$acc5
	or	$acc0,$acc4,$acc0
	srlx	$acc2,32,$acc4
	or	$acc1,$acc5,$acc1
	srlx	$acc3,32,$acc5
	subccc	$t0,$acc2,$acc2
	subccc	$t1,$acc4,$acc4
	subccc	$t2,$acc3,$acc3
	and	$acc2,$poly1,$acc2
	subccc	$t3,$acc5,$acc5
	sllx	$acc4,32,$acc4
	and	$acc3,$poly1,$acc3
	sllx	$acc5,32,$acc5
	or	$acc2,$acc4,$acc2
	subc	%g0,%g0,$acc4		! did it borrow?
	or	$acc3,$acc5,$acc3

.Lreduce_by_add_vis3:

	addcc	$acc0,-1,$t0		! add modulus
	not	$poly3,$t3
	addxccc	$acc1,$poly1,$t1
	not	$poly1,$poly1		! restore $poly1
	addxccc	$acc2,%g0,$t2
	addxc	$acc3,$t3,$t3

	movrnz	$acc4,$t0,$acc0		! if a-b borrowed, ret = ret+mod
	movrnz	$acc4,$t1,$acc1
	stx	$acc0,[$rp]
	movrnz	$acc4,$t2,$acc2
	stx	$acc1,[$rp+8]
	movrnz	$acc4,$t3,$acc3
	stx	$acc2,[$rp+16]
	retl
	stx	$acc3,[$rp+24]
.type	__ecp_nistz256_sub_morf_vis3,#function
.size	__ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3

.align	32
__ecp_nistz256_div_by_2_vis3:
	! ret = (a is odd ? a+mod : a) >> 1

	not	$poly1,$t1
	not	$poly3,$t3
	and	$acc0,1,$acc5
	addcc	$acc0,-1,$t0		! add modulus
	addxccc	$acc1,$t1,$t1
	addxccc	$acc2,%g0,$t2
	addxccc	$acc3,$t3,$t3
	addxc	%g0,%g0,$acc4		! carry bit

	movrnz	$acc5,$t0,$acc0
	movrnz	$acc5,$t1,$acc1
	movrnz	$acc5,$t2,$acc2
	movrnz	$acc5,$t3,$acc3
	movrz	$acc5,%g0,$acc4

	! ret >>= 1

	srlx	$acc0,1,$acc0
	sllx	$acc1,63,$t0
	srlx	$acc1,1,$acc1
	or	$acc0,$t0,$acc0
	sllx	$acc2,63,$t1
	srlx	$acc2,1,$acc2
	or	$acc1,$t1,$acc1
	sllx	$acc3,63,$t2
	stx	$acc0,[$rp]
	srlx	$acc3,1,$acc3
	or	$acc2,$t2,$acc2
	sllx	$acc4,63,$t3		! don't forget carry bit
	stx	$acc1,[$rp+8]
	or	$acc3,$t3,$acc3
	stx	$acc2,[$rp+16]
	retl
	stx	$acc3,[$rp+24]
.type	__ecp_nistz256_div_by_2_vis3,#function
.size	__ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3

! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
! 4x faster [on T4]...
.align	32
__ecp_nistz256_mul_mont_vis3:
	mulx	$a0,$bi,$acc0
	not	$poly3,$poly3		! 0xFFFFFFFF00000001
	umulxhi	$a0,$bi,$t0
	mulx	$a1,$bi,$acc1
	umulxhi	$a1,$bi,$t1
	mulx	$a2,$bi,$acc2
	umulxhi	$a2,$bi,$t2
	mulx	$a3,$bi,$acc3
	umulxhi	$a3,$bi,$t3
	ldx	[$bp+8],$bi		! b[1]

	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
	sllx	$acc0,32,$t0
	addxccc	$acc2,$t1,$acc2
	srlx	$acc0,32,$t1
	addxccc	$acc3,$t2,$acc3
	addxc	%g0,$t3,$acc4
	mov	0,$acc5
___
# Three unrolled iterations, one for each of b[1]..b[3]. Each emits a
# reduction step for the current low accumulator word interleaved with
# the multiplication by the next word of b, which is loaded only while
# another iteration follows.
for($i=1;$i<4;$i++) {
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------
	#   ^^^^^^^^ but this word is calculated with umulxhi, because
	#            there is no subtract with 64-bit borrow:-(

$code.=<<___;
	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
	mulx	$a0,$bi,$t0
	addxccc	$acc2,$t1,$acc1
	mulx	$a1,$bi,$t1
	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
	mulx	$a2,$bi,$t2
	addxccc	$acc4,$t3,$acc3
	mulx	$a3,$bi,$t3
	addxc	$acc5,%g0,$acc4

	addcc	$acc0,$t0,$acc0		! accumulate low parts of multiplication
	umulxhi	$a0,$bi,$t0
	addxccc	$acc1,$t1,$acc1
	umulxhi	$a1,$bi,$t1
	addxccc	$acc2,$t2,$acc2
	umulxhi	$a2,$bi,$t2
	addxccc	$acc3,$t3,$acc3
	umulxhi	$a3,$bi,$t3
	addxc	$acc4,%g0,$acc4
___
$code.=<<___	if ($i<3);
	ldx	[$bp+8*($i+1)],$bi	! bp[$i+1]
___
$code.=<<___;
	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
	sllx	$acc0,32,$t0
	addxccc	$acc2,$t1,$acc2
	srlx	$acc0,32,$t1
	addxccc	$acc3,$t2,$acc3
	addxccc	$acc4,$t3,$acc4
	addxc	%g0,%g0,$acc5
___
}
# Final reduction step: nothing is left to multiply, so only the
# additions are emitted before branching to the shared tail.
$code.=<<___;
	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
	addxccc	$acc2,$t1,$acc1
	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
	addxccc	$acc4,$t3,$acc3
	b	.Lmul_final_vis3	! see below
	addxc	$acc5,%g0,$acc4
.type	__ecp_nistz256_mul_mont_vis3,#function
.size	__ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3

!
! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less
! instructions, but only 14% faster [on T4]...
.align	32
__ecp_nistz256_sqr_mont_vis3:
	!  |  |  |  |  |  |a1*a0|  |
	!  |  |  |  |  |a2*a0|  |  |
	!  |  |a3*a2|a3*a0|  |  |  |
	!  |  |  |  |a2*a1|  |  |  |
	!  |  |  |a3*a1|  |  |  |  |
	! *|  |  |  |  |  |  |  | 2|
	! +|a3*a3|a2*a2|a1*a1|a0*a0|
	!  |--+--+--+--+--+--+--+--|
	!  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	!
	! "can't overflow" below mark carrying into high part of
	! multiplication result, which can't overflow, because it
	! can never be all ones.

	mulx	$a1,$a0,$acc1		! a[1]*a[0]
	umulxhi	$a1,$a0,$t1
	mulx	$a2,$a0,$acc2		! a[2]*a[0]
	umulxhi	$a2,$a0,$t2
	mulx	$a3,$a0,$acc3		! a[3]*a[0]
	umulxhi	$a3,$a0,$acc4

	addcc	$acc2,$t1,$acc2		! accumulate high parts of multiplication
	mulx	$a2,$a1,$t0		! a[2]*a[1]
	umulxhi	$a2,$a1,$t1
	addxccc	$acc3,$t2,$acc3
	mulx	$a3,$a1,$t2		! a[3]*a[1]
	umulxhi	$a3,$a1,$t3
	addxc	$acc4,%g0,$acc4		! can't overflow

	mulx	$a3,$a2,$acc5		! a[3]*a[2]
	not	$poly3,$poly3		! 0xFFFFFFFF00000001
	umulxhi	$a3,$a2,$acc6

	addcc	$t2,$t1,$t1		! accumulate high parts of multiplication
	mulx	$a0,$a0,$acc0		! a[0]*a[0]
	addxc	$t3,%g0,$t2		! can't overflow

	addcc	$acc3,$t0,$acc3		! accumulate low parts of multiplication
	umulxhi	$a0,$a0,$a0
	addxccc	$acc4,$t1,$acc4
	mulx	$a1,$a1,$t1		! a[1]*a[1]
	addxccc	$acc5,$t2,$acc5
	umulxhi	$a1,$a1,$a1
	addxc	$acc6,%g0,$acc6		! can't overflow

	addcc	$acc1,$acc1,$acc1	! acc[1-6]*=2
	mulx	$a2,$a2,$t2		! a[2]*a[2]
	addxccc	$acc2,$acc2,$acc2
	umulxhi	$a2,$a2,$a2
	addxccc	$acc3,$acc3,$acc3
	mulx	$a3,$a3,$t3		! a[3]*a[3]
	addxccc	$acc4,$acc4,$acc4
	umulxhi	$a3,$a3,$a3
	addxccc	$acc5,$acc5,$acc5
	addxccc	$acc6,$acc6,$acc6
	addxc	%g0,%g0,$acc7

	addcc	$acc1,$a0,$acc1		! +a[i]*a[i]
	addxccc	$acc2,$t1,$acc2
	addxccc	$acc3,$a1,$acc3
	addxccc	$acc4,$t2,$acc4
	sllx	$acc0,32,$t0
	addxccc	$acc5,$a2,$acc5
	srlx	$acc0,32,$t1
	addxccc	$acc6,$t3,$acc6
	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
	addxc	$acc7,$a3,$acc7
___
# The heredoc inside this loop is emitted three times.  Each copy folds the
# current $acc0 into $acc1-$acc3 and, interleaved with that, prepares the
# shifted/subtracted operands ($t0,$t1,$t2) for the following step.  The
# heredoc does not reference $i; the loop only controls the repeat count.
for($i=0;$i<3;$i++) {			# reductions, see commentary
					# in multiplication for details
$code.=<<___;
	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
	sllx	$acc0,32,$t0
	addxccc	$acc2,$t1,$acc1
	srlx	$acc0,32,$t1
	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
	addxc	%g0,$t3,$acc3		! can't overflow
___
}
# Final reduction step (nothing is prepared for a further round), after which
# the upper half $acc4-$acc7 is added in.  The code then falls into
# .Lmul_final_vis3, which __ecp_nistz256_mul_mont_vis3 also branches to for
# the conditional subtraction of the modulus and the store to [$rp].
$code.=<<___;
	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
	addxccc	$acc2,$t1,$acc1
	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
	addxc	%g0,$t3,$acc3		! can't overflow

	addcc	$acc0,$acc4,$acc0	! accumulate upper half
	addxccc	$acc1,$acc5,$acc1
	addxccc	$acc2,$acc6,$acc2
	addxccc	$acc3,$acc7,$acc3
	addxc	%g0,%g0,$acc4

.Lmul_final_vis3:

	! Final step is "if result > mod, subtract mod", but as comparison
	! means subtraction, we do the subtraction and then copy outcome
	! if it didn't borrow. But note that as we [have to] replace
	! subtraction with addition with negative, carry/borrow logic is
	! inverse.

	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
	not	$poly3,$poly3		! restore 0x00000000FFFFFFFE
	addxccc	$acc1,$poly1,$t1
	addxccc	$acc2,$minus1,$t2
	addxccc	$acc3,$poly3,$t3
	addxccc	$acc4,$minus1,%g0	! did it carry?

	movcs	%xcc,$t0,$acc0
	movcs	%xcc,$t1,$acc1
	stx	$acc0,[$rp]
	movcs	%xcc,$t2,$acc2
	stx	$acc1,[$rp+8]
	movcs	%xcc,$t3,$acc3
	stx	$acc2,[$rp+16]
	retl
	stx	$acc3,[$rp+24]
.type	__ecp_nistz256_sqr_mont_vis3,#function
.size	__ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
___

########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($res_x,$res_y,$res_z,
    $in_x,$in_y,$in_z,
    $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.

# Emits ecp_nistz256_point_double_vis3.  The generated routine
#   - reads in_x/in_y/in_z as 32-bit words and repacks them into 64-bit
#     limbs stored in the stack slots named by the offsets declared above,
#   - runs the doubling sequence spelled out by the p256_* annotation on
#     each call (operands that are already in registers are noted inline),
#   - splits res_x/res_y/res_z back into 32-bit words and stores them at
#     $rp_real.
# .Ldouble_shortcut_vis3 is also a branch target of the point_add code that
# follows this block, which is why it sits after the save/mov prologue.
#
# NOTE(review): the delay slot of the final __ecp_nistz256_sub_from_vis3
# call repeats "add %sp,LOCALS64+$res_y,$bp", whereas every other call in
# this routine uses its delay slot to set $rp.  res_y is written out from
# $acc0-$acc3 right afterwards, so this looks harmless -- confirm against
# __ecp_nistz256_sub_from_vis3 before changing it.
$code.=<<___;
.align	32
ecp_nistz256_point_double_vis3:
	save	%sp,-STACK64_FRAME-32*10,%sp

	mov	$rp,$rp_real
.Ldouble_shortcut_vis3:
	mov	-1,$minus1
	mov	-2,$poly3
	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE

	! convert input to uint64_t[4]
	ld	[$ap],$a0			! in_x
	ld	[$ap+4],$t0
	ld	[$ap+8],$a1
	ld	[$ap+12],$t1
	ld	[$ap+16],$a2
	ld	[$ap+20],$t2
	ld	[$ap+24],$a3
	ld	[$ap+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	ld	[$ap+32],$acc0			! in_y
	or	$a0,$t0,$a0
	ld	[$ap+32+4],$t0
	sllx	$t2,32,$t2
	ld	[$ap+32+8],$acc1
	or	$a1,$t1,$a1
	ld	[$ap+32+12],$t1
	sllx	$t3,32,$t3
	ld	[$ap+32+16],$acc2
	or	$a2,$t2,$a2
	ld	[$ap+32+20],$t2
	or	$a3,$t3,$a3
	ld	[$ap+32+24],$acc3
	sllx	$t0,32,$t0
	ld	[$ap+32+28],$t3
	sllx	$t1,32,$t1
	stx	$a0,[%sp+LOCALS64+$in_x]
	sllx	$t2,32,$t2
	stx	$a1,[%sp+LOCALS64+$in_x+8]
	sllx	$t3,32,$t3
	stx	$a2,[%sp+LOCALS64+$in_x+16]
	or	$acc0,$t0,$acc0
	stx	$a3,[%sp+LOCALS64+$in_x+24]
	or	$acc1,$t1,$acc1
	stx	$acc0,[%sp+LOCALS64+$in_y]
	or	$acc2,$t2,$acc2
	stx	$acc1,[%sp+LOCALS64+$in_y+8]
	or	$acc3,$t3,$acc3
	stx	$acc2,[%sp+LOCALS64+$in_y+16]
	stx	$acc3,[%sp+LOCALS64+$in_y+24]

	ld	[$ap+64],$a0			! in_z
	ld	[$ap+64+4],$t0
	ld	[$ap+64+8],$a1
	ld	[$ap+64+12],$t1
	ld	[$ap+64+16],$a2
	ld	[$ap+64+20],$t2
	ld	[$ap+64+24],$a3
	ld	[$ap+64+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	or	$a0,$t0,$a0
	sllx	$t2,32,$t2
	or	$a1,$t1,$a1
	sllx	$t3,32,$t3
	or	$a2,$t2,$a2
	or	$a3,$t3,$a3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	stx	$a0,[%sp+LOCALS64+$in_z]
	sllx	$t2,32,$t2
	stx	$a1,[%sp+LOCALS64+$in_z+8]
	sllx	$t3,32,$t3
	stx	$a2,[%sp+LOCALS64+$in_z+16]
	stx	$a3,[%sp+LOCALS64+$in_z+24]

	! in_y is still in $acc0-$acc3
	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(S, in_y);
	add	%sp,LOCALS64+$S,$rp

	! in_z is still in $a0-$a3
	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Zsqr, in_z);
	add	%sp,LOCALS64+$Zsqr,$rp

	mov	$acc0,$a0			! put Zsqr aside
	mov	$acc1,$a1
	mov	$acc2,$a2
	mov	$acc3,$a3

	add	%sp,LOCALS64+$in_x,$bp
	call	__ecp_nistz256_add_vis3		! p256_add(M, Zsqr, in_x);
	add	%sp,LOCALS64+$M,$rp

	mov	$a0,$acc0			! restore Zsqr
	ldx	[%sp+LOCALS64+$S],$a0		! forward load
	mov	$a1,$acc1
	ldx	[%sp+LOCALS64+$S+8],$a1
	mov	$a2,$acc2
	ldx	[%sp+LOCALS64+$S+16],$a2
	mov	$a3,$acc3
	ldx	[%sp+LOCALS64+$S+24],$a3

	add	%sp,LOCALS64+$in_x,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(Zsqr, in_x, Zsqr);
	add	%sp,LOCALS64+$Zsqr,$rp

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(S, S);
	add	%sp,LOCALS64+$S,$rp

	ldx	[%sp+LOCALS64+$in_z],$bi
	ldx	[%sp+LOCALS64+$in_y],$a0
	ldx	[%sp+LOCALS64+$in_y+8],$a1
	ldx	[%sp+LOCALS64+$in_y+16],$a2
	ldx	[%sp+LOCALS64+$in_y+24],$a3
	add	%sp,LOCALS64+$in_z,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(tmp0, in_z, in_y);
	add	%sp,LOCALS64+$tmp0,$rp

	ldx	[%sp+LOCALS64+$M],$bi		! forward load
	ldx	[%sp+LOCALS64+$Zsqr],$a0
	ldx	[%sp+LOCALS64+$Zsqr+8],$a1
	ldx	[%sp+LOCALS64+$Zsqr+16],$a2
	ldx	[%sp+LOCALS64+$Zsqr+24],$a3

	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(res_z, tmp0);
	add	%sp,LOCALS64+$res_z,$rp

	add	%sp,LOCALS64+$M,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(M, M, Zsqr);
	add	%sp,LOCALS64+$M,$rp

	mov	$acc0,$a0			! put aside M
	mov	$acc1,$a1
	mov	$acc2,$a2
	mov	$acc3,$a3
	call	__ecp_nistz256_mul_by_2_vis3
	add	%sp,LOCALS64+$M,$rp
	mov	$a0,$t0				! copy M
	ldx	[%sp+LOCALS64+$S],$a0		! forward load
	mov	$a1,$t1
	ldx	[%sp+LOCALS64+$S+8],$a1
	mov	$a2,$t2
	ldx	[%sp+LOCALS64+$S+16],$a2
	mov	$a3,$t3
	ldx	[%sp+LOCALS64+$S+24],$a3
	call	__ecp_nistz256_add_noload_vis3	! p256_mul_by_3(M, M);
	add	%sp,LOCALS64+$M,$rp

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(tmp0, S);
	add	%sp,LOCALS64+$tmp0,$rp

	ldx	[%sp+LOCALS64+$S],$bi		! forward load
	ldx	[%sp+LOCALS64+$in_x],$a0
	ldx	[%sp+LOCALS64+$in_x+8],$a1
	ldx	[%sp+LOCALS64+$in_x+16],$a2
	ldx	[%sp+LOCALS64+$in_x+24],$a3

	call	__ecp_nistz256_div_by_2_vis3	! p256_div_by_2(res_y, tmp0);
	add	%sp,LOCALS64+$res_y,$rp

	add	%sp,LOCALS64+$S,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, in_x);
	add	%sp,LOCALS64+$S,$rp

	ldx	[%sp+LOCALS64+$M],$a0		! forward load
	ldx	[%sp+LOCALS64+$M+8],$a1
	ldx	[%sp+LOCALS64+$M+16],$a2
	ldx	[%sp+LOCALS64+$M+24],$a3

	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(tmp0, S);
	add	%sp,LOCALS64+$tmp0,$rp

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(res_x, M);
	add	%sp,LOCALS64+$res_x,$rp

	add	%sp,LOCALS64+$tmp0,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_x, res_x, tmp0);
	add	%sp,LOCALS64+$res_x,$rp

	ldx	[%sp+LOCALS64+$M],$a0		! forward load
	ldx	[%sp+LOCALS64+$M+8],$a1
	ldx	[%sp+LOCALS64+$M+16],$a2
	ldx	[%sp+LOCALS64+$M+24],$a3

	add	%sp,LOCALS64+$S,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(S, S, res_x);
	add	%sp,LOCALS64+$S,$rp

	mov	$acc0,$bi
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, M);
	add	%sp,LOCALS64+$S,$rp

	ldx	[%sp+LOCALS64+$res_x],$a0	! forward load
	ldx	[%sp+LOCALS64+$res_x+8],$a1
	ldx	[%sp+LOCALS64+$res_x+16],$a2
	ldx	[%sp+LOCALS64+$res_x+24],$a3

	add	%sp,LOCALS64+$res_y,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, S, res_y);
	add	%sp,LOCALS64+$res_y,$bp

	! convert output to uint_32[8]
	srlx	$a0,32,$t0
	srlx	$a1,32,$t1
	st	$a0,[$rp_real]			! res_x
	srlx	$a2,32,$t2
	st	$t0,[$rp_real+4]
	srlx	$a3,32,$t3
	st	$a1,[$rp_real+8]
	st	$t1,[$rp_real+12]
	st	$a2,[$rp_real+16]
	st	$t2,[$rp_real+20]
	st	$a3,[$rp_real+24]
	st	$t3,[$rp_real+28]

	ldx	[%sp+LOCALS64+$res_z],$a0	! forward load
	srlx	$acc0,32,$t0
	ldx	[%sp+LOCALS64+$res_z+8],$a1
	srlx	$acc1,32,$t1
	ldx	[%sp+LOCALS64+$res_z+16],$a2
	srlx	$acc2,32,$t2
	ldx	[%sp+LOCALS64+$res_z+24],$a3
	srlx	$acc3,32,$t3
	st	$acc0,[$rp_real+32]		! res_y
	st	$t0, [$rp_real+32+4]
	st	$acc1,[$rp_real+32+8]
	st	$t1, [$rp_real+32+12]
	st	$acc2,[$rp_real+32+16]
	st	$t2, [$rp_real+32+20]
	st	$acc3,[$rp_real+32+24]
	st	$t3, [$rp_real+32+28]

	srlx	$a0,32,$t0
	srlx	$a1,32,$t1
	st	$a0,[$rp_real+64]		! res_z
	srlx	$a2,32,$t2
	st	$t0,[$rp_real+64+4]
	srlx	$a3,32,$t3
	st	$a1,[$rp_real+64+8]
	st	$t1,[$rp_real+64+12]
	st	$a2,[$rp_real+64+16]
	st	$t2,[$rp_real+64+20]
	st	$a3,[$rp_real+64+24]
	st	$t3,[$rp_real+64+28]

	ret
	restore
.type	ecp_nistz256_point_double_vis3,#function
.size	ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
___
}
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#					      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

#
above map() describes stack layout with 18 temporary 2302e1051a39Sopenharmony_ci# 256-bit vectors on top. Then we reserve some space for 2303e1051a39Sopenharmony_ci# !in1infty, !in2infty and result of check for zero. 2304e1051a39Sopenharmony_ci 2305e1051a39Sopenharmony_ci$code.=<<___; 2306e1051a39Sopenharmony_ci.align 32 2307e1051a39Sopenharmony_ciecp_nistz256_point_add_vis3: 2308e1051a39Sopenharmony_ci save %sp,-STACK64_FRAME-32*18-32,%sp 2309e1051a39Sopenharmony_ci 2310e1051a39Sopenharmony_ci mov $rp,$rp_real 2311e1051a39Sopenharmony_ci mov -1,$minus1 2312e1051a39Sopenharmony_ci mov -2,$poly3 2313e1051a39Sopenharmony_ci sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 2314e1051a39Sopenharmony_ci srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE 2315e1051a39Sopenharmony_ci 2316e1051a39Sopenharmony_ci ! convert input to uint64_t[4] 2317e1051a39Sopenharmony_ci ld [$bp],$a0 ! in2_x 2318e1051a39Sopenharmony_ci ld [$bp+4],$t0 2319e1051a39Sopenharmony_ci ld [$bp+8],$a1 2320e1051a39Sopenharmony_ci ld [$bp+12],$t1 2321e1051a39Sopenharmony_ci ld [$bp+16],$a2 2322e1051a39Sopenharmony_ci ld [$bp+20],$t2 2323e1051a39Sopenharmony_ci ld [$bp+24],$a3 2324e1051a39Sopenharmony_ci ld [$bp+28],$t3 2325e1051a39Sopenharmony_ci sllx $t0,32,$t0 2326e1051a39Sopenharmony_ci sllx $t1,32,$t1 2327e1051a39Sopenharmony_ci ld [$bp+32],$acc0 ! 
in2_y 2328e1051a39Sopenharmony_ci or $a0,$t0,$a0 2329e1051a39Sopenharmony_ci ld [$bp+32+4],$t0 2330e1051a39Sopenharmony_ci sllx $t2,32,$t2 2331e1051a39Sopenharmony_ci ld [$bp+32+8],$acc1 2332e1051a39Sopenharmony_ci or $a1,$t1,$a1 2333e1051a39Sopenharmony_ci ld [$bp+32+12],$t1 2334e1051a39Sopenharmony_ci sllx $t3,32,$t3 2335e1051a39Sopenharmony_ci ld [$bp+32+16],$acc2 2336e1051a39Sopenharmony_ci or $a2,$t2,$a2 2337e1051a39Sopenharmony_ci ld [$bp+32+20],$t2 2338e1051a39Sopenharmony_ci or $a3,$t3,$a3 2339e1051a39Sopenharmony_ci ld [$bp+32+24],$acc3 2340e1051a39Sopenharmony_ci sllx $t0,32,$t0 2341e1051a39Sopenharmony_ci ld [$bp+32+28],$t3 2342e1051a39Sopenharmony_ci sllx $t1,32,$t1 2343e1051a39Sopenharmony_ci stx $a0,[%sp+LOCALS64+$in2_x] 2344e1051a39Sopenharmony_ci sllx $t2,32,$t2 2345e1051a39Sopenharmony_ci stx $a1,[%sp+LOCALS64+$in2_x+8] 2346e1051a39Sopenharmony_ci sllx $t3,32,$t3 2347e1051a39Sopenharmony_ci stx $a2,[%sp+LOCALS64+$in2_x+16] 2348e1051a39Sopenharmony_ci or $acc0,$t0,$acc0 2349e1051a39Sopenharmony_ci stx $a3,[%sp+LOCALS64+$in2_x+24] 2350e1051a39Sopenharmony_ci or $acc1,$t1,$acc1 2351e1051a39Sopenharmony_ci stx $acc0,[%sp+LOCALS64+$in2_y] 2352e1051a39Sopenharmony_ci or $acc2,$t2,$acc2 2353e1051a39Sopenharmony_ci stx $acc1,[%sp+LOCALS64+$in2_y+8] 2354e1051a39Sopenharmony_ci or $acc3,$t3,$acc3 2355e1051a39Sopenharmony_ci stx $acc2,[%sp+LOCALS64+$in2_y+16] 2356e1051a39Sopenharmony_ci stx $acc3,[%sp+LOCALS64+$in2_y+24] 2357e1051a39Sopenharmony_ci 2358e1051a39Sopenharmony_ci ld [$bp+64],$acc0 ! in2_z 2359e1051a39Sopenharmony_ci ld [$bp+64+4],$t0 2360e1051a39Sopenharmony_ci ld [$bp+64+8],$acc1 2361e1051a39Sopenharmony_ci ld [$bp+64+12],$t1 2362e1051a39Sopenharmony_ci ld [$bp+64+16],$acc2 2363e1051a39Sopenharmony_ci ld [$bp+64+20],$t2 2364e1051a39Sopenharmony_ci ld [$bp+64+24],$acc3 2365e1051a39Sopenharmony_ci ld [$bp+64+28],$t3 2366e1051a39Sopenharmony_ci sllx $t0,32,$t0 2367e1051a39Sopenharmony_ci sllx $t1,32,$t1 2368e1051a39Sopenharmony_ci ld [$ap],$a0 ! 
in1_x 2369e1051a39Sopenharmony_ci or $acc0,$t0,$acc0 2370e1051a39Sopenharmony_ci ld [$ap+4],$t0 2371e1051a39Sopenharmony_ci sllx $t2,32,$t2 2372e1051a39Sopenharmony_ci ld [$ap+8],$a1 2373e1051a39Sopenharmony_ci or $acc1,$t1,$acc1 2374e1051a39Sopenharmony_ci ld [$ap+12],$t1 2375e1051a39Sopenharmony_ci sllx $t3,32,$t3 2376e1051a39Sopenharmony_ci ld [$ap+16],$a2 2377e1051a39Sopenharmony_ci or $acc2,$t2,$acc2 2378e1051a39Sopenharmony_ci ld [$ap+20],$t2 2379e1051a39Sopenharmony_ci or $acc3,$t3,$acc3 2380e1051a39Sopenharmony_ci ld [$ap+24],$a3 2381e1051a39Sopenharmony_ci sllx $t0,32,$t0 2382e1051a39Sopenharmony_ci ld [$ap+28],$t3 2383e1051a39Sopenharmony_ci sllx $t1,32,$t1 2384e1051a39Sopenharmony_ci stx $acc0,[%sp+LOCALS64+$in2_z] 2385e1051a39Sopenharmony_ci sllx $t2,32,$t2 2386e1051a39Sopenharmony_ci stx $acc1,[%sp+LOCALS64+$in2_z+8] 2387e1051a39Sopenharmony_ci sllx $t3,32,$t3 2388e1051a39Sopenharmony_ci stx $acc2,[%sp+LOCALS64+$in2_z+16] 2389e1051a39Sopenharmony_ci stx $acc3,[%sp+LOCALS64+$in2_z+24] 2390e1051a39Sopenharmony_ci 2391e1051a39Sopenharmony_ci or $acc1,$acc0,$acc0 2392e1051a39Sopenharmony_ci or $acc3,$acc2,$acc2 2393e1051a39Sopenharmony_ci or $acc2,$acc0,$acc0 2394e1051a39Sopenharmony_ci movrnz $acc0,-1,$acc0 ! !in2infty 2395e1051a39Sopenharmony_ci stx $acc0,[%fp+STACK_BIAS-8] 2396e1051a39Sopenharmony_ci 2397e1051a39Sopenharmony_ci or $a0,$t0,$a0 2398e1051a39Sopenharmony_ci ld [$ap+32],$acc0 ! 
in1_y 2399e1051a39Sopenharmony_ci or $a1,$t1,$a1 2400e1051a39Sopenharmony_ci ld [$ap+32+4],$t0 2401e1051a39Sopenharmony_ci or $a2,$t2,$a2 2402e1051a39Sopenharmony_ci ld [$ap+32+8],$acc1 2403e1051a39Sopenharmony_ci or $a3,$t3,$a3 2404e1051a39Sopenharmony_ci ld [$ap+32+12],$t1 2405e1051a39Sopenharmony_ci ld [$ap+32+16],$acc2 2406e1051a39Sopenharmony_ci ld [$ap+32+20],$t2 2407e1051a39Sopenharmony_ci ld [$ap+32+24],$acc3 2408e1051a39Sopenharmony_ci sllx $t0,32,$t0 2409e1051a39Sopenharmony_ci ld [$ap+32+28],$t3 2410e1051a39Sopenharmony_ci sllx $t1,32,$t1 2411e1051a39Sopenharmony_ci stx $a0,[%sp+LOCALS64+$in1_x] 2412e1051a39Sopenharmony_ci sllx $t2,32,$t2 2413e1051a39Sopenharmony_ci stx $a1,[%sp+LOCALS64+$in1_x+8] 2414e1051a39Sopenharmony_ci sllx $t3,32,$t3 2415e1051a39Sopenharmony_ci stx $a2,[%sp+LOCALS64+$in1_x+16] 2416e1051a39Sopenharmony_ci or $acc0,$t0,$acc0 2417e1051a39Sopenharmony_ci stx $a3,[%sp+LOCALS64+$in1_x+24] 2418e1051a39Sopenharmony_ci or $acc1,$t1,$acc1 2419e1051a39Sopenharmony_ci stx $acc0,[%sp+LOCALS64+$in1_y] 2420e1051a39Sopenharmony_ci or $acc2,$t2,$acc2 2421e1051a39Sopenharmony_ci stx $acc1,[%sp+LOCALS64+$in1_y+8] 2422e1051a39Sopenharmony_ci or $acc3,$t3,$acc3 2423e1051a39Sopenharmony_ci stx $acc2,[%sp+LOCALS64+$in1_y+16] 2424e1051a39Sopenharmony_ci stx $acc3,[%sp+LOCALS64+$in1_y+24] 2425e1051a39Sopenharmony_ci 2426e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load 2427e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+8],$a1 2428e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+16],$a2 2429e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+24],$a3 2430e1051a39Sopenharmony_ci 2431e1051a39Sopenharmony_ci ld [$ap+64],$acc0 ! 
in1_z 2432e1051a39Sopenharmony_ci ld [$ap+64+4],$t0 2433e1051a39Sopenharmony_ci ld [$ap+64+8],$acc1 2434e1051a39Sopenharmony_ci ld [$ap+64+12],$t1 2435e1051a39Sopenharmony_ci ld [$ap+64+16],$acc2 2436e1051a39Sopenharmony_ci ld [$ap+64+20],$t2 2437e1051a39Sopenharmony_ci ld [$ap+64+24],$acc3 2438e1051a39Sopenharmony_ci ld [$ap+64+28],$t3 2439e1051a39Sopenharmony_ci sllx $t0,32,$t0 2440e1051a39Sopenharmony_ci sllx $t1,32,$t1 2441e1051a39Sopenharmony_ci or $acc0,$t0,$acc0 2442e1051a39Sopenharmony_ci sllx $t2,32,$t2 2443e1051a39Sopenharmony_ci or $acc1,$t1,$acc1 2444e1051a39Sopenharmony_ci sllx $t3,32,$t3 2445e1051a39Sopenharmony_ci stx $acc0,[%sp+LOCALS64+$in1_z] 2446e1051a39Sopenharmony_ci or $acc2,$t2,$acc2 2447e1051a39Sopenharmony_ci stx $acc1,[%sp+LOCALS64+$in1_z+8] 2448e1051a39Sopenharmony_ci or $acc3,$t3,$acc3 2449e1051a39Sopenharmony_ci stx $acc2,[%sp+LOCALS64+$in1_z+16] 2450e1051a39Sopenharmony_ci stx $acc3,[%sp+LOCALS64+$in1_z+24] 2451e1051a39Sopenharmony_ci 2452e1051a39Sopenharmony_ci or $acc1,$acc0,$acc0 2453e1051a39Sopenharmony_ci or $acc3,$acc2,$acc2 2454e1051a39Sopenharmony_ci or $acc2,$acc0,$acc0 2455e1051a39Sopenharmony_ci movrnz $acc0,-1,$acc0 ! !in1infty 2456e1051a39Sopenharmony_ci stx $acc0,[%fp+STACK_BIAS-16] 2457e1051a39Sopenharmony_ci 2458e1051a39Sopenharmony_ci call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); 2459e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z2sqr,$rp 2460e1051a39Sopenharmony_ci 2461e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z],$a0 2462e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+8],$a1 2463e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+16],$a2 2464e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+24],$a3 2465e1051a39Sopenharmony_ci call __ecp_nistz256_sqr_mont_vis3 ! 
p256_sqr_mont(Z1sqr, in1_z); 2466e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z1sqr,$rp 2467e1051a39Sopenharmony_ci 2468e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Z2sqr],$bi 2469e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z],$a0 2470e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+8],$a1 2471e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+16],$a2 2472e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+24],$a3 2473e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z2sqr,$bp 2474e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); 2475e1051a39Sopenharmony_ci add %sp,LOCALS64+$S1,$rp 2476e1051a39Sopenharmony_ci 2477e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Z1sqr],$bi 2478e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z],$a0 2479e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+8],$a1 2480e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+16],$a2 2481e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+24],$a3 2482e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z1sqr,$bp 2483e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); 2484e1051a39Sopenharmony_ci add %sp,LOCALS64+$S2,$rp 2485e1051a39Sopenharmony_ci 2486e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$S1],$bi 2487e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_y],$a0 2488e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_y+8],$a1 2489e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_y+16],$a2 2490e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_y+24],$a3 2491e1051a39Sopenharmony_ci add %sp,LOCALS64+$S1,$bp 2492e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! 
p256_mul_mont(S1, S1, in1_y); 2493e1051a39Sopenharmony_ci add %sp,LOCALS64+$S1,$rp 2494e1051a39Sopenharmony_ci 2495e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$S2],$bi 2496e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_y],$a0 2497e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_y+8],$a1 2498e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_y+16],$a2 2499e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_y+24],$a3 2500e1051a39Sopenharmony_ci add %sp,LOCALS64+$S2,$bp 2501e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); 2502e1051a39Sopenharmony_ci add %sp,LOCALS64+$S2,$rp 2503e1051a39Sopenharmony_ci 2504e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load 2505e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x],$a0 2506e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x+8],$a1 2507e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x+16],$a2 2508e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x+24],$a3 2509e1051a39Sopenharmony_ci 2510e1051a39Sopenharmony_ci add %sp,LOCALS64+$S1,$bp 2511e1051a39Sopenharmony_ci call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); 2512e1051a39Sopenharmony_ci add %sp,LOCALS64+$R,$rp 2513e1051a39Sopenharmony_ci 2514e1051a39Sopenharmony_ci or $acc1,$acc0,$acc0 ! see if result is zero 2515e1051a39Sopenharmony_ci or $acc3,$acc2,$acc2 2516e1051a39Sopenharmony_ci or $acc2,$acc0,$acc0 2517e1051a39Sopenharmony_ci stx $acc0,[%fp+STACK_BIAS-24] 2518e1051a39Sopenharmony_ci 2519e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z2sqr,$bp 2520e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! 
p256_mul_mont(U1, in1_x, Z2sqr); 2521e1051a39Sopenharmony_ci add %sp,LOCALS64+$U1,$rp 2522e1051a39Sopenharmony_ci 2523e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Z1sqr],$bi 2524e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x],$a0 2525e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x+8],$a1 2526e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x+16],$a2 2527e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x+24],$a3 2528e1051a39Sopenharmony_ci add %sp,LOCALS64+$Z1sqr,$bp 2529e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); 2530e1051a39Sopenharmony_ci add %sp,LOCALS64+$U2,$rp 2531e1051a39Sopenharmony_ci 2532e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$R],$a0 ! forward load 2533e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$R+8],$a1 2534e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$R+16],$a2 2535e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$R+24],$a3 2536e1051a39Sopenharmony_ci 2537e1051a39Sopenharmony_ci add %sp,LOCALS64+$U1,$bp 2538e1051a39Sopenharmony_ci call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); 2539e1051a39Sopenharmony_ci add %sp,LOCALS64+$H,$rp 2540e1051a39Sopenharmony_ci 2541e1051a39Sopenharmony_ci or $acc1,$acc0,$acc0 ! see if result is zero 2542e1051a39Sopenharmony_ci or $acc3,$acc2,$acc2 2543e1051a39Sopenharmony_ci orcc $acc2,$acc0,$acc0 2544e1051a39Sopenharmony_ci 2545e1051a39Sopenharmony_ci bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? 2546e1051a39Sopenharmony_ci nop 2547e1051a39Sopenharmony_ci 2548e1051a39Sopenharmony_ci ldx [%fp+STACK_BIAS-8],$t0 2549e1051a39Sopenharmony_ci ldx [%fp+STACK_BIAS-16],$t1 2550e1051a39Sopenharmony_ci ldx [%fp+STACK_BIAS-24],$t2 2551e1051a39Sopenharmony_ci andcc $t0,$t1,%g0 2552e1051a39Sopenharmony_ci be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? 2553e1051a39Sopenharmony_ci nop 2554e1051a39Sopenharmony_ci andcc $t2,$t2,%g0 2555e1051a39Sopenharmony_ci be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? 2556e1051a39Sopenharmony_ci add %sp,32*(12-10)+32,%sp ! 
difference in frame sizes 2557e1051a39Sopenharmony_ci 2558e1051a39Sopenharmony_ci st %g0,[$rp_real] 2559e1051a39Sopenharmony_ci st %g0,[$rp_real+4] 2560e1051a39Sopenharmony_ci st %g0,[$rp_real+8] 2561e1051a39Sopenharmony_ci st %g0,[$rp_real+12] 2562e1051a39Sopenharmony_ci st %g0,[$rp_real+16] 2563e1051a39Sopenharmony_ci st %g0,[$rp_real+20] 2564e1051a39Sopenharmony_ci st %g0,[$rp_real+24] 2565e1051a39Sopenharmony_ci st %g0,[$rp_real+28] 2566e1051a39Sopenharmony_ci st %g0,[$rp_real+32] 2567e1051a39Sopenharmony_ci st %g0,[$rp_real+32+4] 2568e1051a39Sopenharmony_ci st %g0,[$rp_real+32+8] 2569e1051a39Sopenharmony_ci st %g0,[$rp_real+32+12] 2570e1051a39Sopenharmony_ci st %g0,[$rp_real+32+16] 2571e1051a39Sopenharmony_ci st %g0,[$rp_real+32+20] 2572e1051a39Sopenharmony_ci st %g0,[$rp_real+32+24] 2573e1051a39Sopenharmony_ci st %g0,[$rp_real+32+28] 2574e1051a39Sopenharmony_ci st %g0,[$rp_real+64] 2575e1051a39Sopenharmony_ci st %g0,[$rp_real+64+4] 2576e1051a39Sopenharmony_ci st %g0,[$rp_real+64+8] 2577e1051a39Sopenharmony_ci st %g0,[$rp_real+64+12] 2578e1051a39Sopenharmony_ci st %g0,[$rp_real+64+16] 2579e1051a39Sopenharmony_ci st %g0,[$rp_real+64+20] 2580e1051a39Sopenharmony_ci st %g0,[$rp_real+64+24] 2581e1051a39Sopenharmony_ci st %g0,[$rp_real+64+28] 2582e1051a39Sopenharmony_ci b .Ladd_done_vis3 2583e1051a39Sopenharmony_ci nop 2584e1051a39Sopenharmony_ci 2585e1051a39Sopenharmony_ci.align 16 2586e1051a39Sopenharmony_ci.Ladd_proceed_vis3: 2587e1051a39Sopenharmony_ci call __ecp_nistz256_sqr_mont_vis3 ! 
p256_sqr_mont(Rsqr, R); 2588e1051a39Sopenharmony_ci add %sp,LOCALS64+$Rsqr,$rp 2589e1051a39Sopenharmony_ci 2590e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H],$bi 2591e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z],$a0 2592e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+8],$a1 2593e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+16],$a2 2594e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_z+24],$a3 2595e1051a39Sopenharmony_ci add %sp,LOCALS64+$H,$bp 2596e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); 2597e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_z,$rp 2598e1051a39Sopenharmony_ci 2599e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H],$a0 2600e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H+8],$a1 2601e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H+16],$a2 2602e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H+24],$a3 2603e1051a39Sopenharmony_ci call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); 2604e1051a39Sopenharmony_ci add %sp,LOCALS64+$Hsqr,$rp 2605e1051a39Sopenharmony_ci 2606e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_z],$bi 2607e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z],$a0 2608e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+8],$a1 2609e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+16],$a2 2610e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_z+24],$a3 2611e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_z,$bp 2612e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); 2613e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_z,$rp 2614e1051a39Sopenharmony_ci 2615e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$H],$bi 2616e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr],$a0 2617e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+8],$a1 2618e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+16],$a2 2619e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+24],$a3 2620e1051a39Sopenharmony_ci add %sp,LOCALS64+$H,$bp 2621e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! 
p256_mul_mont(Hcub, Hsqr, H); 2622e1051a39Sopenharmony_ci add %sp,LOCALS64+$Hcub,$rp 2623e1051a39Sopenharmony_ci 2624e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$U1],$bi 2625e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr],$a0 2626e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+8],$a1 2627e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+16],$a2 2628e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hsqr+24],$a3 2629e1051a39Sopenharmony_ci add %sp,LOCALS64+$U1,$bp 2630e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); 2631e1051a39Sopenharmony_ci add %sp,LOCALS64+$U2,$rp 2632e1051a39Sopenharmony_ci 2633e1051a39Sopenharmony_ci call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); 2634e1051a39Sopenharmony_ci add %sp,LOCALS64+$Hsqr,$rp 2635e1051a39Sopenharmony_ci 2636e1051a39Sopenharmony_ci add %sp,LOCALS64+$Rsqr,$bp 2637e1051a39Sopenharmony_ci call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); 2638e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_x,$rp 2639e1051a39Sopenharmony_ci 2640e1051a39Sopenharmony_ci add %sp,LOCALS64+$Hcub,$bp 2641e1051a39Sopenharmony_ci call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); 2642e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_x,$rp 2643e1051a39Sopenharmony_ci 2644e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$S1],$bi ! forward load 2645e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hcub],$a0 2646e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hcub+8],$a1 2647e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hcub+16],$a2 2648e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$Hcub+24],$a3 2649e1051a39Sopenharmony_ci 2650e1051a39Sopenharmony_ci add %sp,LOCALS64+$U2,$bp 2651e1051a39Sopenharmony_ci call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); 2652e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_y,$rp 2653e1051a39Sopenharmony_ci 2654e1051a39Sopenharmony_ci add %sp,LOCALS64+$S1,$bp 2655e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! 
p256_mul_mont(S2, S1, Hcub); 2656e1051a39Sopenharmony_ci add %sp,LOCALS64+$S2,$rp 2657e1051a39Sopenharmony_ci 2658e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$R],$bi 2659e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_y],$a0 2660e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_y+8],$a1 2661e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_y+16],$a2 2662e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_y+24],$a3 2663e1051a39Sopenharmony_ci add %sp,LOCALS64+$R,$bp 2664e1051a39Sopenharmony_ci call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); 2665e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_y,$rp 2666e1051a39Sopenharmony_ci 2667e1051a39Sopenharmony_ci add %sp,LOCALS64+$S2,$bp 2668e1051a39Sopenharmony_ci call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); 2669e1051a39Sopenharmony_ci add %sp,LOCALS64+$res_y,$rp 2670e1051a39Sopenharmony_ci 2671e1051a39Sopenharmony_ci ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty 2672e1051a39Sopenharmony_ci ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty 2673e1051a39Sopenharmony_ci___ 2674e1051a39Sopenharmony_cifor($i=0;$i<96;$i+=16) { # conditional moves 2675e1051a39Sopenharmony_ci$code.=<<___; 2676e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res 2677e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 2678e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 2679e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 2680e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! 
in1 2681e1051a39Sopenharmony_ci ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 2682e1051a39Sopenharmony_ci movrz $t1,$acc2,$acc0 2683e1051a39Sopenharmony_ci movrz $t1,$acc3,$acc1 2684e1051a39Sopenharmony_ci movrz $t2,$acc4,$acc0 2685e1051a39Sopenharmony_ci movrz $t2,$acc5,$acc1 2686e1051a39Sopenharmony_ci srlx $acc0,32,$acc2 2687e1051a39Sopenharmony_ci srlx $acc1,32,$acc3 2688e1051a39Sopenharmony_ci st $acc0,[$rp_real+$i] 2689e1051a39Sopenharmony_ci st $acc2,[$rp_real+$i+4] 2690e1051a39Sopenharmony_ci st $acc1,[$rp_real+$i+8] 2691e1051a39Sopenharmony_ci st $acc3,[$rp_real+$i+12] 2692e1051a39Sopenharmony_ci___ 2693e1051a39Sopenharmony_ci} 2694e1051a39Sopenharmony_ci$code.=<<___; 2695e1051a39Sopenharmony_ci.Ladd_done_vis3: 2696e1051a39Sopenharmony_ci ret 2697e1051a39Sopenharmony_ci restore 2698e1051a39Sopenharmony_ci.type ecp_nistz256_point_add_vis3,#function 2699e1051a39Sopenharmony_ci.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 2700e1051a39Sopenharmony_ci___ 2701e1051a39Sopenharmony_ci} 2702e1051a39Sopenharmony_ci######################################################################## 2703e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 2704e1051a39Sopenharmony_ci# const P256_POINT_AFFINE *in2); 2705e1051a39Sopenharmony_ci{ 2706e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z, 2707e1051a39Sopenharmony_ci $in1_x,$in1_y,$in1_z, 2708e1051a39Sopenharmony_ci $in2_x,$in2_y, 2709e1051a39Sopenharmony_ci $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 2710e1051a39Sopenharmony_cimy $Z1sqr = $S2; 2711e1051a39Sopenharmony_ci# above map() describes stack layout with 15 temporary 2712e1051a39Sopenharmony_ci# 256-bit vectors on top. Then we reserve some space for 2713e1051a39Sopenharmony_ci# !in1infty and !in2infty. 

# Emit ecp_nistz256_point_add_affine_vis3.
#
# Roles as established by the code below: $rp is copied to $rp_real and the
# result is stored through it; in1 is read through $ap (x, y, z at byte
# offsets 0, 32, 64) and in2 through $bp (x, y at byte offsets 0, 32).
# Inputs are read as 32-bit words and packed pairwise (low word first) into
# 64-bit limbs that live in the stack frame at %sp+LOCALS64+<slot>.
#
# Two flags are kept just below %fp:
#   [%fp+STACK_BIAS-8]   !in2infty: -1 if any limb of in2_x or in2_y is
#                        non-zero, otherwise 0
#   [%fp+STACK_BIAS-16]  !in1infty: -1 if any limb of in1_z is non-zero,
#                        otherwise 0
#
# The instruction after every `call` sits in the call's delay slot (it sets
# up $rp), and several operand loads are hoisted ahead of the call that
# precedes their consumer ("forward load"), so statement order matters.
$code.=<<___;
.align	32
ecp_nistz256_point_add_affine_vis3:
	save	%sp,-STACK64_FRAME-32*15-32,%sp

	mov	$rp,$rp_real
	mov	-1,$minus1
	mov	-2,$poly3
	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE

	! convert input to uint64_t[4]
	ld	[$bp],$a0			! in2_x
	ld	[$bp+4],$t0
	ld	[$bp+8],$a1
	ld	[$bp+12],$t1
	ld	[$bp+16],$a2
	ld	[$bp+20],$t2
	ld	[$bp+24],$a3
	ld	[$bp+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	ld	[$bp+32],$acc0			! in2_y
	or	$a0,$t0,$a0
	ld	[$bp+32+4],$t0
	sllx	$t2,32,$t2
	ld	[$bp+32+8],$acc1
	or	$a1,$t1,$a1
	ld	[$bp+32+12],$t1
	sllx	$t3,32,$t3
	ld	[$bp+32+16],$acc2
	or	$a2,$t2,$a2
	ld	[$bp+32+20],$t2
	or	$a3,$t3,$a3
	ld	[$bp+32+24],$acc3
	sllx	$t0,32,$t0
	ld	[$bp+32+28],$t3
	sllx	$t1,32,$t1
	stx	$a0,[%sp+LOCALS64+$in2_x]
	sllx	$t2,32,$t2
	stx	$a1,[%sp+LOCALS64+$in2_x+8]
	sllx	$t3,32,$t3
	stx	$a2,[%sp+LOCALS64+$in2_x+16]
	or	$acc0,$t0,$acc0
	stx	$a3,[%sp+LOCALS64+$in2_x+24]
	or	$acc1,$t1,$acc1
	stx	$acc0,[%sp+LOCALS64+$in2_y]
	or	$acc2,$t2,$acc2
	stx	$acc1,[%sp+LOCALS64+$in2_y+8]
	or	$acc3,$t3,$acc3
	stx	$acc2,[%sp+LOCALS64+$in2_y+16]
	stx	$acc3,[%sp+LOCALS64+$in2_y+24]

	or	$a1,$a0,$a0
	or	$a3,$a2,$a2
	or	$acc1,$acc0,$acc0
	or	$acc3,$acc2,$acc2
	or	$a2,$a0,$a0
	or	$acc2,$acc0,$acc0
	or	$acc0,$a0,$a0
	movrnz	$a0,-1,$a0			! !in2infty
	stx	$a0,[%fp+STACK_BIAS-8]

	ld	[$ap],$a0			! in1_x
	ld	[$ap+4],$t0
	ld	[$ap+8],$a1
	ld	[$ap+12],$t1
	ld	[$ap+16],$a2
	ld	[$ap+20],$t2
	ld	[$ap+24],$a3
	ld	[$ap+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	ld	[$ap+32],$acc0			! in1_y
	or	$a0,$t0,$a0
	ld	[$ap+32+4],$t0
	sllx	$t2,32,$t2
	ld	[$ap+32+8],$acc1
	or	$a1,$t1,$a1
	ld	[$ap+32+12],$t1
	sllx	$t3,32,$t3
	ld	[$ap+32+16],$acc2
	or	$a2,$t2,$a2
	ld	[$ap+32+20],$t2
	or	$a3,$t3,$a3
	ld	[$ap+32+24],$acc3
	sllx	$t0,32,$t0
	ld	[$ap+32+28],$t3
	sllx	$t1,32,$t1
	stx	$a0,[%sp+LOCALS64+$in1_x]
	sllx	$t2,32,$t2
	stx	$a1,[%sp+LOCALS64+$in1_x+8]
	sllx	$t3,32,$t3
	stx	$a2,[%sp+LOCALS64+$in1_x+16]
	or	$acc0,$t0,$acc0
	stx	$a3,[%sp+LOCALS64+$in1_x+24]
	or	$acc1,$t1,$acc1
	stx	$acc0,[%sp+LOCALS64+$in1_y]
	or	$acc2,$t2,$acc2
	stx	$acc1,[%sp+LOCALS64+$in1_y+8]
	or	$acc3,$t3,$acc3
	stx	$acc2,[%sp+LOCALS64+$in1_y+16]
	stx	$acc3,[%sp+LOCALS64+$in1_y+24]

	ld	[$ap+64],$a0			! in1_z
	ld	[$ap+64+4],$t0
	ld	[$ap+64+8],$a1
	ld	[$ap+64+12],$t1
	ld	[$ap+64+16],$a2
	ld	[$ap+64+20],$t2
	ld	[$ap+64+24],$a3
	ld	[$ap+64+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	or	$a0,$t0,$a0
	sllx	$t2,32,$t2
	or	$a1,$t1,$a1
	sllx	$t3,32,$t3
	stx	$a0,[%sp+LOCALS64+$in1_z]
	or	$a2,$t2,$a2
	stx	$a1,[%sp+LOCALS64+$in1_z+8]
	or	$a3,$t3,$a3
	stx	$a2,[%sp+LOCALS64+$in1_z+16]
	stx	$a3,[%sp+LOCALS64+$in1_z+24]

	or	$a1,$a0,$t0
	or	$a3,$a2,$t2
	or	$t2,$t0,$t0
	movrnz	$t0,-1,$t0			! !in1infty
	stx	$t0,[%fp+STACK_BIAS-16]

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
	add	%sp,LOCALS64+$Z1sqr,$rp

	ldx	[%sp+LOCALS64+$in2_x],$bi
	mov	$acc0,$a0
	mov	$acc1,$a1
	mov	$acc2,$a2
	mov	$acc3,$a3
	add	%sp,LOCALS64+$in2_x,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, Z1sqr, in2_x);
	add	%sp,LOCALS64+$U2,$rp

	ldx	[%sp+LOCALS64+$Z1sqr],$bi	! forward load
	ldx	[%sp+LOCALS64+$in1_z],$a0
	ldx	[%sp+LOCALS64+$in1_z+8],$a1
	ldx	[%sp+LOCALS64+$in1_z+16],$a2
	ldx	[%sp+LOCALS64+$in1_z+24],$a3

	add	%sp,LOCALS64+$in1_x,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, in1_x);
	add	%sp,LOCALS64+$H,$rp

	add	%sp,LOCALS64+$Z1sqr,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$H],$bi
	ldx	[%sp+LOCALS64+$in1_z],$a0
	ldx	[%sp+LOCALS64+$in1_z+8],$a1
	ldx	[%sp+LOCALS64+$in1_z+16],$a2
	ldx	[%sp+LOCALS64+$in1_z+24],$a3
	add	%sp,LOCALS64+$H,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
	add	%sp,LOCALS64+$res_z,$rp

	ldx	[%sp+LOCALS64+$S2],$bi
	ldx	[%sp+LOCALS64+$in2_y],$a0
	ldx	[%sp+LOCALS64+$in2_y+8],$a1
	ldx	[%sp+LOCALS64+$in2_y+16],$a2
	ldx	[%sp+LOCALS64+$in2_y+24],$a3
	add	%sp,LOCALS64+$S2,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$H],$a0		! forward load
	ldx	[%sp+LOCALS64+$H+8],$a1
	ldx	[%sp+LOCALS64+$H+16],$a2
	ldx	[%sp+LOCALS64+$H+24],$a3

	add	%sp,LOCALS64+$in1_y,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, in1_y);
	add	%sp,LOCALS64+$R,$rp

	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
	add	%sp,LOCALS64+$Hsqr,$rp

	ldx	[%sp+LOCALS64+$R],$a0
	ldx	[%sp+LOCALS64+$R+8],$a1
	ldx	[%sp+LOCALS64+$R+16],$a2
	ldx	[%sp+LOCALS64+$R+24],$a3
	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
	add	%sp,LOCALS64+$Rsqr,$rp

	ldx	[%sp+LOCALS64+$H],$bi
	ldx	[%sp+LOCALS64+$Hsqr],$a0
	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
	add	%sp,LOCALS64+$H,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
	add	%sp,LOCALS64+$Hcub,$rp

	ldx	[%sp+LOCALS64+$Hsqr],$bi
	ldx	[%sp+LOCALS64+$in1_x],$a0
	ldx	[%sp+LOCALS64+$in1_x+8],$a1
	ldx	[%sp+LOCALS64+$in1_x+16],$a2
	ldx	[%sp+LOCALS64+$in1_x+24],$a3
	add	%sp,LOCALS64+$Hsqr,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in1_x, Hsqr);
	add	%sp,LOCALS64+$U2,$rp

	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
	add	%sp,LOCALS64+$Hsqr,$rp

	add	%sp,LOCALS64+$Rsqr,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
	add	%sp,LOCALS64+$res_x,$rp

	add	%sp,LOCALS64+$Hcub,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_x, res_x, Hcub);
	add	%sp,LOCALS64+$res_x,$rp

	ldx	[%sp+LOCALS64+$Hcub],$bi	! forward load
	ldx	[%sp+LOCALS64+$in1_y],$a0
	ldx	[%sp+LOCALS64+$in1_y+8],$a1
	ldx	[%sp+LOCALS64+$in1_y+16],$a2
	ldx	[%sp+LOCALS64+$in1_y+24],$a3

	add	%sp,LOCALS64+$U2,$bp
	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
	add	%sp,LOCALS64+$res_y,$rp

	add	%sp,LOCALS64+$Hcub,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, in1_y, Hcub);
	add	%sp,LOCALS64+$S2,$rp

	ldx	[%sp+LOCALS64+$R],$bi
	ldx	[%sp+LOCALS64+$res_y],$a0
	ldx	[%sp+LOCALS64+$res_y+8],$a1
	ldx	[%sp+LOCALS64+$res_y+16],$a2
	ldx	[%sp+LOCALS64+$res_y+24],$a3
	add	%sp,LOCALS64+$R,$bp
	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
	add	%sp,LOCALS64+$res_y,$rp

	add	%sp,LOCALS64+$S2,$bp
	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
	add	%sp,LOCALS64+$res_y,$rp

	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
1:	call	.+8
	add	%o7,.Lone_mont_vis3-1b,$bp
___
# Select the x and y coordinates (result bytes 0..63), 16 bytes per pass:
# start from res, take in2 where the !in1infty flag ($t1) is zero, then take
# in1 where the !in2infty flag ($t2) is zero, and store the outcome as
# 32-bit words through $rp_real.
for($i=0;$i<64;$i+=16) {			# conditional moves
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
	ldx	[%sp+LOCALS64+$in2_x+$i],$acc2	! in2
	ldx	[%sp+LOCALS64+$in2_x+$i+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$i]
	st	$acc2,[$rp_real+$i+4]
	st	$acc1,[$rp_real+$i+8]
	st	$acc3,[$rp_real+$i+12]
___
}
# Same selection for the z coordinate (result bytes 64..95); $i deliberately
# continues from the loop above. No z coordinate is loaded for in2, so the
# "in2" operand is read from .Lone_mont_vis3, whose address was formed in $bp
# by the PC-relative `call .+8` / `add %o7,...` pair emitted above.
for(;$i<96;$i+=16) {
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
	ldx	[$bp+$i-64],$acc2		! "in2"
	ldx	[$bp+$i-64+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$i]
	st	$acc2,[$rp_real+$i+4]
	st	$acc1,[$rp_real+$i+8]
	st	$acc3,[$rp_real+$i+12]
___
}
# Function epilogue followed by the constant block referenced above.
# NOTE(review): the four 64-bit values at .Lone_mont_vis3 look like the
# value 1 in Montgomery representation, as the label suggests -- confirm
# against the C implementation.
$code.=<<___;
	ret
	restore
.type	ecp_nistz256_point_add_affine_vis3,#function
.size	ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
.align	64
.Lone_mont_vis3:
.long	0x00000000,0x00000001, 0xffffffff,0x00000000
.long	0xffffffff,0xffffffff, 0x00000000,0xfffffffe
.align	64
___
}								}}}

# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# to let the programmer detect at run-time whether the CPU is VIS capable.
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encode one three-register VIS3 instruction as a raw `.word`, so the
# generated assembly does not depend on the assembler knowing the mnemonic.
#
#   $mnemonic       instruction name; only "addxc", "addxccc" and "umulxhi"
#                   are encoded
#   $rs1,$rs2,$rd   operands written as %g0-7, %o0-7, %l0-7 or %i0-7
#
# Returns ".word\t0x<8 hex digits> !<original text>" when the mnemonic is in
# the table and all three operands are plain integer registers. Otherwise
# the instruction text is returned unchanged ("$mnemonic\t$rs1,$rs2,$rd"),
# which leaves the decision to the assembler.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# First register number of each bank: %g -> 0, %o -> 8, %l -> 16, %i -> 24.
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    # Keep the readable form: it is both the fallback result and the
    # trailing comment attached to the encoded word.
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref if (!$opf);

    # Turn each operand into its 5-bit register number in place. The pattern
    # is anchored and limited to 0-7 so that a malformed operand such as
    # %g8 or %g10 is passed through untouched instead of being silently
    # encoded as a register from a neighbouring bank.
    foreach ($rs1,$rs2,$rd) {
	return $ref if (!/\A%([goli])([0-7])\z/);
	$_=$bias{$1}+$2;
    }

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
		   $ref;
}

# Post-process the accumulated assembly line by line and write it out.
foreach (split("\n",$code)) {
	# Evaluate Perl expressions embedded in the text between backticks.
	s/\`([^\`]*)\`/eval $1/ge;

	# Replace VIS3 mnemonics by their explicit encoding. "addxcc" matches
	# the pattern but is not in unvis3's table, so it is emitted as is.
	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# Buffered write errors only surface at close, hence the explicit check.
close STDOUT or die "error closing STDOUT: $!";