#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter.
# It won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

# September 2016
#
# Add multiplication procedure operating on lengths divisible by 4
# and squaring procedure operating on lengths divisible by 8. Length
# is expressed in number of limbs. RSA private key operations are
# ~35-50% faster (more for longer keys) on contemporary high-end POWER
# processors in 64-bit builds, [mysteriously enough] more in 32-bit
# builds. On low-end 32-bit processors performance improvement turned
# out to be marginal...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select word size and the matching instruction mnemonics from the flavour
# string.  Everything below is written in terms of these variables, so the
# same template yields either 32-bit or 64-bit assembly.  Any flavour that
# contains neither "32" nor "64" is rejected.
if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;	# bytes per bignum limb
	$SIZE_T=4;
	$RZONE=	224;		# NOTE(review): presumably ABI reserve/red zone - confirm

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;	# bytes per bignum limb
	$SIZE_T=8;
	$RZONE=	288;		# NOTE(review): presumably ABI reserve/red zone - confirm

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

# $FRAME is the fixed part of the stack allocation made by bn_mul_mont_int
# (the limb vector size in bytes is added to it at run time); $LOCALS is the
# offset from the new stack pointer at which the temporary vector tp[] starts.
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything written to STDOUT is piped through the translator,
# which receives the flavour and the output file name as its arguments.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Registers holding the stack pointer, TOC pointer and the six arguments
# (rp, ap, bp, np, n0, num) as the generated routines expect them on entry.
$sp="r1";
$toc="r2";
$rp="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";

# --------------------------------------------------------------------
# bn_mul_mont_int: word-by-word Montgomery multiplication.
#
# Generated code, in order:
#   * moves the result pointer from r3 to r9 and presets r3 (the return
#     value) to 0; in 32-bit builds it returns immediately with that 0
#     when num >= 32;
#   * allocates a stack frame of num limbs plus $FRAME bytes, rounded
#     with a -4096 mask, and saves r20-r31 below the caller's stack pointer;
#   * runs the first pass (L1st, using bp[0]) and then the outer/inner
#     loops (Louter/Linner) over the remaining bp[i], keeping the running
#     result in tp[] on the stack and the topmost overflow bit in $ovf;
#   * subtracts np[] from tp[] into rp[] (Lsub), then uses the resulting
#     borrow as a mask to pick tp[] or rp[] per limb while zeroing tp[]
#     (Lcopy);
#   * restores r20-r31 and the stack pointer and returns 1 in r3.
#
# Inside this block $rp is re-declared as r9, while $ovf takes over r3
# (the outer $rp).
# --------------------------------------------------------------------
{
my $ovf=$rp;
my $rp="r9";	# $rp is reassigned
my $aj="r10";
my $nj="r11";
my $tj="r12";
# non-volatile registers
my $i="r20";
my $j="r21";
my $tp="r22";
my $m0="r23";
my $m1="r24";
my $lo0="r25";
my $hi0="r26";
my $lo1="r27";
my $hi1="r28";
my $alo="r29";
my $ahi="r30";
my $nlo="r31";
#
my $nhi="r0";

$code=<<___;
.machine	"any"
.text

.globl	.bn_mul_mont_int
.align	5
.bn_mul_mont_int:
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
___
# 32-bit builds only: bail out (r3 is still 0) for num >= 32.
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit

.align	4
Lcopy:				; conditional copy
	$LDX	$tj,$tp,$j
	$LDX	$aj,$rp,$j
	and	$tj,$tj,$ovf
	andc	$aj,$aj,$ovf
	$STX	$j,$tp,$j	; zap at once
	or	$aj,$aj,$tj
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int
___
}
if (1) {
# Register names for the 4x routine: 22 names mapped onto r9-r12 and r14-r31.
# $carry reuses the register of the file-level $rp (r3), $zero is r0.
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
my ($carry,$zero) = ($rp,"r0");

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
# +8*size_t	+-------------------------------+
#		| 4 "n0*t0"			|
#		.				.
#		.				.
372e1051a39Sopenharmony_ci# +12*size_t +-------------------------------+ 373e1051a39Sopenharmony_ci# | size_t tmp[num] | 374e1051a39Sopenharmony_ci# . . 375e1051a39Sopenharmony_ci# . . 376e1051a39Sopenharmony_ci# . . 377e1051a39Sopenharmony_ci# +-------------------------------+ 378e1051a39Sopenharmony_ci# | topmost carry | 379e1051a39Sopenharmony_ci# . . 380e1051a39Sopenharmony_ci# -18*size_t +-------------------------------+ 381e1051a39Sopenharmony_ci# | 18 saved gpr, r14-r31 | 382e1051a39Sopenharmony_ci# . . 383e1051a39Sopenharmony_ci# . . 384e1051a39Sopenharmony_ci# +-------------------------------+ 385e1051a39Sopenharmony_ci$code.=<<___; 386e1051a39Sopenharmony_ci.globl .bn_mul4x_mont_int 387e1051a39Sopenharmony_ci.align 5 388e1051a39Sopenharmony_ci.bn_mul4x_mont_int: 389e1051a39Sopenharmony_ci andi. r0,$num,7 390e1051a39Sopenharmony_ci bne .Lmul4x_do 391e1051a39Sopenharmony_ci $UCMP $ap,$bp 392e1051a39Sopenharmony_ci bne .Lmul4x_do 393e1051a39Sopenharmony_ci b .Lsqr8x_do 394e1051a39Sopenharmony_ci.Lmul4x_do: 395e1051a39Sopenharmony_ci slwi $num,$num,`log($SIZE_T)/log(2)` 396e1051a39Sopenharmony_ci mr $a0,$sp 397e1051a39Sopenharmony_ci li $a1,-32*$SIZE_T 398e1051a39Sopenharmony_ci sub $a1,$a1,$num 399e1051a39Sopenharmony_ci $STUX $sp,$sp,$a1 # alloca 400e1051a39Sopenharmony_ci 401e1051a39Sopenharmony_ci $PUSH r14,-$SIZE_T*18($a0) 402e1051a39Sopenharmony_ci $PUSH r15,-$SIZE_T*17($a0) 403e1051a39Sopenharmony_ci $PUSH r16,-$SIZE_T*16($a0) 404e1051a39Sopenharmony_ci $PUSH r17,-$SIZE_T*15($a0) 405e1051a39Sopenharmony_ci $PUSH r18,-$SIZE_T*14($a0) 406e1051a39Sopenharmony_ci $PUSH r19,-$SIZE_T*13($a0) 407e1051a39Sopenharmony_ci $PUSH r20,-$SIZE_T*12($a0) 408e1051a39Sopenharmony_ci $PUSH r21,-$SIZE_T*11($a0) 409e1051a39Sopenharmony_ci $PUSH r22,-$SIZE_T*10($a0) 410e1051a39Sopenharmony_ci $PUSH r23,-$SIZE_T*9($a0) 411e1051a39Sopenharmony_ci $PUSH r24,-$SIZE_T*8($a0) 412e1051a39Sopenharmony_ci $PUSH r25,-$SIZE_T*7($a0) 413e1051a39Sopenharmony_ci $PUSH r26,-$SIZE_T*6($a0) 
414e1051a39Sopenharmony_ci $PUSH r27,-$SIZE_T*5($a0) 415e1051a39Sopenharmony_ci $PUSH r28,-$SIZE_T*4($a0) 416e1051a39Sopenharmony_ci $PUSH r29,-$SIZE_T*3($a0) 417e1051a39Sopenharmony_ci $PUSH r30,-$SIZE_T*2($a0) 418e1051a39Sopenharmony_ci $PUSH r31,-$SIZE_T*1($a0) 419e1051a39Sopenharmony_ci 420e1051a39Sopenharmony_ci subi $ap,$ap,$SIZE_T # bias by -1 421e1051a39Sopenharmony_ci subi $np,$np,$SIZE_T # bias by -1 422e1051a39Sopenharmony_ci subi $rp,$rp,$SIZE_T # bias by -1 423e1051a39Sopenharmony_ci $LD $n0,0($n0) # *n0 424e1051a39Sopenharmony_ci 425e1051a39Sopenharmony_ci add $t0,$bp,$num 426e1051a39Sopenharmony_ci add $ap_end,$ap,$num 427e1051a39Sopenharmony_ci subi $t0,$t0,$SIZE_T*4 # &b[num-4] 428e1051a39Sopenharmony_ci 429e1051a39Sopenharmony_ci $LD $bi,$SIZE_T*0($bp) # b[0] 430e1051a39Sopenharmony_ci li $acc0,0 431e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) # a[0..3] 432e1051a39Sopenharmony_ci li $acc1,0 433e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 434e1051a39Sopenharmony_ci li $acc2,0 435e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 436e1051a39Sopenharmony_ci li $acc3,0 437e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 438e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) # n[0..3] 439e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 440e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 441e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 442e1051a39Sopenharmony_ci 443e1051a39Sopenharmony_ci $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4] 444e1051a39Sopenharmony_ci $PUSH $t0,$SIZE_T*7($sp) 445e1051a39Sopenharmony_ci li $carry,0 446e1051a39Sopenharmony_ci addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 447e1051a39Sopenharmony_ci li $cnt,0 448e1051a39Sopenharmony_ci li $zero,0 449e1051a39Sopenharmony_ci b .Loop_mul4x_1st_reduction 450e1051a39Sopenharmony_ci 451e1051a39Sopenharmony_ci.align 5 452e1051a39Sopenharmony_ci.Loop_mul4x_1st_reduction: 453e1051a39Sopenharmony_ci $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0]) 454e1051a39Sopenharmony_ci addze $carry,$carry # 
modulo-scheduled 455e1051a39Sopenharmony_ci $UMULL $t1,$a1,$bi 456e1051a39Sopenharmony_ci addi $cnt,$cnt,$SIZE_T 457e1051a39Sopenharmony_ci $UMULL $t2,$a2,$bi 458e1051a39Sopenharmony_ci andi. $cnt,$cnt,$SIZE_T*4-1 459e1051a39Sopenharmony_ci $UMULL $t3,$a3,$bi 460e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 461e1051a39Sopenharmony_ci $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0]) 462e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 463e1051a39Sopenharmony_ci $UMULH $t1,$a1,$bi 464e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 465e1051a39Sopenharmony_ci $UMULL $mi,$acc0,$n0 # t[0]*n0 466e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 467e1051a39Sopenharmony_ci $UMULH $t2,$a2,$bi 468e1051a39Sopenharmony_ci addze $acc4,$zero 469e1051a39Sopenharmony_ci $UMULH $t3,$a3,$bi 470e1051a39Sopenharmony_ci $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 471e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 472e1051a39Sopenharmony_ci # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0) 473e1051a39Sopenharmony_ci $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 474e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 475e1051a39Sopenharmony_ci $UMULL $t1,$m1,$mi 476e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 477e1051a39Sopenharmony_ci $UMULL $t2,$m2,$mi 478e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 # can't overflow 479e1051a39Sopenharmony_ci $UMULL $t3,$m3,$mi 480e1051a39Sopenharmony_ci # (*) addc $acc0,$acc0,$t0 481e1051a39Sopenharmony_ci # (*) As for removal of first multiplication and addition 482e1051a39Sopenharmony_ci # instructions. The outcome of first addition is 483e1051a39Sopenharmony_ci # guaranteed to be zero, which leaves two computationally 484e1051a39Sopenharmony_ci # significant outcomes: it either carries or not. Then 485e1051a39Sopenharmony_ci # question is when does it carry? Is there alternative 486e1051a39Sopenharmony_ci # way to deduce it? 
If you follow operations, you can 487e1051a39Sopenharmony_ci # observe that condition for carry is quite simple: 488e1051a39Sopenharmony_ci # $acc0 being non-zero. So that carry can be calculated 489e1051a39Sopenharmony_ci # by adding -1 to $acc0. That's what next instruction does. 490e1051a39Sopenharmony_ci addic $acc0,$acc0,-1 # (*), discarded 491e1051a39Sopenharmony_ci $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0) 492e1051a39Sopenharmony_ci adde $acc0,$acc1,$t1 493e1051a39Sopenharmony_ci $UMULH $t1,$m1,$mi 494e1051a39Sopenharmony_ci adde $acc1,$acc2,$t2 495e1051a39Sopenharmony_ci $UMULH $t2,$m2,$mi 496e1051a39Sopenharmony_ci adde $acc2,$acc3,$t3 497e1051a39Sopenharmony_ci $UMULH $t3,$m3,$mi 498e1051a39Sopenharmony_ci adde $acc3,$acc4,$carry 499e1051a39Sopenharmony_ci addze $carry,$zero 500e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 501e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 502e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 503e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 504e1051a39Sopenharmony_ci #addze $carry,$carry 505e1051a39Sopenharmony_ci bne .Loop_mul4x_1st_reduction 506e1051a39Sopenharmony_ci 507e1051a39Sopenharmony_ci $UCMP $ap_end,$ap 508e1051a39Sopenharmony_ci beq .Lmul4x4_post_condition 509e1051a39Sopenharmony_ci 510e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) # a[4..7] 511e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 512e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 513e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 514e1051a39Sopenharmony_ci $LD $mi,$SIZE_T*8($sp) # a[0]*n0 515e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) # n[4..7] 516e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 517e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 518e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 519e1051a39Sopenharmony_ci b .Loop_mul4x_1st_tail 520e1051a39Sopenharmony_ci 521e1051a39Sopenharmony_ci.align 5 522e1051a39Sopenharmony_ci.Loop_mul4x_1st_tail: 523e1051a39Sopenharmony_ci $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i]) 524e1051a39Sopenharmony_ci addze $carry,$carry 
# modulo-scheduled 525e1051a39Sopenharmony_ci $UMULL $t1,$a1,$bi 526e1051a39Sopenharmony_ci addi $cnt,$cnt,$SIZE_T 527e1051a39Sopenharmony_ci $UMULL $t2,$a2,$bi 528e1051a39Sopenharmony_ci andi. $cnt,$cnt,$SIZE_T*4-1 529e1051a39Sopenharmony_ci $UMULL $t3,$a3,$bi 530e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 531e1051a39Sopenharmony_ci $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i]) 532e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 533e1051a39Sopenharmony_ci $UMULH $t1,$a1,$bi 534e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 535e1051a39Sopenharmony_ci $UMULH $t2,$a2,$bi 536e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 537e1051a39Sopenharmony_ci $UMULH $t3,$a3,$bi 538e1051a39Sopenharmony_ci addze $acc4,$zero 539e1051a39Sopenharmony_ci $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 540e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 541e1051a39Sopenharmony_ci $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0) 542e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 543e1051a39Sopenharmony_ci $UMULL $t1,$m1,$mi 544e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 545e1051a39Sopenharmony_ci $UMULL $t2,$m2,$mi 546e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 # can't overflow 547e1051a39Sopenharmony_ci $UMULL $t3,$m3,$mi 548e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 549e1051a39Sopenharmony_ci $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0) 550e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 551e1051a39Sopenharmony_ci $UMULH $t1,$m1,$mi 552e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 553e1051a39Sopenharmony_ci $UMULH $t2,$m2,$mi 554e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 555e1051a39Sopenharmony_ci adde $acc4,$acc4,$carry 556e1051a39Sopenharmony_ci $UMULH $t3,$m3,$mi 557e1051a39Sopenharmony_ci addze $carry,$zero 558e1051a39Sopenharmony_ci addi $mi,$sp,$SIZE_T*8 559e1051a39Sopenharmony_ci $LDX $mi,$mi,$cnt # next t[0]*n0 560e1051a39Sopenharmony_ci $STU $acc0,$SIZE_T($tp) # word of result 561e1051a39Sopenharmony_ci addc $acc0,$acc1,$t0 562e1051a39Sopenharmony_ci adde $acc1,$acc2,$t1 563e1051a39Sopenharmony_ci adde 
$acc2,$acc3,$t2 564e1051a39Sopenharmony_ci adde $acc3,$acc4,$t3 565e1051a39Sopenharmony_ci #addze $carry,$carry 566e1051a39Sopenharmony_ci bne .Loop_mul4x_1st_tail 567e1051a39Sopenharmony_ci 568e1051a39Sopenharmony_ci sub $t1,$ap_end,$num # rewinded $ap 569e1051a39Sopenharmony_ci $UCMP $ap_end,$ap # done yet? 570e1051a39Sopenharmony_ci beq .Lmul4x_proceed 571e1051a39Sopenharmony_ci 572e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) 573e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 574e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 575e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 576e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) 577e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 578e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 579e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 580e1051a39Sopenharmony_ci b .Loop_mul4x_1st_tail 581e1051a39Sopenharmony_ci 582e1051a39Sopenharmony_ci.align 5 583e1051a39Sopenharmony_ci.Lmul4x_proceed: 584e1051a39Sopenharmony_ci $LDU $bi,$SIZE_T*4($bp) # *++b 585e1051a39Sopenharmony_ci addze $carry,$carry # topmost carry 586e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($t1) 587e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($t1) 588e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($t1) 589e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($t1) 590e1051a39Sopenharmony_ci addi $ap,$t1,$SIZE_T*4 591e1051a39Sopenharmony_ci sub $np,$np,$num # rewind np 592e1051a39Sopenharmony_ci 593e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) # result 594e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) 595e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) 596e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) 597e1051a39Sopenharmony_ci $ST $carry,$SIZE_T*5($tp) # save topmost carry 598e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*12($sp) # t[0..3] 599e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*13($sp) 600e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*14($sp) 601e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*15($sp) 602e1051a39Sopenharmony_ci 603e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) # n[0..3] 
604e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 605e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 606e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 607e1051a39Sopenharmony_ci addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 608e1051a39Sopenharmony_ci li $carry,0 609e1051a39Sopenharmony_ci b .Loop_mul4x_reduction 610e1051a39Sopenharmony_ci 611e1051a39Sopenharmony_ci.align 5 612e1051a39Sopenharmony_ci.Loop_mul4x_reduction: 613e1051a39Sopenharmony_ci $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4]) 614e1051a39Sopenharmony_ci addze $carry,$carry # modulo-scheduled 615e1051a39Sopenharmony_ci $UMULL $t1,$a1,$bi 616e1051a39Sopenharmony_ci addi $cnt,$cnt,$SIZE_T 617e1051a39Sopenharmony_ci $UMULL $t2,$a2,$bi 618e1051a39Sopenharmony_ci andi. $cnt,$cnt,$SIZE_T*4-1 619e1051a39Sopenharmony_ci $UMULL $t3,$a3,$bi 620e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 621e1051a39Sopenharmony_ci $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4]) 622e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 623e1051a39Sopenharmony_ci $UMULH $t1,$a1,$bi 624e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 625e1051a39Sopenharmony_ci $UMULL $mi,$acc0,$n0 # t[0]*n0 626e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 627e1051a39Sopenharmony_ci $UMULH $t2,$a2,$bi 628e1051a39Sopenharmony_ci addze $acc4,$zero 629e1051a39Sopenharmony_ci $UMULH $t3,$a3,$bi 630e1051a39Sopenharmony_ci $LDX $bi,$bp,$cnt # next b[i] 631e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 632e1051a39Sopenharmony_ci # (*) mul $t0,$m0,$mi 633e1051a39Sopenharmony_ci $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 634e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 635e1051a39Sopenharmony_ci $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0 636e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 637e1051a39Sopenharmony_ci $UMULL $t2,$m2,$mi 638e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 # can't overflow 639e1051a39Sopenharmony_ci $UMULL $t3,$m3,$mi 640e1051a39Sopenharmony_ci # (*) addc $acc0,$acc0,$t0 641e1051a39Sopenharmony_ci addic $acc0,$acc0,-1 # (*), discarded 
642e1051a39Sopenharmony_ci $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0) 643e1051a39Sopenharmony_ci adde $acc0,$acc1,$t1 644e1051a39Sopenharmony_ci $UMULH $t1,$m1,$mi 645e1051a39Sopenharmony_ci adde $acc1,$acc2,$t2 646e1051a39Sopenharmony_ci $UMULH $t2,$m2,$mi 647e1051a39Sopenharmony_ci adde $acc2,$acc3,$t3 648e1051a39Sopenharmony_ci $UMULH $t3,$m3,$mi 649e1051a39Sopenharmony_ci adde $acc3,$acc4,$carry 650e1051a39Sopenharmony_ci addze $carry,$zero 651e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 652e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 653e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 654e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 655e1051a39Sopenharmony_ci #addze $carry,$carry 656e1051a39Sopenharmony_ci bne .Loop_mul4x_reduction 657e1051a39Sopenharmony_ci 658e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*5($tp) # t[4..7] 659e1051a39Sopenharmony_ci addze $carry,$carry 660e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*6($tp) 661e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*7($tp) 662e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*8($tp) 663e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) # a[4..7] 664e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 665e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 666e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 667e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 668e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 669e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 670e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 671e1051a39Sopenharmony_ci #addze $carry,$carry 672e1051a39Sopenharmony_ci 673e1051a39Sopenharmony_ci $LD $mi,$SIZE_T*8($sp) # first t[0]*n0 put aside by the reduction loop 674e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) # n[4..7] 675e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 676e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 677e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 678e1051a39Sopenharmony_ci b .Loop_mul4x_tail 679e1051a39Sopenharmony_ci 680e1051a39Sopenharmony_ci.align 5 681e1051a39Sopenharmony_ci.Loop_mul4x_tail: 682e1051a39Sopenharmony_ci $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4]) 
683e1051a39Sopenharmony_ci addze $carry,$carry # modulo-scheduled 684e1051a39Sopenharmony_ci $UMULL $t1,$a1,$bi 685e1051a39Sopenharmony_ci addi $cnt,$cnt,$SIZE_T 686e1051a39Sopenharmony_ci $UMULL $t2,$a2,$bi 687e1051a39Sopenharmony_ci andi. $cnt,$cnt,$SIZE_T*4-1 688e1051a39Sopenharmony_ci $UMULL $t3,$a3,$bi 689e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 690e1051a39Sopenharmony_ci $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4]) 691e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 692e1051a39Sopenharmony_ci $UMULH $t1,$a1,$bi 693e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 694e1051a39Sopenharmony_ci $UMULH $t2,$a2,$bi 695e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 696e1051a39Sopenharmony_ci $UMULH $t3,$a3,$bi 697e1051a39Sopenharmony_ci addze $acc4,$zero 698e1051a39Sopenharmony_ci $LDX $bi,$bp,$cnt # next b[i] 699e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 700e1051a39Sopenharmony_ci $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0) 701e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 702e1051a39Sopenharmony_ci $UMULL $t1,$m1,$mi 703e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 704e1051a39Sopenharmony_ci $UMULL $t2,$m2,$mi 705e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 # can't overflow 706e1051a39Sopenharmony_ci $UMULL $t3,$m3,$mi 707e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 708e1051a39Sopenharmony_ci $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0) 709e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 710e1051a39Sopenharmony_ci $UMULH $t1,$m1,$mi 711e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 712e1051a39Sopenharmony_ci $UMULH $t2,$m2,$mi 713e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 714e1051a39Sopenharmony_ci $UMULH $t3,$m3,$mi 715e1051a39Sopenharmony_ci adde $acc4,$acc4,$carry 716e1051a39Sopenharmony_ci addi $mi,$sp,$SIZE_T*8 717e1051a39Sopenharmony_ci $LDX $mi,$mi,$cnt # next t[0]*n0 (as put aside by the reduction loop) 718e1051a39Sopenharmony_ci addze $carry,$zero 719e1051a39Sopenharmony_ci $STU $acc0,$SIZE_T($tp) # word of result 720e1051a39Sopenharmony_ci addc $acc0,$acc1,$t0 721e1051a39Sopenharmony_ci adde $acc1,$acc2,$t1 
722e1051a39Sopenharmony_ci adde $acc2,$acc3,$t2 723e1051a39Sopenharmony_ci adde $acc3,$acc4,$t3 724e1051a39Sopenharmony_ci #addze $carry,$carry 725e1051a39Sopenharmony_ci bne .Loop_mul4x_tail 726e1051a39Sopenharmony_ci 727e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry 728e1051a39Sopenharmony_ci sub $t1,$np,$num # rewinded np? 729e1051a39Sopenharmony_ci addze $carry,$carry 730e1051a39Sopenharmony_ci $UCMP $ap_end,$ap # done yet? 731e1051a39Sopenharmony_ci beq .Loop_mul4x_break 732e1051a39Sopenharmony_ci 733e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*6($tp) 734e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*7($tp) 735e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*8($tp) 736e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) 737e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 738e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 739e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 740e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 741e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 742e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 743e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 744e1051a39Sopenharmony_ci #addze $carry,$carry 745e1051a39Sopenharmony_ci 746e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) # n[4..7] 747e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 748e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 749e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 750e1051a39Sopenharmony_ci b .Loop_mul4x_tail 751e1051a39Sopenharmony_ci 752e1051a39Sopenharmony_ci.align 5 753e1051a39Sopenharmony_ci.Loop_mul4x_break: 754e1051a39Sopenharmony_ci $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4] 755e1051a39Sopenharmony_ci $POP $t3,$SIZE_T*7($sp) 756e1051a39Sopenharmony_ci addc $a0,$acc0,$t0 # accumulate topmost carry 757e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*12($sp) # t[0..3] 758e1051a39Sopenharmony_ci addze $a1,$acc1 759e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*13($sp) 760e1051a39Sopenharmony_ci addze $a2,$acc2 761e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*14($sp) 762e1051a39Sopenharmony_ci 
addze $a3,$acc3 763e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*15($sp) 764e1051a39Sopenharmony_ci addze $carry,$carry # topmost carry 765e1051a39Sopenharmony_ci $ST $a0,$SIZE_T*1($tp) # result 766e1051a39Sopenharmony_ci sub $ap,$ap_end,$num # rewind ap 767e1051a39Sopenharmony_ci $ST $a1,$SIZE_T*2($tp) 768e1051a39Sopenharmony_ci $ST $a2,$SIZE_T*3($tp) 769e1051a39Sopenharmony_ci $ST $a3,$SIZE_T*4($tp) 770e1051a39Sopenharmony_ci $ST $carry,$SIZE_T*5($tp) # store topmost carry 771e1051a39Sopenharmony_ci 772e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($t1) # n[0..3] 773e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($t1) 774e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($t1) 775e1051a39Sopenharmony_ci $LD $m3,$SIZE_T*4($t1) 776e1051a39Sopenharmony_ci addi $np,$t1,$SIZE_T*4 777e1051a39Sopenharmony_ci $UCMP $bp,$t3 # done yet? 778e1051a39Sopenharmony_ci beq .Lmul4x_post 779e1051a39Sopenharmony_ci 780e1051a39Sopenharmony_ci $LDU $bi,$SIZE_T*4($bp) 781e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) # a[0..3] 782e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 783e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 784e1051a39Sopenharmony_ci $LDU $a3,$SIZE_T*4($ap) 785e1051a39Sopenharmony_ci li $carry,0 786e1051a39Sopenharmony_ci addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 787e1051a39Sopenharmony_ci b .Loop_mul4x_reduction 788e1051a39Sopenharmony_ci 789e1051a39Sopenharmony_ci.align 5 790e1051a39Sopenharmony_ci.Lmul4x_post: 791e1051a39Sopenharmony_ci # Final step. We see if result is larger than modulus, and 792e1051a39Sopenharmony_ci # if it is, subtract the modulus. But comparison implies 793e1051a39Sopenharmony_ci # subtraction. So we subtract modulus, see if it borrowed, 794e1051a39Sopenharmony_ci # and conditionally copy original value. 
795e1051a39Sopenharmony_ci srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 796e1051a39Sopenharmony_ci mr $bp,$t2 # &rp[-1] 797e1051a39Sopenharmony_ci subi $cnt,$cnt,1 798e1051a39Sopenharmony_ci mr $ap_end,$t2 # &rp[-1] copy 799e1051a39Sopenharmony_ci subfc $t0,$m0,$acc0 800e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*15 801e1051a39Sopenharmony_ci subfe $t1,$m1,$acc1 802e1051a39Sopenharmony_ci 803e1051a39Sopenharmony_ci mtctr $cnt 804e1051a39Sopenharmony_ci.Lmul4x_sub: 805e1051a39Sopenharmony_ci $LD $m0,$SIZE_T*1($np) 806e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*1($tp) 807e1051a39Sopenharmony_ci subfe $t2,$m2,$acc2 808e1051a39Sopenharmony_ci $LD $m1,$SIZE_T*2($np) 809e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*2($tp) 810e1051a39Sopenharmony_ci subfe $t3,$m3,$acc3 811e1051a39Sopenharmony_ci $LD $m2,$SIZE_T*3($np) 812e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*3($tp) 813e1051a39Sopenharmony_ci $LDU $m3,$SIZE_T*4($np) 814e1051a39Sopenharmony_ci $LDU $acc3,$SIZE_T*4($tp) 815e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($bp) 816e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($bp) 817e1051a39Sopenharmony_ci subfe $t0,$m0,$acc0 818e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($bp) 819e1051a39Sopenharmony_ci $STU $t3,$SIZE_T*4($bp) 820e1051a39Sopenharmony_ci subfe $t1,$m1,$acc1 821e1051a39Sopenharmony_ci bdnz .Lmul4x_sub 822e1051a39Sopenharmony_ci 823e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap_end) 824e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($bp) 825e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*12($sp) 826e1051a39Sopenharmony_ci subfe $t2,$m2,$acc2 827e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap_end) 828e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($bp) 829e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*13($sp) 830e1051a39Sopenharmony_ci subfe $t3,$m3,$acc3 831e1051a39Sopenharmony_ci subfe $carry,$zero,$carry # did it borrow? 
832e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*12 833e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap_end) 834e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($bp) 835e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*14($sp) 836e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($ap_end) 837e1051a39Sopenharmony_ci $ST $t3,$SIZE_T*4($bp) 838e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*15($sp) 839e1051a39Sopenharmony_ci 840e1051a39Sopenharmony_ci mtctr $cnt 841e1051a39Sopenharmony_ci.Lmul4x_cond_copy: 842e1051a39Sopenharmony_ci and $t0,$t0,$carry 843e1051a39Sopenharmony_ci andc $a0,$a0,$carry 844e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*0($tp) # wipe stack clean 845e1051a39Sopenharmony_ci and $t1,$t1,$carry 846e1051a39Sopenharmony_ci andc $a1,$a1,$carry 847e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*1($tp) 848e1051a39Sopenharmony_ci and $t2,$t2,$carry 849e1051a39Sopenharmony_ci andc $a2,$a2,$carry 850e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*2($tp) 851e1051a39Sopenharmony_ci and $t3,$t3,$carry 852e1051a39Sopenharmony_ci andc $a3,$a3,$carry 853e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*3($tp) 854e1051a39Sopenharmony_ci or $acc0,$t0,$a0 855e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*5($ap_end) 856e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*4($tp) 857e1051a39Sopenharmony_ci or $acc1,$t1,$a1 858e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*6($ap_end) 859e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*5($tp) 860e1051a39Sopenharmony_ci or $acc2,$t2,$a2 861e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*7($ap_end) 862e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*6($tp) 863e1051a39Sopenharmony_ci or $acc3,$t3,$a3 864e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*8($ap_end) 865e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*7($tp) 866e1051a39Sopenharmony_ci addi $tp,$tp,$SIZE_T*4 867e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($ap_end) 868e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($ap_end) 869e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($ap_end) 870e1051a39Sopenharmony_ci $STU $acc3,$SIZE_T*4($ap_end) 871e1051a39Sopenharmony_ci bdnz .Lmul4x_cond_copy 
872e1051a39Sopenharmony_ci 873e1051a39Sopenharmony_ci $POP $bp,0($sp) # pull saved sp 874e1051a39Sopenharmony_ci and $t0,$t0,$carry 875e1051a39Sopenharmony_ci andc $a0,$a0,$carry 876e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*0($tp) 877e1051a39Sopenharmony_ci and $t1,$t1,$carry 878e1051a39Sopenharmony_ci andc $a1,$a1,$carry 879e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*1($tp) 880e1051a39Sopenharmony_ci and $t2,$t2,$carry 881e1051a39Sopenharmony_ci andc $a2,$a2,$carry 882e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*2($tp) 883e1051a39Sopenharmony_ci and $t3,$t3,$carry 884e1051a39Sopenharmony_ci andc $a3,$a3,$carry 885e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*3($tp) 886e1051a39Sopenharmony_ci or $acc0,$t0,$a0 887e1051a39Sopenharmony_ci or $acc1,$t1,$a1 888e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*4($tp) 889e1051a39Sopenharmony_ci or $acc2,$t2,$a2 890e1051a39Sopenharmony_ci or $acc3,$t3,$a3 891e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($ap_end) 892e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($ap_end) 893e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($ap_end) 894e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($ap_end) 895e1051a39Sopenharmony_ci 896e1051a39Sopenharmony_ci b .Lmul4x_done 897e1051a39Sopenharmony_ci 898e1051a39Sopenharmony_ci.align 4 899e1051a39Sopenharmony_ci.Lmul4x4_post_condition: 900e1051a39Sopenharmony_ci $POP $ap,$SIZE_T*6($sp) # pull &rp[-1] 901e1051a39Sopenharmony_ci $POP $bp,0($sp) # pull saved sp 902e1051a39Sopenharmony_ci addze $carry,$carry # modulo-scheduled 903e1051a39Sopenharmony_ci # $acc0-3,$carry hold result, $m0-3 hold modulus 904e1051a39Sopenharmony_ci subfc $a0,$m0,$acc0 905e1051a39Sopenharmony_ci subfe $a1,$m1,$acc1 906e1051a39Sopenharmony_ci subfe $a2,$m2,$acc2 907e1051a39Sopenharmony_ci subfe $a3,$m3,$acc3 908e1051a39Sopenharmony_ci subfe $carry,$zero,$carry # did it borrow? 
909e1051a39Sopenharmony_ci 910e1051a39Sopenharmony_ci and $m0,$m0,$carry 911e1051a39Sopenharmony_ci and $m1,$m1,$carry 912e1051a39Sopenharmony_ci addc $a0,$a0,$m0 913e1051a39Sopenharmony_ci and $m2,$m2,$carry 914e1051a39Sopenharmony_ci adde $a1,$a1,$m1 915e1051a39Sopenharmony_ci and $m3,$m3,$carry 916e1051a39Sopenharmony_ci adde $a2,$a2,$m2 917e1051a39Sopenharmony_ci adde $a3,$a3,$m3 918e1051a39Sopenharmony_ci 919e1051a39Sopenharmony_ci $ST $a0,$SIZE_T*1($ap) # write result 920e1051a39Sopenharmony_ci $ST $a1,$SIZE_T*2($ap) 921e1051a39Sopenharmony_ci $ST $a2,$SIZE_T*3($ap) 922e1051a39Sopenharmony_ci $ST $a3,$SIZE_T*4($ap) 923e1051a39Sopenharmony_ci 924e1051a39Sopenharmony_ci.Lmul4x_done: 925e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*8($sp) # wipe stack clean 926e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*9($sp) 927e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*10($sp) 928e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*11($sp) 929e1051a39Sopenharmony_ci li r3,1 # signal "done" 930e1051a39Sopenharmony_ci $POP r14,-$SIZE_T*18($bp) 931e1051a39Sopenharmony_ci $POP r15,-$SIZE_T*17($bp) 932e1051a39Sopenharmony_ci $POP r16,-$SIZE_T*16($bp) 933e1051a39Sopenharmony_ci $POP r17,-$SIZE_T*15($bp) 934e1051a39Sopenharmony_ci $POP r18,-$SIZE_T*14($bp) 935e1051a39Sopenharmony_ci $POP r19,-$SIZE_T*13($bp) 936e1051a39Sopenharmony_ci $POP r20,-$SIZE_T*12($bp) 937e1051a39Sopenharmony_ci $POP r21,-$SIZE_T*11($bp) 938e1051a39Sopenharmony_ci $POP r22,-$SIZE_T*10($bp) 939e1051a39Sopenharmony_ci $POP r23,-$SIZE_T*9($bp) 940e1051a39Sopenharmony_ci $POP r24,-$SIZE_T*8($bp) 941e1051a39Sopenharmony_ci $POP r25,-$SIZE_T*7($bp) 942e1051a39Sopenharmony_ci $POP r26,-$SIZE_T*6($bp) 943e1051a39Sopenharmony_ci $POP r27,-$SIZE_T*5($bp) 944e1051a39Sopenharmony_ci $POP r28,-$SIZE_T*4($bp) 945e1051a39Sopenharmony_ci $POP r29,-$SIZE_T*3($bp) 946e1051a39Sopenharmony_ci $POP r30,-$SIZE_T*2($bp) 947e1051a39Sopenharmony_ci $POP r31,-$SIZE_T*1($bp) 948e1051a39Sopenharmony_ci mr $sp,$bp 949e1051a39Sopenharmony_ci blr 
950e1051a39Sopenharmony_ci .long 0 951e1051a39Sopenharmony_ci .byte 0,12,4,0x20,0x80,18,6,0 952e1051a39Sopenharmony_ci .long 0 953e1051a39Sopenharmony_ci.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int 954e1051a39Sopenharmony_ci___ 955e1051a39Sopenharmony_ci} 956e1051a39Sopenharmony_ci 957e1051a39Sopenharmony_ciif (1) { 958e1051a39Sopenharmony_ci######################################################################## 959e1051a39Sopenharmony_ci# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module. 960e1051a39Sopenharmony_ci 961e1051a39Sopenharmony_cimy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17)); 962e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3)=map("r$_",(18..21)); 963e1051a39Sopenharmony_cimy ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29)); 964e1051a39Sopenharmony_cimy ($cnt,$carry,$zero)=("r30","r31","r0"); 965e1051a39Sopenharmony_cimy ($tp,$ap_end,$na0)=($bp,$np,$carry); 966e1051a39Sopenharmony_ci 967e1051a39Sopenharmony_ci# sp----------->+-------------------------------+ 968e1051a39Sopenharmony_ci# | saved sp | 969e1051a39Sopenharmony_ci# +-------------------------------+ 970e1051a39Sopenharmony_ci# . . 971e1051a39Sopenharmony_ci# +12*size_t +-------------------------------+ 972e1051a39Sopenharmony_ci# | size_t tmp[2*num] | 973e1051a39Sopenharmony_ci# . . 974e1051a39Sopenharmony_ci# . . 975e1051a39Sopenharmony_ci# . . 976e1051a39Sopenharmony_ci# +-------------------------------+ 977e1051a39Sopenharmony_ci# . . 978e1051a39Sopenharmony_ci# -18*size_t +-------------------------------+ 979e1051a39Sopenharmony_ci# | 18 saved gpr, r14-r31 | 980e1051a39Sopenharmony_ci# . . 981e1051a39Sopenharmony_ci# . . 
982e1051a39Sopenharmony_ci# +-------------------------------+ 983e1051a39Sopenharmony_ci$code.=<<___; 984e1051a39Sopenharmony_ci.align 5 985e1051a39Sopenharmony_ci__bn_sqr8x_mont: 986e1051a39Sopenharmony_ci.Lsqr8x_do: 987e1051a39Sopenharmony_ci mr $a0,$sp 988e1051a39Sopenharmony_ci slwi $a1,$num,`log($SIZE_T)/log(2)+1` 989e1051a39Sopenharmony_ci li $a2,-32*$SIZE_T 990e1051a39Sopenharmony_ci sub $a1,$a2,$a1 991e1051a39Sopenharmony_ci slwi $num,$num,`log($SIZE_T)/log(2)` 992e1051a39Sopenharmony_ci $STUX $sp,$sp,$a1 # alloca 993e1051a39Sopenharmony_ci 994e1051a39Sopenharmony_ci $PUSH r14,-$SIZE_T*18($a0) 995e1051a39Sopenharmony_ci $PUSH r15,-$SIZE_T*17($a0) 996e1051a39Sopenharmony_ci $PUSH r16,-$SIZE_T*16($a0) 997e1051a39Sopenharmony_ci $PUSH r17,-$SIZE_T*15($a0) 998e1051a39Sopenharmony_ci $PUSH r18,-$SIZE_T*14($a0) 999e1051a39Sopenharmony_ci $PUSH r19,-$SIZE_T*13($a0) 1000e1051a39Sopenharmony_ci $PUSH r20,-$SIZE_T*12($a0) 1001e1051a39Sopenharmony_ci $PUSH r21,-$SIZE_T*11($a0) 1002e1051a39Sopenharmony_ci $PUSH r22,-$SIZE_T*10($a0) 1003e1051a39Sopenharmony_ci $PUSH r23,-$SIZE_T*9($a0) 1004e1051a39Sopenharmony_ci $PUSH r24,-$SIZE_T*8($a0) 1005e1051a39Sopenharmony_ci $PUSH r25,-$SIZE_T*7($a0) 1006e1051a39Sopenharmony_ci $PUSH r26,-$SIZE_T*6($a0) 1007e1051a39Sopenharmony_ci $PUSH r27,-$SIZE_T*5($a0) 1008e1051a39Sopenharmony_ci $PUSH r28,-$SIZE_T*4($a0) 1009e1051a39Sopenharmony_ci $PUSH r29,-$SIZE_T*3($a0) 1010e1051a39Sopenharmony_ci $PUSH r30,-$SIZE_T*2($a0) 1011e1051a39Sopenharmony_ci $PUSH r31,-$SIZE_T*1($a0) 1012e1051a39Sopenharmony_ci 1013e1051a39Sopenharmony_ci subi $ap,$ap,$SIZE_T # bias by -1 1014e1051a39Sopenharmony_ci subi $t0,$np,$SIZE_T # bias by -1 1015e1051a39Sopenharmony_ci subi $rp,$rp,$SIZE_T # bias by -1 1016e1051a39Sopenharmony_ci $LD $n0,0($n0) # *n0 1017e1051a39Sopenharmony_ci li $zero,0 1018e1051a39Sopenharmony_ci 1019e1051a39Sopenharmony_ci add $ap_end,$ap,$num 1020e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) 1021e1051a39Sopenharmony_ci #li $acc0,0 
1022e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 1023e1051a39Sopenharmony_ci li $acc1,0 1024e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 1025e1051a39Sopenharmony_ci li $acc2,0 1026e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($ap) 1027e1051a39Sopenharmony_ci li $acc3,0 1028e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($ap) 1029e1051a39Sopenharmony_ci li $acc4,0 1030e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($ap) 1031e1051a39Sopenharmony_ci li $acc5,0 1032e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($ap) 1033e1051a39Sopenharmony_ci li $acc6,0 1034e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($ap) 1035e1051a39Sopenharmony_ci li $acc7,0 1036e1051a39Sopenharmony_ci 1037e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1038e1051a39Sopenharmony_ci subic. $cnt,$num,$SIZE_T*8 1039e1051a39Sopenharmony_ci b .Lsqr8x_zero_start 1040e1051a39Sopenharmony_ci 1041e1051a39Sopenharmony_ci.align 5 1042e1051a39Sopenharmony_ci.Lsqr8x_zero: 1043e1051a39Sopenharmony_ci subic. $cnt,$cnt,$SIZE_T*8 1044e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*1($tp) 1045e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*2($tp) 1046e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*3($tp) 1047e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*4($tp) 1048e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*5($tp) 1049e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*6($tp) 1050e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*7($tp) 1051e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*8($tp) 1052e1051a39Sopenharmony_ci.Lsqr8x_zero_start: 1053e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*9($tp) 1054e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*10($tp) 1055e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*11($tp) 1056e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*12($tp) 1057e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*13($tp) 1058e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*14($tp) 1059e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*15($tp) 1060e1051a39Sopenharmony_ci $STU $zero,$SIZE_T*16($tp) 1061e1051a39Sopenharmony_ci bne .Lsqr8x_zero 1062e1051a39Sopenharmony_ci 1063e1051a39Sopenharmony_ci $PUSH 
$rp,$SIZE_T*6($sp) # offload &rp[-1] 1064e1051a39Sopenharmony_ci $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1] 1065e1051a39Sopenharmony_ci $PUSH $n0,$SIZE_T*8($sp) # offload n0 1066e1051a39Sopenharmony_ci $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1] 1067e1051a39Sopenharmony_ci $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry 1068e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1069e1051a39Sopenharmony_ci 1070e1051a39Sopenharmony_ci # Multiply everything but a[i]*a[i] 1071e1051a39Sopenharmony_ci.align 5 1072e1051a39Sopenharmony_ci.Lsqr8x_outer_loop: 1073e1051a39Sopenharmony_ci # a[1]a[0] (i) 1074e1051a39Sopenharmony_ci # a[2]a[0] 1075e1051a39Sopenharmony_ci # a[3]a[0] 1076e1051a39Sopenharmony_ci # a[4]a[0] 1077e1051a39Sopenharmony_ci # a[5]a[0] 1078e1051a39Sopenharmony_ci # a[6]a[0] 1079e1051a39Sopenharmony_ci # a[7]a[0] 1080e1051a39Sopenharmony_ci # a[2]a[1] (ii) 1081e1051a39Sopenharmony_ci # a[3]a[1] 1082e1051a39Sopenharmony_ci # a[4]a[1] 1083e1051a39Sopenharmony_ci # a[5]a[1] 1084e1051a39Sopenharmony_ci # a[6]a[1] 1085e1051a39Sopenharmony_ci # a[7]a[1] 1086e1051a39Sopenharmony_ci # a[3]a[2] (iii) 1087e1051a39Sopenharmony_ci # a[4]a[2] 1088e1051a39Sopenharmony_ci # a[5]a[2] 1089e1051a39Sopenharmony_ci # a[6]a[2] 1090e1051a39Sopenharmony_ci # a[7]a[2] 1091e1051a39Sopenharmony_ci # a[4]a[3] (iv) 1092e1051a39Sopenharmony_ci # a[5]a[3] 1093e1051a39Sopenharmony_ci # a[6]a[3] 1094e1051a39Sopenharmony_ci # a[7]a[3] 1095e1051a39Sopenharmony_ci # a[5]a[4] (v) 1096e1051a39Sopenharmony_ci # a[6]a[4] 1097e1051a39Sopenharmony_ci # a[7]a[4] 1098e1051a39Sopenharmony_ci # a[6]a[5] (vi) 1099e1051a39Sopenharmony_ci # a[7]a[5] 1100e1051a39Sopenharmony_ci # a[7]a[6] (vii) 1101e1051a39Sopenharmony_ci 1102e1051a39Sopenharmony_ci $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i) 1103e1051a39Sopenharmony_ci $UMULL $t1,$a2,$a0 1104e1051a39Sopenharmony_ci $UMULL $t2,$a3,$a0 1105e1051a39Sopenharmony_ci $UMULL $t3,$a4,$a0 1106e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0]) 
1107e1051a39Sopenharmony_ci $UMULL $t0,$a5,$a0 1108e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 1109e1051a39Sopenharmony_ci $UMULL $t1,$a6,$a0 1110e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 1111e1051a39Sopenharmony_ci $UMULL $t2,$a7,$a0 1112e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 1113e1051a39Sopenharmony_ci $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0]) 1114e1051a39Sopenharmony_ci adde $acc5,$acc5,$t0 1115e1051a39Sopenharmony_ci $UMULH $t0,$a2,$a0 1116e1051a39Sopenharmony_ci adde $acc6,$acc6,$t1 1117e1051a39Sopenharmony_ci $UMULH $t1,$a3,$a0 1118e1051a39Sopenharmony_ci adde $acc7,$acc7,$t2 1119e1051a39Sopenharmony_ci $UMULH $t2,$a4,$a0 1120e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) # t[0] 1121e1051a39Sopenharmony_ci addze $acc0,$zero # t[8] 1122e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) # t[1] 1123e1051a39Sopenharmony_ci addc $acc2,$acc2,$t3 # t[2]+hi(a[1]*a[0]) 1124e1051a39Sopenharmony_ci $UMULH $t3,$a5,$a0 1125e1051a39Sopenharmony_ci adde $acc3,$acc3,$t0 1126e1051a39Sopenharmony_ci $UMULH $t0,$a6,$a0 1127e1051a39Sopenharmony_ci adde $acc4,$acc4,$t1 1128e1051a39Sopenharmony_ci $UMULH $t1,$a7,$a0 1129e1051a39Sopenharmony_ci adde $acc5,$acc5,$t2 1130e1051a39Sopenharmony_ci $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii) 1131e1051a39Sopenharmony_ci adde $acc6,$acc6,$t3 1132e1051a39Sopenharmony_ci $UMULL $t3,$a3,$a1 1133e1051a39Sopenharmony_ci adde $acc7,$acc7,$t0 1134e1051a39Sopenharmony_ci $UMULL $t0,$a4,$a1 1135e1051a39Sopenharmony_ci adde $acc0,$acc0,$t1 1136e1051a39Sopenharmony_ci 1137e1051a39Sopenharmony_ci $UMULL $t1,$a5,$a1 1138e1051a39Sopenharmony_ci addc $acc3,$acc3,$t2 1139e1051a39Sopenharmony_ci $UMULL $t2,$a6,$a1 1140e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 1141e1051a39Sopenharmony_ci $UMULL $t3,$a7,$a1 1142e1051a39Sopenharmony_ci adde $acc5,$acc5,$t0 1143e1051a39Sopenharmony_ci $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1]) 1144e1051a39Sopenharmony_ci adde $acc6,$acc6,$t1 1145e1051a39Sopenharmony_ci $UMULH $t1,$a3,$a1 1146e1051a39Sopenharmony_ci adde 
$acc7,$acc7,$t2 1147e1051a39Sopenharmony_ci $UMULH $t2,$a4,$a1 1148e1051a39Sopenharmony_ci adde $acc0,$acc0,$t3 1149e1051a39Sopenharmony_ci $UMULH $t3,$a5,$a1 1150e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) # t[2] 1151e1051a39Sopenharmony_ci addze $acc1,$zero # t[9] 1152e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) # t[3] 1153e1051a39Sopenharmony_ci addc $acc4,$acc4,$t0 1154e1051a39Sopenharmony_ci $UMULH $t0,$a6,$a1 1155e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1156e1051a39Sopenharmony_ci $UMULH $t1,$a7,$a1 1157e1051a39Sopenharmony_ci adde $acc6,$acc6,$t2 1158e1051a39Sopenharmony_ci $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii) 1159e1051a39Sopenharmony_ci adde $acc7,$acc7,$t3 1160e1051a39Sopenharmony_ci $UMULL $t3,$a4,$a2 1161e1051a39Sopenharmony_ci adde $acc0,$acc0,$t0 1162e1051a39Sopenharmony_ci $UMULL $t0,$a5,$a2 1163e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1164e1051a39Sopenharmony_ci 1165e1051a39Sopenharmony_ci $UMULL $t1,$a6,$a2 1166e1051a39Sopenharmony_ci addc $acc5,$acc5,$t2 1167e1051a39Sopenharmony_ci $UMULL $t2,$a7,$a2 1168e1051a39Sopenharmony_ci adde $acc6,$acc6,$t3 1169e1051a39Sopenharmony_ci $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2]) 1170e1051a39Sopenharmony_ci adde $acc7,$acc7,$t0 1171e1051a39Sopenharmony_ci $UMULH $t0,$a4,$a2 1172e1051a39Sopenharmony_ci adde $acc0,$acc0,$t1 1173e1051a39Sopenharmony_ci $UMULH $t1,$a5,$a2 1174e1051a39Sopenharmony_ci adde $acc1,$acc1,$t2 1175e1051a39Sopenharmony_ci $UMULH $t2,$a6,$a2 1176e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) # t[4] 1177e1051a39Sopenharmony_ci addze $acc2,$zero # t[10] 1178e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) # t[5] 1179e1051a39Sopenharmony_ci addc $acc6,$acc6,$t3 1180e1051a39Sopenharmony_ci $UMULH $t3,$a7,$a2 1181e1051a39Sopenharmony_ci adde $acc7,$acc7,$t0 1182e1051a39Sopenharmony_ci $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv) 1183e1051a39Sopenharmony_ci adde $acc0,$acc0,$t1 1184e1051a39Sopenharmony_ci $UMULL $t1,$a5,$a3 1185e1051a39Sopenharmony_ci adde $acc1,$acc1,$t2 
1186e1051a39Sopenharmony_ci $UMULL $t2,$a6,$a3 1187e1051a39Sopenharmony_ci adde $acc2,$acc2,$t3 1188e1051a39Sopenharmony_ci 1189e1051a39Sopenharmony_ci $UMULL $t3,$a7,$a3 1190e1051a39Sopenharmony_ci addc $acc7,$acc7,$t0 1191e1051a39Sopenharmony_ci $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3]) 1192e1051a39Sopenharmony_ci adde $acc0,$acc0,$t1 1193e1051a39Sopenharmony_ci $UMULH $t1,$a5,$a3 1194e1051a39Sopenharmony_ci adde $acc1,$acc1,$t2 1195e1051a39Sopenharmony_ci $UMULH $t2,$a6,$a3 1196e1051a39Sopenharmony_ci adde $acc2,$acc2,$t3 1197e1051a39Sopenharmony_ci $UMULH $t3,$a7,$a3 1198e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) # t[6] 1199e1051a39Sopenharmony_ci addze $acc3,$zero # t[11] 1200e1051a39Sopenharmony_ci $STU $acc7,$SIZE_T*8($tp) # t[7] 1201e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 1202e1051a39Sopenharmony_ci $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v) 1203e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1204e1051a39Sopenharmony_ci $UMULL $t1,$a6,$a4 1205e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 1206e1051a39Sopenharmony_ci $UMULL $t2,$a7,$a4 1207e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 1208e1051a39Sopenharmony_ci 1209e1051a39Sopenharmony_ci $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4]) 1210e1051a39Sopenharmony_ci addc $acc1,$acc1,$t0 1211e1051a39Sopenharmony_ci $UMULH $t0,$a6,$a4 1212e1051a39Sopenharmony_ci adde $acc2,$acc2,$t1 1213e1051a39Sopenharmony_ci $UMULH $t1,$a7,$a4 1214e1051a39Sopenharmony_ci adde $acc3,$acc3,$t2 1215e1051a39Sopenharmony_ci $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi) 1216e1051a39Sopenharmony_ci addze $acc4,$zero # t[12] 1217e1051a39Sopenharmony_ci addc $acc2,$acc2,$t3 1218e1051a39Sopenharmony_ci $UMULL $t3,$a7,$a5 1219e1051a39Sopenharmony_ci adde $acc3,$acc3,$t0 1220e1051a39Sopenharmony_ci $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5]) 1221e1051a39Sopenharmony_ci adde $acc4,$acc4,$t1 1222e1051a39Sopenharmony_ci 1223e1051a39Sopenharmony_ci $UMULH $t1,$a7,$a5 1224e1051a39Sopenharmony_ci addc $acc3,$acc3,$t2 1225e1051a39Sopenharmony_ci $UMULL $t2,$a7,$a6 
# lo(a[7]*a[6]) (vii) 1226e1051a39Sopenharmony_ci adde $acc4,$acc4,$t3 1227e1051a39Sopenharmony_ci $UMULH $t3,$a7,$a6 # hi(a[7]*a[6]) 1228e1051a39Sopenharmony_ci addze $acc5,$zero # t[13] 1229e1051a39Sopenharmony_ci addc $acc4,$acc4,$t0 1230e1051a39Sopenharmony_ci $UCMP $ap_end,$ap # done yet? 1231e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1232e1051a39Sopenharmony_ci 1233e1051a39Sopenharmony_ci addc $acc5,$acc5,$t2 1234e1051a39Sopenharmony_ci sub $t0,$ap_end,$num # rewinded ap 1235e1051a39Sopenharmony_ci addze $acc6,$zero # t[14] 1236e1051a39Sopenharmony_ci add $acc6,$acc6,$t3 1237e1051a39Sopenharmony_ci 1238e1051a39Sopenharmony_ci beq .Lsqr8x_outer_break 1239e1051a39Sopenharmony_ci 1240e1051a39Sopenharmony_ci mr $n0,$a0 1241e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($tp) 1242e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($tp) 1243e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($tp) 1244e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($tp) 1245e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($tp) 1246e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($tp) 1247e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($tp) 1248e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*8($tp) 1249e1051a39Sopenharmony_ci addc $acc0,$acc0,$a0 1250e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) 1251e1051a39Sopenharmony_ci adde $acc1,$acc1,$a1 1252e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 1253e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1254e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 1255e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1256e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($ap) 1257e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1258e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($ap) 1259e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1260e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($ap) 1261e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1262e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($ap) 1263e1051a39Sopenharmony_ci subi $rp,$ap,$SIZE_T*7 1264e1051a39Sopenharmony_ci addze $acc7,$a7 1265e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($ap) 
1266e1051a39Sopenharmony_ci #addze $carry,$zero # moved below 1267e1051a39Sopenharmony_ci li $cnt,0 1268e1051a39Sopenharmony_ci b .Lsqr8x_mul 1269e1051a39Sopenharmony_ci 1270e1051a39Sopenharmony_ci # a[8]a[0] 1271e1051a39Sopenharmony_ci # a[9]a[0] 1272e1051a39Sopenharmony_ci # a[a]a[0] 1273e1051a39Sopenharmony_ci # a[b]a[0] 1274e1051a39Sopenharmony_ci # a[c]a[0] 1275e1051a39Sopenharmony_ci # a[d]a[0] 1276e1051a39Sopenharmony_ci # a[e]a[0] 1277e1051a39Sopenharmony_ci # a[f]a[0] 1278e1051a39Sopenharmony_ci # a[8]a[1] 1279e1051a39Sopenharmony_ci # a[f]a[1]........................ 1280e1051a39Sopenharmony_ci # a[8]a[2] 1281e1051a39Sopenharmony_ci # a[f]a[2]........................ 1282e1051a39Sopenharmony_ci # a[8]a[3] 1283e1051a39Sopenharmony_ci # a[f]a[3]........................ 1284e1051a39Sopenharmony_ci # a[8]a[4] 1285e1051a39Sopenharmony_ci # a[f]a[4]........................ 1286e1051a39Sopenharmony_ci # a[8]a[5] 1287e1051a39Sopenharmony_ci # a[f]a[5]........................ 1288e1051a39Sopenharmony_ci # a[8]a[6] 1289e1051a39Sopenharmony_ci # a[f]a[6]........................ 1290e1051a39Sopenharmony_ci # a[8]a[7] 1291e1051a39Sopenharmony_ci # a[f]a[7]........................ 1292e1051a39Sopenharmony_ci.align 5 1293e1051a39Sopenharmony_ci.Lsqr8x_mul: 1294e1051a39Sopenharmony_ci $UMULL $t0,$a0,$n0 1295e1051a39Sopenharmony_ci addze $carry,$zero # carry bit, modulo-scheduled 1296e1051a39Sopenharmony_ci $UMULL $t1,$a1,$n0 1297e1051a39Sopenharmony_ci addi $cnt,$cnt,$SIZE_T 1298e1051a39Sopenharmony_ci $UMULL $t2,$a2,$n0 1299e1051a39Sopenharmony_ci andi. 
$cnt,$cnt,$SIZE_T*8-1 1300e1051a39Sopenharmony_ci $UMULL $t3,$a3,$n0 1301e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 1302e1051a39Sopenharmony_ci $UMULL $t0,$a4,$n0 1303e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1304e1051a39Sopenharmony_ci $UMULL $t1,$a5,$n0 1305e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 1306e1051a39Sopenharmony_ci $UMULL $t2,$a6,$n0 1307e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 1308e1051a39Sopenharmony_ci $UMULL $t3,$a7,$n0 1309e1051a39Sopenharmony_ci adde $acc4,$acc4,$t0 1310e1051a39Sopenharmony_ci $UMULH $t0,$a0,$n0 1311e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1312e1051a39Sopenharmony_ci $UMULH $t1,$a1,$n0 1313e1051a39Sopenharmony_ci adde $acc6,$acc6,$t2 1314e1051a39Sopenharmony_ci $UMULH $t2,$a2,$n0 1315e1051a39Sopenharmony_ci adde $acc7,$acc7,$t3 1316e1051a39Sopenharmony_ci $UMULH $t3,$a3,$n0 1317e1051a39Sopenharmony_ci addze $carry,$carry 1318e1051a39Sopenharmony_ci $STU $acc0,$SIZE_T($tp) 1319e1051a39Sopenharmony_ci addc $acc0,$acc1,$t0 1320e1051a39Sopenharmony_ci $UMULH $t0,$a4,$n0 1321e1051a39Sopenharmony_ci adde $acc1,$acc2,$t1 1322e1051a39Sopenharmony_ci $UMULH $t1,$a5,$n0 1323e1051a39Sopenharmony_ci adde $acc2,$acc3,$t2 1324e1051a39Sopenharmony_ci $UMULH $t2,$a6,$n0 1325e1051a39Sopenharmony_ci adde $acc3,$acc4,$t3 1326e1051a39Sopenharmony_ci $UMULH $t3,$a7,$n0 1327e1051a39Sopenharmony_ci $LDX $n0,$rp,$cnt 1328e1051a39Sopenharmony_ci adde $acc4,$acc5,$t0 1329e1051a39Sopenharmony_ci adde $acc5,$acc6,$t1 1330e1051a39Sopenharmony_ci adde $acc6,$acc7,$t2 1331e1051a39Sopenharmony_ci adde $acc7,$carry,$t3 1332e1051a39Sopenharmony_ci #addze $carry,$zero # moved above 1333e1051a39Sopenharmony_ci bne .Lsqr8x_mul 1334e1051a39Sopenharmony_ci # note that carry flag is guaranteed 1335e1051a39Sopenharmony_ci # to be zero at this point 1336e1051a39Sopenharmony_ci $UCMP $ap,$ap_end # done yet? 
1337e1051a39Sopenharmony_ci beq .Lsqr8x_break 1338e1051a39Sopenharmony_ci 1339e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($tp) 1340e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($tp) 1341e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($tp) 1342e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($tp) 1343e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($tp) 1344e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($tp) 1345e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($tp) 1346e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*8($tp) 1347e1051a39Sopenharmony_ci addc $acc0,$acc0,$a0 1348e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap) 1349e1051a39Sopenharmony_ci adde $acc1,$acc1,$a1 1350e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap) 1351e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1352e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap) 1353e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1354e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($ap) 1355e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1356e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($ap) 1357e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1358e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($ap) 1359e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1360e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($ap) 1361e1051a39Sopenharmony_ci adde $acc7,$acc7,$a7 1362e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($ap) 1363e1051a39Sopenharmony_ci #addze $carry,$zero # moved above 1364e1051a39Sopenharmony_ci b .Lsqr8x_mul 1365e1051a39Sopenharmony_ci 1366e1051a39Sopenharmony_ci.align 5 1367e1051a39Sopenharmony_ci.Lsqr8x_break: 1368e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*8($rp) 1369e1051a39Sopenharmony_ci addi $ap,$rp,$SIZE_T*15 1370e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*9($rp) 1371e1051a39Sopenharmony_ci sub. $t0,$ap_end,$ap # is it last iteration? 
1372e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*10($rp) 1373e1051a39Sopenharmony_ci sub $t1,$tp,$t0 1374e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*11($rp) 1375e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*12($rp) 1376e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*13($rp) 1377e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*14($rp) 1378e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*15($rp) 1379e1051a39Sopenharmony_ci beq .Lsqr8x_outer_loop 1380e1051a39Sopenharmony_ci 1381e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) 1382e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*1($t1) 1383e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) 1384e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*2($t1) 1385e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) 1386e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*3($t1) 1387e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) 1388e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*4($t1) 1389e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) 1390e1051a39Sopenharmony_ci $LD $acc4,$SIZE_T*5($t1) 1391e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) 1392e1051a39Sopenharmony_ci $LD $acc5,$SIZE_T*6($t1) 1393e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) 1394e1051a39Sopenharmony_ci $LD $acc6,$SIZE_T*7($t1) 1395e1051a39Sopenharmony_ci $ST $acc7,$SIZE_T*8($tp) 1396e1051a39Sopenharmony_ci $LD $acc7,$SIZE_T*8($t1) 1397e1051a39Sopenharmony_ci mr $tp,$t1 1398e1051a39Sopenharmony_ci b .Lsqr8x_outer_loop 1399e1051a39Sopenharmony_ci 1400e1051a39Sopenharmony_ci.align 5 1401e1051a39Sopenharmony_ci.Lsqr8x_outer_break: 1402e1051a39Sopenharmony_ci #################################################################### 1403e1051a39Sopenharmony_ci # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1404e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1] 1405e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*2($t0) 1406e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*3($t0) 1407e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*4($t0) 1408e1051a39Sopenharmony_ci addi $ap,$t0,$SIZE_T*4 1409e1051a39Sopenharmony_ci # "tp[x]" 
comments are for num==8 case 1410e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting 1411e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*14($sp) 1412e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*15($sp) 1413e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*16($sp) 1414e1051a39Sopenharmony_ci 1415e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) # tp[8]= 1416e1051a39Sopenharmony_ci srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 1417e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) 1418e1051a39Sopenharmony_ci subi $cnt,$cnt,1 1419e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) 1420e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) 1421e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) 1422e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) 1423e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) 1424e1051a39Sopenharmony_ci #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting 1425e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1426e1051a39Sopenharmony_ci $UMULL $acc0,$a1,$a1 1427e1051a39Sopenharmony_ci $UMULH $a1,$a1,$a1 1428e1051a39Sopenharmony_ci add $acc1,$t1,$t1 # <<1 1429e1051a39Sopenharmony_ci $SHRI $t1,$t1,$BITS-1 1430e1051a39Sopenharmony_ci $UMULL $a2,$a3,$a3 1431e1051a39Sopenharmony_ci $UMULH $a3,$a3,$a3 1432e1051a39Sopenharmony_ci addc $acc1,$acc1,$a1 1433e1051a39Sopenharmony_ci add $acc2,$t2,$t2 1434e1051a39Sopenharmony_ci $SHRI $t2,$t2,$BITS-1 1435e1051a39Sopenharmony_ci add $acc3,$t3,$t3 1436e1051a39Sopenharmony_ci $SHRI $t3,$t3,$BITS-1 1437e1051a39Sopenharmony_ci or $acc2,$acc2,$t1 1438e1051a39Sopenharmony_ci 1439e1051a39Sopenharmony_ci mtctr $cnt 1440e1051a39Sopenharmony_ci.Lsqr4x_shift_n_add: 1441e1051a39Sopenharmony_ci $UMULL $a4,$a5,$a5 1442e1051a39Sopenharmony_ci $UMULH $a5,$a5,$a5 1443e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*6($tp) # =tp[5] 1444e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*1($ap) 1445e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1446e1051a39Sopenharmony_ci add $acc4,$t0,$t0 1447e1051a39Sopenharmony_ci $SHRI $t0,$t0,$BITS-1 
1448e1051a39Sopenharmony_ci or $acc3,$acc3,$t2 1449e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*7($tp) # =tp[6] 1450e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1451e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*2($ap) 1452e1051a39Sopenharmony_ci add $acc5,$t1,$t1 1453e1051a39Sopenharmony_ci $SHRI $t1,$t1,$BITS-1 1454e1051a39Sopenharmony_ci or $acc4,$acc4,$t3 1455e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*8($tp) # =tp[7] 1456e1051a39Sopenharmony_ci $UMULL $a6,$a7,$a7 1457e1051a39Sopenharmony_ci $UMULH $a7,$a7,$a7 1458e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1459e1051a39Sopenharmony_ci add $acc6,$t2,$t2 1460e1051a39Sopenharmony_ci $SHRI $t2,$t2,$BITS-1 1461e1051a39Sopenharmony_ci or $acc5,$acc5,$t0 1462e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*9($tp) # =tp[8] 1463e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1464e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*3($ap) 1465e1051a39Sopenharmony_ci add $acc7,$t3,$t3 1466e1051a39Sopenharmony_ci $SHRI $t3,$t3,$BITS-1 1467e1051a39Sopenharmony_ci or $acc6,$acc6,$t1 1468e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*10($tp) # =tp[9] 1469e1051a39Sopenharmony_ci $UMULL $a0,$a1,$a1 1470e1051a39Sopenharmony_ci $UMULH $a1,$a1,$a1 1471e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1472e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) # tp[0]= 1473e1051a39Sopenharmony_ci add $acc0,$t0,$t0 1474e1051a39Sopenharmony_ci $SHRI $t0,$t0,$BITS-1 1475e1051a39Sopenharmony_ci or $acc7,$acc7,$t2 1476e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*11($tp) # =tp[10] 1477e1051a39Sopenharmony_ci adde $acc7,$acc7,$a7 1478e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*4($ap) 1479e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) # tp[1]= 1480e1051a39Sopenharmony_ci add $acc1,$t1,$t1 1481e1051a39Sopenharmony_ci $SHRI $t1,$t1,$BITS-1 1482e1051a39Sopenharmony_ci or $acc0,$acc0,$t3 1483e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*12($tp) # =tp[11] 1484e1051a39Sopenharmony_ci $UMULL $a2,$a3,$a3 1485e1051a39Sopenharmony_ci $UMULH $a3,$a3,$a3 1486e1051a39Sopenharmony_ci adde $acc0,$acc0,$a0 
1487e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) # tp[2]= 1488e1051a39Sopenharmony_ci add $acc2,$t2,$t2 1489e1051a39Sopenharmony_ci $SHRI $t2,$t2,$BITS-1 1490e1051a39Sopenharmony_ci or $acc1,$acc1,$t0 1491e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*13($tp) # =tp[12] 1492e1051a39Sopenharmony_ci adde $acc1,$acc1,$a1 1493e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) # tp[3]= 1494e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) # tp[4]= 1495e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) # tp[5]= 1496e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) # tp[6]= 1497e1051a39Sopenharmony_ci $STU $acc7,$SIZE_T*8($tp) # tp[7]= 1498e1051a39Sopenharmony_ci add $acc3,$t3,$t3 1499e1051a39Sopenharmony_ci $SHRI $t3,$t3,$BITS-1 1500e1051a39Sopenharmony_ci or $acc2,$acc2,$t1 1501e1051a39Sopenharmony_ci bdnz .Lsqr4x_shift_n_add 1502e1051a39Sopenharmony_ci___ 1503e1051a39Sopenharmony_cimy ($np,$np_end)=($ap,$ap_end); 1504e1051a39Sopenharmony_ci$code.=<<___; 1505e1051a39Sopenharmony_ci $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0 1506e1051a39Sopenharmony_ci $POP $n0,$SIZE_T*8($sp) 1507e1051a39Sopenharmony_ci 1508e1051a39Sopenharmony_ci $UMULL $a4,$a5,$a5 1509e1051a39Sopenharmony_ci $UMULH $a5,$a5,$a5 1510e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($tp) # tp[8]= 1511e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*12($sp) # =tp[0] 1512e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*6($tp) # =tp[13] 1513e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1514e1051a39Sopenharmony_ci add $acc4,$t0,$t0 1515e1051a39Sopenharmony_ci $SHRI $t0,$t0,$BITS-1 1516e1051a39Sopenharmony_ci or $acc3,$acc3,$t2 1517e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*7($tp) # =tp[14] 1518e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1519e1051a39Sopenharmony_ci add $acc5,$t1,$t1 1520e1051a39Sopenharmony_ci $SHRI $t1,$t1,$BITS-1 1521e1051a39Sopenharmony_ci or $acc4,$acc4,$t3 1522e1051a39Sopenharmony_ci $UMULL $a6,$a7,$a7 1523e1051a39Sopenharmony_ci $UMULH $a7,$a7,$a7 1524e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 
1525e1051a39Sopenharmony_ci add $acc6,$t2,$t2 1526e1051a39Sopenharmony_ci $SHRI $t2,$t2,$BITS-1 1527e1051a39Sopenharmony_ci or $acc5,$acc5,$t0 1528e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($tp) # tp[9]= 1529e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*13($sp) # =tp[1] 1530e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1531e1051a39Sopenharmony_ci or $acc6,$acc6,$t1 1532e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($np) 1533e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($np) 1534e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1535e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($np) 1536e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($np) 1537e1051a39Sopenharmony_ci adde $acc7,$a7,$t2 1538e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($np) 1539e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($np) 1540e1051a39Sopenharmony_ci 1541e1051a39Sopenharmony_ci ################################################################ 1542e1051a39Sopenharmony_ci # Reduce by 8 limbs per iteration 1543e1051a39Sopenharmony_ci $UMULL $na0,$n0,$acc0 # t[0]*n0 1544e1051a39Sopenharmony_ci li $cnt,8 1545e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($np) 1546e1051a39Sopenharmony_ci add $np_end,$np,$num 1547e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($np) 1548e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) # tp[10]= 1549e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*14($sp) 1550e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) # tp[11]= 1551e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*15($sp) 1552e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) # tp[12]= 1553e1051a39Sopenharmony_ci $LD $acc4,$SIZE_T*16($sp) 1554e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) # tp[13]= 1555e1051a39Sopenharmony_ci $LD $acc5,$SIZE_T*17($sp) 1556e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) # tp[14]= 1557e1051a39Sopenharmony_ci $LD $acc6,$SIZE_T*18($sp) 1558e1051a39Sopenharmony_ci $ST $acc7,$SIZE_T*8($tp) # tp[15]= 1559e1051a39Sopenharmony_ci $LD $acc7,$SIZE_T*19($sp) 1560e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1561e1051a39Sopenharmony_ci mtctr 
$cnt 1562e1051a39Sopenharmony_ci b .Lsqr8x_reduction 1563e1051a39Sopenharmony_ci 1564e1051a39Sopenharmony_ci.align 5 1565e1051a39Sopenharmony_ci.Lsqr8x_reduction: 1566e1051a39Sopenharmony_ci # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0) 1567e1051a39Sopenharmony_ci $UMULL $t1,$a1,$na0 1568e1051a39Sopenharmony_ci $UMULL $t2,$a2,$na0 1569e1051a39Sopenharmony_ci $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 1570e1051a39Sopenharmony_ci $UMULL $t3,$a3,$na0 1571e1051a39Sopenharmony_ci # (*) addc $acc0,$acc0,$t0 1572e1051a39Sopenharmony_ci addic $acc0,$acc0,-1 # (*) 1573e1051a39Sopenharmony_ci $UMULL $t0,$a4,$na0 1574e1051a39Sopenharmony_ci adde $acc0,$acc1,$t1 1575e1051a39Sopenharmony_ci $UMULL $t1,$a5,$na0 1576e1051a39Sopenharmony_ci adde $acc1,$acc2,$t2 1577e1051a39Sopenharmony_ci $UMULL $t2,$a6,$na0 1578e1051a39Sopenharmony_ci adde $acc2,$acc3,$t3 1579e1051a39Sopenharmony_ci $UMULL $t3,$a7,$na0 1580e1051a39Sopenharmony_ci adde $acc3,$acc4,$t0 1581e1051a39Sopenharmony_ci $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0) 1582e1051a39Sopenharmony_ci adde $acc4,$acc5,$t1 1583e1051a39Sopenharmony_ci $UMULH $t1,$a1,$na0 1584e1051a39Sopenharmony_ci adde $acc5,$acc6,$t2 1585e1051a39Sopenharmony_ci $UMULH $t2,$a2,$na0 1586e1051a39Sopenharmony_ci adde $acc6,$acc7,$t3 1587e1051a39Sopenharmony_ci $UMULH $t3,$a3,$na0 1588e1051a39Sopenharmony_ci addze $acc7,$zero 1589e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 1590e1051a39Sopenharmony_ci $UMULH $t0,$a4,$na0 1591e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1592e1051a39Sopenharmony_ci $UMULH $t1,$a5,$na0 1593e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 1594e1051a39Sopenharmony_ci $UMULH $t2,$a6,$na0 1595e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 1596e1051a39Sopenharmony_ci $UMULH $t3,$a7,$na0 1597e1051a39Sopenharmony_ci $UMULL $na0,$n0,$acc0 # next t[0]*n0 1598e1051a39Sopenharmony_ci adde $acc4,$acc4,$t0 1599e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1600e1051a39Sopenharmony_ci adde $acc6,$acc6,$t2 
1601e1051a39Sopenharmony_ci adde $acc7,$acc7,$t3 1602e1051a39Sopenharmony_ci bdnz .Lsqr8x_reduction 1603e1051a39Sopenharmony_ci 1604e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*1($tp) 1605e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*2($tp) 1606e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*3($tp) 1607e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*4($tp) 1608e1051a39Sopenharmony_ci subi $rp,$tp,$SIZE_T*7 1609e1051a39Sopenharmony_ci $UCMP $np_end,$np # done yet? 1610e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 1611e1051a39Sopenharmony_ci $LD $t0,$SIZE_T*5($tp) 1612e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1613e1051a39Sopenharmony_ci $LD $t1,$SIZE_T*6($tp) 1614e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 1615e1051a39Sopenharmony_ci $LD $t2,$SIZE_T*7($tp) 1616e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 1617e1051a39Sopenharmony_ci $LD $t3,$SIZE_T*8($tp) 1618e1051a39Sopenharmony_ci adde $acc4,$acc4,$t0 1619e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1620e1051a39Sopenharmony_ci adde $acc6,$acc6,$t2 1621e1051a39Sopenharmony_ci adde $acc7,$acc7,$t3 1622e1051a39Sopenharmony_ci #addze $carry,$zero # moved below 1623e1051a39Sopenharmony_ci beq .Lsqr8x8_post_condition 1624e1051a39Sopenharmony_ci 1625e1051a39Sopenharmony_ci $LD $n0,$SIZE_T*0($rp) 1626e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($np) 1627e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($np) 1628e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($np) 1629e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($np) 1630e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($np) 1631e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($np) 1632e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($np) 1633e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($np) 1634e1051a39Sopenharmony_ci li $cnt,0 1635e1051a39Sopenharmony_ci 1636e1051a39Sopenharmony_ci.align 5 1637e1051a39Sopenharmony_ci.Lsqr8x_tail: 1638e1051a39Sopenharmony_ci $UMULL $t0,$a0,$n0 1639e1051a39Sopenharmony_ci addze $carry,$zero # carry bit, modulo-scheduled 1640e1051a39Sopenharmony_ci $UMULL $t1,$a1,$n0 1641e1051a39Sopenharmony_ci addi 
$cnt,$cnt,$SIZE_T 1642e1051a39Sopenharmony_ci $UMULL $t2,$a2,$n0 1643e1051a39Sopenharmony_ci andi. $cnt,$cnt,$SIZE_T*8-1 1644e1051a39Sopenharmony_ci $UMULL $t3,$a3,$n0 1645e1051a39Sopenharmony_ci addc $acc0,$acc0,$t0 1646e1051a39Sopenharmony_ci $UMULL $t0,$a4,$n0 1647e1051a39Sopenharmony_ci adde $acc1,$acc1,$t1 1648e1051a39Sopenharmony_ci $UMULL $t1,$a5,$n0 1649e1051a39Sopenharmony_ci adde $acc2,$acc2,$t2 1650e1051a39Sopenharmony_ci $UMULL $t2,$a6,$n0 1651e1051a39Sopenharmony_ci adde $acc3,$acc3,$t3 1652e1051a39Sopenharmony_ci $UMULL $t3,$a7,$n0 1653e1051a39Sopenharmony_ci adde $acc4,$acc4,$t0 1654e1051a39Sopenharmony_ci $UMULH $t0,$a0,$n0 1655e1051a39Sopenharmony_ci adde $acc5,$acc5,$t1 1656e1051a39Sopenharmony_ci $UMULH $t1,$a1,$n0 1657e1051a39Sopenharmony_ci adde $acc6,$acc6,$t2 1658e1051a39Sopenharmony_ci $UMULH $t2,$a2,$n0 1659e1051a39Sopenharmony_ci adde $acc7,$acc7,$t3 1660e1051a39Sopenharmony_ci $UMULH $t3,$a3,$n0 1661e1051a39Sopenharmony_ci addze $carry,$carry 1662e1051a39Sopenharmony_ci $STU $acc0,$SIZE_T($tp) 1663e1051a39Sopenharmony_ci addc $acc0,$acc1,$t0 1664e1051a39Sopenharmony_ci $UMULH $t0,$a4,$n0 1665e1051a39Sopenharmony_ci adde $acc1,$acc2,$t1 1666e1051a39Sopenharmony_ci $UMULH $t1,$a5,$n0 1667e1051a39Sopenharmony_ci adde $acc2,$acc3,$t2 1668e1051a39Sopenharmony_ci $UMULH $t2,$a6,$n0 1669e1051a39Sopenharmony_ci adde $acc3,$acc4,$t3 1670e1051a39Sopenharmony_ci $UMULH $t3,$a7,$n0 1671e1051a39Sopenharmony_ci $LDX $n0,$rp,$cnt 1672e1051a39Sopenharmony_ci adde $acc4,$acc5,$t0 1673e1051a39Sopenharmony_ci adde $acc5,$acc6,$t1 1674e1051a39Sopenharmony_ci adde $acc6,$acc7,$t2 1675e1051a39Sopenharmony_ci adde $acc7,$carry,$t3 1676e1051a39Sopenharmony_ci #addze $carry,$zero # moved above 1677e1051a39Sopenharmony_ci bne .Lsqr8x_tail 1678e1051a39Sopenharmony_ci # note that carry flag is guaranteed 1679e1051a39Sopenharmony_ci # to be zero at this point 1680e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($tp) 1681e1051a39Sopenharmony_ci $POP $carry,$SIZE_T*10($sp) # 
pull top-most carry in case we break 1682e1051a39Sopenharmony_ci $UCMP $np_end,$np # done yet? 1683e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($tp) 1684e1051a39Sopenharmony_ci sub $t2,$np_end,$num # rewinded np 1685e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($tp) 1686e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($tp) 1687e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($tp) 1688e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($tp) 1689e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($tp) 1690e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*8($tp) 1691e1051a39Sopenharmony_ci beq .Lsqr8x_tail_break 1692e1051a39Sopenharmony_ci 1693e1051a39Sopenharmony_ci addc $acc0,$acc0,$a0 1694e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($np) 1695e1051a39Sopenharmony_ci adde $acc1,$acc1,$a1 1696e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($np) 1697e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1698e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($np) 1699e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1700e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($np) 1701e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1702e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($np) 1703e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1704e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($np) 1705e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1706e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($np) 1707e1051a39Sopenharmony_ci adde $acc7,$acc7,$a7 1708e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($np) 1709e1051a39Sopenharmony_ci #addze $carry,$zero # moved above 1710e1051a39Sopenharmony_ci b .Lsqr8x_tail 1711e1051a39Sopenharmony_ci 1712e1051a39Sopenharmony_ci.align 5 1713e1051a39Sopenharmony_ci.Lsqr8x_tail_break: 1714e1051a39Sopenharmony_ci $POP $n0,$SIZE_T*8($sp) # pull n0 1715e1051a39Sopenharmony_ci $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1] 1716e1051a39Sopenharmony_ci addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window 1717e1051a39Sopenharmony_ci 1718e1051a39Sopenharmony_ci addic $carry,$carry,-1 # "move" top-most carry to carry bit 1719e1051a39Sopenharmony_ci adde $t0,$acc0,$a0 
1720e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*8($rp) 1721e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1] 1722e1051a39Sopenharmony_ci adde $t1,$acc1,$a1 1723e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*9($rp) 1724e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($t2) 1725e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1726e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($t2) 1727e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1728e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($t2) 1729e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1730e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($t2) 1731e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1732e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($t2) 1733e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1734e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($t2) 1735e1051a39Sopenharmony_ci adde $acc7,$acc7,$a7 1736e1051a39Sopenharmony_ci $LD $a7,$SIZE_T*8($t2) 1737e1051a39Sopenharmony_ci addi $np,$t2,$SIZE_T*8 1738e1051a39Sopenharmony_ci addze $t2,$zero # top-most carry 1739e1051a39Sopenharmony_ci $UMULL $na0,$n0,$acc0 1740e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($tp) 1741e1051a39Sopenharmony_ci $UCMP $cnt,$t3 # did we hit the bottom? 
1742e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($tp) 1743e1051a39Sopenharmony_ci li $cnt,8 1744e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($tp) 1745e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*10($rp) 1746e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($tp) 1747e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*11($rp) 1748e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($tp) 1749e1051a39Sopenharmony_ci $LD $acc4,$SIZE_T*12($rp) 1750e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($tp) 1751e1051a39Sopenharmony_ci $LD $acc5,$SIZE_T*13($rp) 1752e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($tp) 1753e1051a39Sopenharmony_ci $LD $acc6,$SIZE_T*14($rp) 1754e1051a39Sopenharmony_ci $ST $acc7,$SIZE_T*8($tp) 1755e1051a39Sopenharmony_ci $LD $acc7,$SIZE_T*15($rp) 1756e1051a39Sopenharmony_ci $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry 1757e1051a39Sopenharmony_ci addi $tp,$rp,$SIZE_T*7 # slide the window 1758e1051a39Sopenharmony_ci mtctr $cnt 1759e1051a39Sopenharmony_ci bne .Lsqr8x_reduction 1760e1051a39Sopenharmony_ci 1761e1051a39Sopenharmony_ci ################################################################ 1762e1051a39Sopenharmony_ci # Final step. We see if result is larger than modulus, and 1763e1051a39Sopenharmony_ci # if it is, subtract the modulus. But comparison implies 1764e1051a39Sopenharmony_ci # subtraction. So we subtract modulus, see if it borrowed, 1765e1051a39Sopenharmony_ci # and conditionally copy original value. 
1766e1051a39Sopenharmony_ci $POP $rp,$SIZE_T*6($sp) # pull &rp[-1] 1767e1051a39Sopenharmony_ci srwi $cnt,$num,`log($SIZE_T)/log(2)+3` 1768e1051a39Sopenharmony_ci mr $n0,$tp # put tp aside 1769e1051a39Sopenharmony_ci addi $tp,$tp,$SIZE_T*8 1770e1051a39Sopenharmony_ci subi $cnt,$cnt,1 1771e1051a39Sopenharmony_ci subfc $t0,$a0,$acc0 1772e1051a39Sopenharmony_ci subfe $t1,$a1,$acc1 1773e1051a39Sopenharmony_ci mr $carry,$t2 1774e1051a39Sopenharmony_ci mr $ap_end,$rp # $rp copy 1775e1051a39Sopenharmony_ci 1776e1051a39Sopenharmony_ci mtctr $cnt 1777e1051a39Sopenharmony_ci b .Lsqr8x_sub 1778e1051a39Sopenharmony_ci 1779e1051a39Sopenharmony_ci.align 5 1780e1051a39Sopenharmony_ci.Lsqr8x_sub: 1781e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($np) 1782e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*1($tp) 1783e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($np) 1784e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*2($tp) 1785e1051a39Sopenharmony_ci subfe $t2,$a2,$acc2 1786e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($np) 1787e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*3($tp) 1788e1051a39Sopenharmony_ci subfe $t3,$a3,$acc3 1789e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($np) 1790e1051a39Sopenharmony_ci $LD $acc3,$SIZE_T*4($tp) 1791e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($rp) 1792e1051a39Sopenharmony_ci subfe $t0,$a4,$acc4 1793e1051a39Sopenharmony_ci $LD $a4,$SIZE_T*5($np) 1794e1051a39Sopenharmony_ci $LD $acc4,$SIZE_T*5($tp) 1795e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($rp) 1796e1051a39Sopenharmony_ci subfe $t1,$a5,$acc5 1797e1051a39Sopenharmony_ci $LD $a5,$SIZE_T*6($np) 1798e1051a39Sopenharmony_ci $LD $acc5,$SIZE_T*6($tp) 1799e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($rp) 1800e1051a39Sopenharmony_ci subfe $t2,$a6,$acc6 1801e1051a39Sopenharmony_ci $LD $a6,$SIZE_T*7($np) 1802e1051a39Sopenharmony_ci $LD $acc6,$SIZE_T*7($tp) 1803e1051a39Sopenharmony_ci $ST $t3,$SIZE_T*4($rp) 1804e1051a39Sopenharmony_ci subfe $t3,$a7,$acc7 1805e1051a39Sopenharmony_ci $LDU $a7,$SIZE_T*8($np) 1806e1051a39Sopenharmony_ci $LDU 
$acc7,$SIZE_T*8($tp) 1807e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*5($rp) 1808e1051a39Sopenharmony_ci subfe $t0,$a0,$acc0 1809e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*6($rp) 1810e1051a39Sopenharmony_ci subfe $t1,$a1,$acc1 1811e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*7($rp) 1812e1051a39Sopenharmony_ci $STU $t3,$SIZE_T*8($rp) 1813e1051a39Sopenharmony_ci bdnz .Lsqr8x_sub 1814e1051a39Sopenharmony_ci 1815e1051a39Sopenharmony_ci srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 1816e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*1($ap_end) # original $rp 1817e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*1($n0) # original $tp 1818e1051a39Sopenharmony_ci subi $cnt,$cnt,1 1819e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*2($ap_end) 1820e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*2($n0) 1821e1051a39Sopenharmony_ci subfe $t2,$a2,$acc2 1822e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*3($ap_end) 1823e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*3($n0) 1824e1051a39Sopenharmony_ci subfe $t3,$a3,$acc3 1825e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*4($ap_end) 1826e1051a39Sopenharmony_ci $LDU $acc3,$SIZE_T*4($n0) 1827e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($rp) 1828e1051a39Sopenharmony_ci subfe $t0,$a4,$acc4 1829e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($rp) 1830e1051a39Sopenharmony_ci subfe $t1,$a5,$acc5 1831e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($rp) 1832e1051a39Sopenharmony_ci subfe $t2,$a6,$acc6 1833e1051a39Sopenharmony_ci $ST $t3,$SIZE_T*4($rp) 1834e1051a39Sopenharmony_ci subfe $t3,$a7,$acc7 1835e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*5($rp) 1836e1051a39Sopenharmony_ci subfe $carry,$zero,$carry # did it borrow? 
1837e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*6($rp) 1838e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*7($rp) 1839e1051a39Sopenharmony_ci $ST $t3,$SIZE_T*8($rp) 1840e1051a39Sopenharmony_ci 1841e1051a39Sopenharmony_ci addi $tp,$sp,$SIZE_T*11 1842e1051a39Sopenharmony_ci mtctr $cnt 1843e1051a39Sopenharmony_ci 1844e1051a39Sopenharmony_ci.Lsqr4x_cond_copy: 1845e1051a39Sopenharmony_ci andc $a0,$a0,$carry 1846e1051a39Sopenharmony_ci $ST $zero,-$SIZE_T*3($n0) # wipe stack clean 1847e1051a39Sopenharmony_ci and $acc0,$acc0,$carry 1848e1051a39Sopenharmony_ci $ST $zero,-$SIZE_T*2($n0) 1849e1051a39Sopenharmony_ci andc $a1,$a1,$carry 1850e1051a39Sopenharmony_ci $ST $zero,-$SIZE_T*1($n0) 1851e1051a39Sopenharmony_ci and $acc1,$acc1,$carry 1852e1051a39Sopenharmony_ci $ST $zero,-$SIZE_T*0($n0) 1853e1051a39Sopenharmony_ci andc $a2,$a2,$carry 1854e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*1($tp) 1855e1051a39Sopenharmony_ci and $acc2,$acc2,$carry 1856e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*2($tp) 1857e1051a39Sopenharmony_ci andc $a3,$a3,$carry 1858e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*3($tp) 1859e1051a39Sopenharmony_ci and $acc3,$acc3,$carry 1860e1051a39Sopenharmony_ci $STU $zero,$SIZE_T*4($tp) 1861e1051a39Sopenharmony_ci or $t0,$a0,$acc0 1862e1051a39Sopenharmony_ci $LD $a0,$SIZE_T*5($ap_end) 1863e1051a39Sopenharmony_ci $LD $acc0,$SIZE_T*1($n0) 1864e1051a39Sopenharmony_ci or $t1,$a1,$acc1 1865e1051a39Sopenharmony_ci $LD $a1,$SIZE_T*6($ap_end) 1866e1051a39Sopenharmony_ci $LD $acc1,$SIZE_T*2($n0) 1867e1051a39Sopenharmony_ci or $t2,$a2,$acc2 1868e1051a39Sopenharmony_ci $LD $a2,$SIZE_T*7($ap_end) 1869e1051a39Sopenharmony_ci $LD $acc2,$SIZE_T*3($n0) 1870e1051a39Sopenharmony_ci or $t3,$a3,$acc3 1871e1051a39Sopenharmony_ci $LD $a3,$SIZE_T*8($ap_end) 1872e1051a39Sopenharmony_ci $LDU $acc3,$SIZE_T*4($n0) 1873e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($ap_end) 1874e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($ap_end) 1875e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($ap_end) 1876e1051a39Sopenharmony_ci 
$STU $t3,$SIZE_T*4($ap_end) 1877e1051a39Sopenharmony_ci bdnz .Lsqr4x_cond_copy 1878e1051a39Sopenharmony_ci 1879e1051a39Sopenharmony_ci $POP $ap,0($sp) # pull saved sp 1880e1051a39Sopenharmony_ci andc $a0,$a0,$carry 1881e1051a39Sopenharmony_ci and $acc0,$acc0,$carry 1882e1051a39Sopenharmony_ci andc $a1,$a1,$carry 1883e1051a39Sopenharmony_ci and $acc1,$acc1,$carry 1884e1051a39Sopenharmony_ci andc $a2,$a2,$carry 1885e1051a39Sopenharmony_ci and $acc2,$acc2,$carry 1886e1051a39Sopenharmony_ci andc $a3,$a3,$carry 1887e1051a39Sopenharmony_ci and $acc3,$acc3,$carry 1888e1051a39Sopenharmony_ci or $t0,$a0,$acc0 1889e1051a39Sopenharmony_ci or $t1,$a1,$acc1 1890e1051a39Sopenharmony_ci or $t2,$a2,$acc2 1891e1051a39Sopenharmony_ci or $t3,$a3,$acc3 1892e1051a39Sopenharmony_ci $ST $t0,$SIZE_T*1($ap_end) 1893e1051a39Sopenharmony_ci $ST $t1,$SIZE_T*2($ap_end) 1894e1051a39Sopenharmony_ci $ST $t2,$SIZE_T*3($ap_end) 1895e1051a39Sopenharmony_ci $ST $t3,$SIZE_T*4($ap_end) 1896e1051a39Sopenharmony_ci 1897e1051a39Sopenharmony_ci b .Lsqr8x_done 1898e1051a39Sopenharmony_ci 1899e1051a39Sopenharmony_ci.align 5 1900e1051a39Sopenharmony_ci.Lsqr8x8_post_condition: 1901e1051a39Sopenharmony_ci $POP $rp,$SIZE_T*6($sp) # pull rp 1902e1051a39Sopenharmony_ci $POP $ap,0($sp) # pull saved sp 1903e1051a39Sopenharmony_ci addze $carry,$zero 1904e1051a39Sopenharmony_ci 1905e1051a39Sopenharmony_ci # $acc0-7,$carry hold result, $a0-7 hold modulus 1906e1051a39Sopenharmony_ci subfc $acc0,$a0,$acc0 1907e1051a39Sopenharmony_ci subfe $acc1,$a1,$acc1 1908e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*12($sp) # wipe stack clean 1909e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*13($sp) 1910e1051a39Sopenharmony_ci subfe $acc2,$a2,$acc2 1911e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*14($sp) 1912e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*15($sp) 1913e1051a39Sopenharmony_ci subfe $acc3,$a3,$acc3 1914e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*16($sp) 1915e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*17($sp) 1916e1051a39Sopenharmony_ci 
subfe $acc4,$a4,$acc4 1917e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*18($sp) 1918e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*19($sp) 1919e1051a39Sopenharmony_ci subfe $acc5,$a5,$acc5 1920e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*20($sp) 1921e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*21($sp) 1922e1051a39Sopenharmony_ci subfe $acc6,$a6,$acc6 1923e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*22($sp) 1924e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*23($sp) 1925e1051a39Sopenharmony_ci subfe $acc7,$a7,$acc7 1926e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*24($sp) 1927e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*25($sp) 1928e1051a39Sopenharmony_ci subfe $carry,$zero,$carry # did it borrow? 1929e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*26($sp) 1930e1051a39Sopenharmony_ci $ST $zero,$SIZE_T*27($sp) 1931e1051a39Sopenharmony_ci 1932e1051a39Sopenharmony_ci and $a0,$a0,$carry 1933e1051a39Sopenharmony_ci and $a1,$a1,$carry 1934e1051a39Sopenharmony_ci addc $acc0,$acc0,$a0 # add modulus back if borrowed 1935e1051a39Sopenharmony_ci and $a2,$a2,$carry 1936e1051a39Sopenharmony_ci adde $acc1,$acc1,$a1 1937e1051a39Sopenharmony_ci and $a3,$a3,$carry 1938e1051a39Sopenharmony_ci adde $acc2,$acc2,$a2 1939e1051a39Sopenharmony_ci and $a4,$a4,$carry 1940e1051a39Sopenharmony_ci adde $acc3,$acc3,$a3 1941e1051a39Sopenharmony_ci and $a5,$a5,$carry 1942e1051a39Sopenharmony_ci adde $acc4,$acc4,$a4 1943e1051a39Sopenharmony_ci and $a6,$a6,$carry 1944e1051a39Sopenharmony_ci adde $acc5,$acc5,$a5 1945e1051a39Sopenharmony_ci and $a7,$a7,$carry 1946e1051a39Sopenharmony_ci adde $acc6,$acc6,$a6 1947e1051a39Sopenharmony_ci adde $acc7,$acc7,$a7 1948e1051a39Sopenharmony_ci $ST $acc0,$SIZE_T*1($rp) 1949e1051a39Sopenharmony_ci $ST $acc1,$SIZE_T*2($rp) 1950e1051a39Sopenharmony_ci $ST $acc2,$SIZE_T*3($rp) 1951e1051a39Sopenharmony_ci $ST $acc3,$SIZE_T*4($rp) 1952e1051a39Sopenharmony_ci $ST $acc4,$SIZE_T*5($rp) 1953e1051a39Sopenharmony_ci $ST $acc5,$SIZE_T*6($rp) 1954e1051a39Sopenharmony_ci $ST $acc6,$SIZE_T*7($rp) 
1955e1051a39Sopenharmony_ci $ST $acc7,$SIZE_T*8($rp) 1956e1051a39Sopenharmony_ci 1957e1051a39Sopenharmony_ci.Lsqr8x_done: 1958e1051a39Sopenharmony_ci $PUSH $zero,$SIZE_T*8($sp) 1959e1051a39Sopenharmony_ci $PUSH $zero,$SIZE_T*10($sp) 1960e1051a39Sopenharmony_ci 1961e1051a39Sopenharmony_ci $POP r14,-$SIZE_T*18($ap) 1962e1051a39Sopenharmony_ci li r3,1 # signal "done" 1963e1051a39Sopenharmony_ci $POP r15,-$SIZE_T*17($ap) 1964e1051a39Sopenharmony_ci $POP r16,-$SIZE_T*16($ap) 1965e1051a39Sopenharmony_ci $POP r17,-$SIZE_T*15($ap) 1966e1051a39Sopenharmony_ci $POP r18,-$SIZE_T*14($ap) 1967e1051a39Sopenharmony_ci $POP r19,-$SIZE_T*13($ap) 1968e1051a39Sopenharmony_ci $POP r20,-$SIZE_T*12($ap) 1969e1051a39Sopenharmony_ci $POP r21,-$SIZE_T*11($ap) 1970e1051a39Sopenharmony_ci $POP r22,-$SIZE_T*10($ap) 1971e1051a39Sopenharmony_ci $POP r23,-$SIZE_T*9($ap) 1972e1051a39Sopenharmony_ci $POP r24,-$SIZE_T*8($ap) 1973e1051a39Sopenharmony_ci $POP r25,-$SIZE_T*7($ap) 1974e1051a39Sopenharmony_ci $POP r26,-$SIZE_T*6($ap) 1975e1051a39Sopenharmony_ci $POP r27,-$SIZE_T*5($ap) 1976e1051a39Sopenharmony_ci $POP r28,-$SIZE_T*4($ap) 1977e1051a39Sopenharmony_ci $POP r29,-$SIZE_T*3($ap) 1978e1051a39Sopenharmony_ci $POP r30,-$SIZE_T*2($ap) 1979e1051a39Sopenharmony_ci $POP r31,-$SIZE_T*1($ap) 1980e1051a39Sopenharmony_ci mr $sp,$ap 1981e1051a39Sopenharmony_ci blr 1982e1051a39Sopenharmony_ci .long 0 1983e1051a39Sopenharmony_ci .byte 0,12,4,0x20,0x80,18,6,0 1984e1051a39Sopenharmony_ci .long 0 1985e1051a39Sopenharmony_ci.size __bn_sqr8x_mont,.-__bn_sqr8x_mont 1986e1051a39Sopenharmony_ci___ 1987e1051a39Sopenharmony_ci} 1988e1051a39Sopenharmony_ci$code.=<<___; 1989e1051a39Sopenharmony_ci.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" 1990e1051a39Sopenharmony_ci___ 1991e1051a39Sopenharmony_ci 1992e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval $1/gem; 1993e1051a39Sopenharmony_ciprint $code; 1994e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; 1995