#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# a couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
# Directory part of the script path (including the trailing separator).
# It is used below to locate opensslconf.h.  Falls back to the empty string
# when $0 carries no directory component, instead of relying on a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Send the generated assembly to $output.  A failure here must be fatal:
# otherwise the script would "succeed" while writing nothing useful.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Select ABI-dependent parameters that are interpolated into the assembly
# templates further down:
#   $LEVEL        - argument of the .LEVEL directive
#   $SIZE_T       - size in bytes of a saved general register slot
#   $FRAME_MARKER - size in bytes of the frame marker area
#   $SAVED_RP     - offset (below %sp) where the return pointer is saved
#   $PUSH/$PUSHMA/$POP/$POPMB - store/load mnemonics for that register size
#   $BN_SZ        - size in bytes of one bignum word
if (defined($flavour) && $flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# Best effort: if opensslconf.h is reachable and defines
	# SIXTY_FOUR_BIT, switch to 8-byte bignum words and PA-RISC 2.0.
	# A missing header is deliberately not an error; the 1.1 defaults
	# above simply stay in effect.
	if (open my $conf, '<', "${dir}../../opensslconf.h") {
	    while (my $line = <$conf>) {
		if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {
		    $BN_SZ=8;
		    $LEVEL="2.0";
		    last;
		}
	    }
	    close $conf;
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved
regs + frame marker 112e1051a39Sopenharmony_ci # [+ argument transfer] 113e1051a39Sopenharmony_ci$LOCALS=$FRAME-$FRAME_MARKER; 114e1051a39Sopenharmony_ci$FRAME+=32; # local variables 115e1051a39Sopenharmony_ci 116e1051a39Sopenharmony_ci$tp="%r31"; 117e1051a39Sopenharmony_ci$ti1="%r29"; 118e1051a39Sopenharmony_ci$ti0="%r28"; 119e1051a39Sopenharmony_ci 120e1051a39Sopenharmony_ci$rp="%r26"; 121e1051a39Sopenharmony_ci$ap="%r25"; 122e1051a39Sopenharmony_ci$bp="%r24"; 123e1051a39Sopenharmony_ci$np="%r23"; 124e1051a39Sopenharmony_ci$n0="%r22"; # passed through stack in 32-bit 125e1051a39Sopenharmony_ci$num="%r21"; # passed through stack in 32-bit 126e1051a39Sopenharmony_ci$idx="%r20"; 127e1051a39Sopenharmony_ci$arrsz="%r19"; 128e1051a39Sopenharmony_ci 129e1051a39Sopenharmony_ci$nm1="%r7"; 130e1051a39Sopenharmony_ci$nm0="%r6"; 131e1051a39Sopenharmony_ci$ab1="%r5"; 132e1051a39Sopenharmony_ci$ab0="%r4"; 133e1051a39Sopenharmony_ci 134e1051a39Sopenharmony_ci$fp="%r3"; 135e1051a39Sopenharmony_ci$hi1="%r2"; 136e1051a39Sopenharmony_ci$hi0="%r1"; 137e1051a39Sopenharmony_ci 138e1051a39Sopenharmony_ci$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s 139e1051a39Sopenharmony_ci 140e1051a39Sopenharmony_ci$fm0="%fr4"; $fti=$fm0; 141e1051a39Sopenharmony_ci$fbi="%fr5L"; 142e1051a39Sopenharmony_ci$fn0="%fr5R"; 143e1051a39Sopenharmony_ci$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; 144e1051a39Sopenharmony_ci$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; 145e1051a39Sopenharmony_ci 146e1051a39Sopenharmony_ci$code=<<___; 147e1051a39Sopenharmony_ci .LEVEL $LEVEL 148e1051a39Sopenharmony_ci .SPACE \$TEXT\$ 149e1051a39Sopenharmony_ci .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY 150e1051a39Sopenharmony_ci 151e1051a39Sopenharmony_ci .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 152e1051a39Sopenharmony_ci .ALIGN 64 153e1051a39Sopenharmony_cibn_mul_mont 154e1051a39Sopenharmony_ci .PROC 155e1051a39Sopenharmony_ci .CALLINFO 
FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 156e1051a39Sopenharmony_ci .ENTRY 157e1051a39Sopenharmony_ci $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 158e1051a39Sopenharmony_ci $PUSHMA %r3,$FRAME(%sp) 159e1051a39Sopenharmony_ci $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 160e1051a39Sopenharmony_ci $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 161e1051a39Sopenharmony_ci $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 162e1051a39Sopenharmony_ci $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 163e1051a39Sopenharmony_ci $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) 164e1051a39Sopenharmony_ci $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 165e1051a39Sopenharmony_ci $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 166e1051a39Sopenharmony_ci ldo -$FRAME(%sp),$fp 167e1051a39Sopenharmony_ci___ 168e1051a39Sopenharmony_ci$code.=<<___ if ($SIZE_T==4); 169e1051a39Sopenharmony_ci ldw `-$FRAME_MARKER-4`($fp),$n0 170e1051a39Sopenharmony_ci ldw `-$FRAME_MARKER-8`($fp),$num 171e1051a39Sopenharmony_ci nop 172e1051a39Sopenharmony_ci nop ; alignment 173e1051a39Sopenharmony_ci___ 174e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==4); 175e1051a39Sopenharmony_ci comiclr,<= 6,$num,%r0 ; are vectors long enough? 176e1051a39Sopenharmony_ci b L\$abort 177e1051a39Sopenharmony_ci ldi 0,%r28 ; signal "unhandled" 178e1051a39Sopenharmony_ci add,ev %r0,$num,$num ; is $num even? 179e1051a39Sopenharmony_ci b L\$abort 180e1051a39Sopenharmony_ci nop 181e1051a39Sopenharmony_ci or $ap,$np,$ti1 182e1051a39Sopenharmony_ci extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 183e1051a39Sopenharmony_ci b L\$abort 184e1051a39Sopenharmony_ci nop 185e1051a39Sopenharmony_ci nop ; alignment 186e1051a39Sopenharmony_ci nop 187e1051a39Sopenharmony_ci 188e1051a39Sopenharmony_ci fldws 0($n0),${fn0} 189e1051a39Sopenharmony_ci fldws,ma 4($bp),${fbi} ; bp[0] 190e1051a39Sopenharmony_ci___ 191e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==8); 192e1051a39Sopenharmony_ci comib,> 3,$num,L\$abort ; are vectors long enough? 
193e1051a39Sopenharmony_ci ldi 0,%r28 ; signal "unhandled" 194e1051a39Sopenharmony_ci addl $num,$num,$num ; I operate on 32-bit values 195e1051a39Sopenharmony_ci 196e1051a39Sopenharmony_ci fldws 4($n0),${fn0} ; only low part of n0 197e1051a39Sopenharmony_ci fldws 4($bp),${fbi} ; bp[0] in flipped word order 198e1051a39Sopenharmony_ci___ 199e1051a39Sopenharmony_ci$code.=<<___; 200e1051a39Sopenharmony_ci fldds 0($ap),${fai} ; ap[0,1] 201e1051a39Sopenharmony_ci fldds 0($np),${fni} ; np[0,1] 202e1051a39Sopenharmony_ci 203e1051a39Sopenharmony_ci sh2addl $num,%r0,$arrsz 204e1051a39Sopenharmony_ci ldi 31,$hi0 205e1051a39Sopenharmony_ci ldo 36($arrsz),$hi1 ; space for tp[num+1] 206e1051a39Sopenharmony_ci andcm $hi1,$hi0,$hi1 ; align 207e1051a39Sopenharmony_ci addl $hi1,%sp,%sp 208e1051a39Sopenharmony_ci $PUSH $fp,-$SIZE_T(%sp) 209e1051a39Sopenharmony_ci 210e1051a39Sopenharmony_ci ldo `$LOCALS+16`($fp),$xfer 211e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 212e1051a39Sopenharmony_ci 213e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 214e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 215e1051a39Sopenharmony_ci xmpyu ${fn0},${fab0}R,${fm0} 216e1051a39Sopenharmony_ci 217e1051a39Sopenharmony_ci addl $arrsz,$ap,$ap ; point at the end 218e1051a39Sopenharmony_ci addl $arrsz,$np,$np 219e1051a39Sopenharmony_ci subi 0,$arrsz,$idx ; j=0 220e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 221e1051a39Sopenharmony_ci 222e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 223e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 224e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 225e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 226e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 227e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 228e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[2,3] 229e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[2,3] 230e1051a39Sopenharmony_ci___ 231e1051a39Sopenharmony_ci$code.=<<___ if 
($BN_SZ==4); 232e1051a39Sopenharmony_ci mtctl $hi0,%cr11 ; $hi0 still holds 31 233e1051a39Sopenharmony_ci extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 234e1051a39Sopenharmony_ci b L\$parisc11 235e1051a39Sopenharmony_ci nop 236e1051a39Sopenharmony_ci___ 237e1051a39Sopenharmony_ci$code.=<<___; # PA-RISC 2.0 code-path 238e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 239e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 240e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 241e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 242e1051a39Sopenharmony_ci 243e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$hi0 244e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 245e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 246e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 247e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 248e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 ; low part is discarded 249e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 250e1051a39Sopenharmony_ci 251e1051a39Sopenharmony_ciL\$1st 252e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 253e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 254e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 255e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 256e1051a39Sopenharmony_ci addl $hi0,$ab1,$ab1 257e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 258e1051a39Sopenharmony_ci ldd 8($xfer),$nm1 259e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 260e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 261e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 262e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[j,j+1] 263e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[j,j+1] 264e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 265e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 266e1051a39Sopenharmony_ci 267e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 268e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 269e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 
270e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 271e1051a39Sopenharmony_ci addl $hi0,$ab0,$ab0 272e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$hi0 273e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 274e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 275e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 276e1051a39Sopenharmony_ci addl $hi1,$nm0,$nm0 277e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 278e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 279e1051a39Sopenharmony_ci stw,ma $nm0,8($tp) ; tp[j-1] 280e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$1st ; j++++ 281e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 282e1051a39Sopenharmony_ci 283e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 284e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 285e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 286e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 287e1051a39Sopenharmony_ci addl $hi0,$ab1,$ab1 288e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 289e1051a39Sopenharmony_ci ldd 8($xfer),$nm1 290e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 291e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 292e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 293e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 294e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 295e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 296e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 297e1051a39Sopenharmony_ci 298e1051a39Sopenharmony_ci addl $hi0,$ab0,$ab0 299e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$hi0 300e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 301e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 302e1051a39Sopenharmony_ci addl $hi1,$nm0,$nm0 303e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 304e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 305e1051a39Sopenharmony_ci ldd,mb 8($xfer),$nm1 306e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 307e1051a39Sopenharmony_ci stw,ma $nm0,8($tp) ; tp[j-1] 308e1051a39Sopenharmony_ci 309e1051a39Sopenharmony_ci ldo -1($num),$num ; i-- 310e1051a39Sopenharmony_ci subi 0,$arrsz,$idx 
; j=0 311e1051a39Sopenharmony_ci___ 312e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==4); 313e1051a39Sopenharmony_ci fldws,ma 4($bp),${fbi} ; bp[1] 314e1051a39Sopenharmony_ci___ 315e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==8); 316e1051a39Sopenharmony_ci fldws 0($bp),${fbi} ; bp[1] in flipped word order 317e1051a39Sopenharmony_ci___ 318e1051a39Sopenharmony_ci$code.=<<___; 319e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[0,1] 320e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[0,1] 321e1051a39Sopenharmony_ci fldws 8($xfer),${fti}R ; tp[0] 322e1051a39Sopenharmony_ci addl $hi0,$ab1,$ab1 323e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 324e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 325e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 326e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 327e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 328e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 329e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 330e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 331e1051a39Sopenharmony_ci fstws,mb ${fab0}L,-8($xfer) ; save high part 332e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 333e1051a39Sopenharmony_ci 334e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fti}L ; zero high part 335e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fab0}L 336e1051a39Sopenharmony_ci addl $hi1,$hi0,$hi0 337e1051a39Sopenharmony_ci extrd,u $hi0,31,32,$hi1 338e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 339e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fab0},${fab0} 340e1051a39Sopenharmony_ci stw $hi0,0($tp) 341e1051a39Sopenharmony_ci stw $hi1,4($tp) 342e1051a39Sopenharmony_ci 343e1051a39Sopenharmony_ci fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 344e1051a39Sopenharmony_ci fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 345e1051a39Sopenharmony_ci xmpyu ${fn0},${fab0}R,${fm0} 346e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 347e1051a39Sopenharmony_ciL\$outer 
348e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 349e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 350e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) ; 33-bit value 351e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 352e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[2] 353e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[2] 354e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 355e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 ; 33-bit value 356e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 357e1051a39Sopenharmony_ci ldw 0($xfer),$hi0 ; high part 358e1051a39Sopenharmony_ci 359e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 360e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 361e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$ti0 ; carry bit 362e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 363e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 364e1051a39Sopenharmony_ci addl $ti0,$hi0,$hi0 ; account carry bit 365e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 366e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 ; low part is discarded 367e1051a39Sopenharmony_ci ldw 0($tp),$ti1 ; tp[1] 368e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 369e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 370e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 371e1051a39Sopenharmony_ci 372e1051a39Sopenharmony_ciL\$inner 373e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 374e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 375e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 376e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 377e1051a39Sopenharmony_ci addl $hi0,$ti1,$ti1 378e1051a39Sopenharmony_ci addl $ti1,$ab1,$ab1 379e1051a39Sopenharmony_ci ldd 8($xfer),$nm1 380e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 381e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 382e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 383e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[j,j+1] 384e1051a39Sopenharmony_ci flddx 
$idx($np),${fni} ; np[j,j+1] 385e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 386e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 387e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 388e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 389e1051a39Sopenharmony_ci 390e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 391e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 392e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 393e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 394e1051a39Sopenharmony_ci addl $hi0,$ti0,$ti0 395e1051a39Sopenharmony_ci addl $ti0,$ab0,$ab0 396e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 397e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 398e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$hi0 399e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 400e1051a39Sopenharmony_ci ldw 8($tp),$ti1 ; tp[j] 401e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 402e1051a39Sopenharmony_ci addl $hi1,$nm0,$nm0 403e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 404e1051a39Sopenharmony_ci stw,ma $nm0,8($tp) ; tp[j-1] 405e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$inner ; j++++ 406e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 407e1051a39Sopenharmony_ci 408e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 409e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 410e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 411e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 412e1051a39Sopenharmony_ci addl $hi0,$ti1,$ti1 413e1051a39Sopenharmony_ci addl $ti1,$ab1,$ab1 414e1051a39Sopenharmony_ci ldd 8($xfer),$nm1 415e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 416e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 417e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 418e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 419e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 420e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 421e1051a39Sopenharmony_ci ldd -16($xfer),$ab0 422e1051a39Sopenharmony_ci ldd -8($xfer),$nm0 423e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 
424e1051a39Sopenharmony_ci 425e1051a39Sopenharmony_ci addl $hi0,$ab0,$ab0 426e1051a39Sopenharmony_ci addl $ti0,$ab0,$ab0 427e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 428e1051a39Sopenharmony_ci extrd,u $ab0,31,32,$hi0 429e1051a39Sopenharmony_ci ldw 8($tp),$ti1 ; tp[j] 430e1051a39Sopenharmony_ci extrd,u $ab0,63,32,$ab0 431e1051a39Sopenharmony_ci addl $hi1,$nm0,$nm0 432e1051a39Sopenharmony_ci ldd 0($xfer),$ab1 433e1051a39Sopenharmony_ci addl $ab0,$nm0,$nm0 434e1051a39Sopenharmony_ci ldd,mb 8($xfer),$nm1 435e1051a39Sopenharmony_ci extrd,u $nm0,31,32,$hi1 436e1051a39Sopenharmony_ci stw,ma $nm0,8($tp) ; tp[j-1] 437e1051a39Sopenharmony_ci 438e1051a39Sopenharmony_ci addib,= -1,$num,L\$outerdone ; i-- 439e1051a39Sopenharmony_ci subi 0,$arrsz,$idx ; j=0 440e1051a39Sopenharmony_ci___ 441e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==4); 442e1051a39Sopenharmony_ci fldws,ma 4($bp),${fbi} ; bp[i] 443e1051a39Sopenharmony_ci___ 444e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==8); 445e1051a39Sopenharmony_ci ldi 12,$ti0 ; bp[i] in flipped word order 446e1051a39Sopenharmony_ci addl,ev %r0,$num,$num 447e1051a39Sopenharmony_ci ldi -4,$ti0 448e1051a39Sopenharmony_ci addl $ti0,$bp,$bp 449e1051a39Sopenharmony_ci fldws 0($bp),${fbi} 450e1051a39Sopenharmony_ci___ 451e1051a39Sopenharmony_ci$code.=<<___; 452e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[0] 453e1051a39Sopenharmony_ci addl $hi0,$ab1,$ab1 454e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[0] 455e1051a39Sopenharmony_ci fldws 8($xfer),${fti}R ; tp[0] 456e1051a39Sopenharmony_ci addl $ti1,$ab1,$ab1 457e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 458e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 459e1051a39Sopenharmony_ci 460e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 461e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 462e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 463e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 464e1051a39Sopenharmony_ci 
465e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 466e1051a39Sopenharmony_ci fstws,mb ${fab0}L,-8($xfer) ; save high part 467e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 468e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 469e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fti}L ; zero high part 470e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fab0}L 471e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 472e1051a39Sopenharmony_ci 473e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 474e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fab0},${fab0} 475e1051a39Sopenharmony_ci addl $hi1,$hi0,$hi0 476e1051a39Sopenharmony_ci fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 477e1051a39Sopenharmony_ci addl $ti0,$hi0,$hi0 478e1051a39Sopenharmony_ci extrd,u $hi0,31,32,$hi1 479e1051a39Sopenharmony_ci fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 480e1051a39Sopenharmony_ci stw $hi0,0($tp) 481e1051a39Sopenharmony_ci stw $hi1,4($tp) 482e1051a39Sopenharmony_ci xmpyu ${fn0},${fab0}R,${fm0} 483e1051a39Sopenharmony_ci 484e1051a39Sopenharmony_ci b L\$outer 485e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 486e1051a39Sopenharmony_ci 487e1051a39Sopenharmony_ciL\$outerdone 488e1051a39Sopenharmony_ci addl $hi0,$ab1,$ab1 489e1051a39Sopenharmony_ci addl $ti1,$ab1,$ab1 490e1051a39Sopenharmony_ci extrd,u $ab1,31,32,$hi0 491e1051a39Sopenharmony_ci extrd,u $ab1,63,32,$ab1 492e1051a39Sopenharmony_ci 493e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 494e1051a39Sopenharmony_ci 495e1051a39Sopenharmony_ci addl $hi1,$nm1,$nm1 496e1051a39Sopenharmony_ci addl $ab1,$nm1,$nm1 497e1051a39Sopenharmony_ci extrd,u $nm1,31,32,$hi1 498e1051a39Sopenharmony_ci stw $nm1,-4($tp) ; tp[j-1] 499e1051a39Sopenharmony_ci 500e1051a39Sopenharmony_ci addl $hi1,$hi0,$hi0 501e1051a39Sopenharmony_ci addl $ti0,$hi0,$hi0 502e1051a39Sopenharmony_ci extrd,u $hi0,31,32,$hi1 503e1051a39Sopenharmony_ci stw $hi0,0($tp) 504e1051a39Sopenharmony_ci stw $hi1,4($tp) 505e1051a39Sopenharmony_ci 
506e1051a39Sopenharmony_ci ldo `$LOCALS+32`($fp),$tp 507e1051a39Sopenharmony_ci sub %r0,%r0,%r0 ; clear borrow 508e1051a39Sopenharmony_ci___ 509e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==4); 510e1051a39Sopenharmony_ci ldws,ma 4($tp),$ti0 511e1051a39Sopenharmony_ci extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 512e1051a39Sopenharmony_ci b L\$sub_pa11 513e1051a39Sopenharmony_ci addl $tp,$arrsz,$tp 514e1051a39Sopenharmony_ciL\$sub 515e1051a39Sopenharmony_ci ldwx $idx($np),$hi0 516e1051a39Sopenharmony_ci subb $ti0,$hi0,$hi1 517e1051a39Sopenharmony_ci ldwx $idx($tp),$ti0 518e1051a39Sopenharmony_ci addib,<> 4,$idx,L\$sub 519e1051a39Sopenharmony_ci stws,ma $hi1,4($rp) 520e1051a39Sopenharmony_ci 521e1051a39Sopenharmony_ci subb $ti0,%r0,$hi1 522e1051a39Sopenharmony_ci___ 523e1051a39Sopenharmony_ci$code.=<<___ if ($BN_SZ==8); 524e1051a39Sopenharmony_ci ldd,ma 8($tp),$ti0 525e1051a39Sopenharmony_ciL\$sub 526e1051a39Sopenharmony_ci ldd $idx($np),$hi0 527e1051a39Sopenharmony_ci shrpd $ti0,$ti0,32,$ti0 ; flip word order 528e1051a39Sopenharmony_ci std $ti0,-8($tp) ; save flipped value 529e1051a39Sopenharmony_ci sub,db $ti0,$hi0,$hi1 530e1051a39Sopenharmony_ci ldd,ma 8($tp),$ti0 531e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$sub 532e1051a39Sopenharmony_ci std,ma $hi1,8($rp) 533e1051a39Sopenharmony_ci 534e1051a39Sopenharmony_ci extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 535e1051a39Sopenharmony_ci sub,db $ti0,%r0,$hi1 536e1051a39Sopenharmony_ci___ 537e1051a39Sopenharmony_ci$code.=<<___; 538e1051a39Sopenharmony_ci ldo `$LOCALS+32`($fp),$tp 539e1051a39Sopenharmony_ci sub $rp,$arrsz,$rp ; rewind rp 540e1051a39Sopenharmony_ci subi 0,$arrsz,$idx 541e1051a39Sopenharmony_ciL\$copy 542e1051a39Sopenharmony_ci ldd 0($tp),$ti0 543e1051a39Sopenharmony_ci ldd 0($rp),$hi0 544e1051a39Sopenharmony_ci std,ma %r0,8($tp) 545e1051a39Sopenharmony_ci comiclr,= 0,$hi1,%r0 546e1051a39Sopenharmony_ci copy $ti0,$hi0 547e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$copy 548e1051a39Sopenharmony_ci 
std,ma $hi0,8($rp) 549e1051a39Sopenharmony_ci___ 550e1051a39Sopenharmony_ci 551e1051a39Sopenharmony_ciif ($BN_SZ==4) { # PA-RISC 1.1 code-path 552e1051a39Sopenharmony_ci$ablo=$ab0; 553e1051a39Sopenharmony_ci$abhi=$ab1; 554e1051a39Sopenharmony_ci$nmlo0=$nm0; 555e1051a39Sopenharmony_ci$nmhi0=$nm1; 556e1051a39Sopenharmony_ci$nmlo1="%r9"; 557e1051a39Sopenharmony_ci$nmhi1="%r8"; 558e1051a39Sopenharmony_ci 559e1051a39Sopenharmony_ci$code.=<<___; 560e1051a39Sopenharmony_ci b L\$done 561e1051a39Sopenharmony_ci nop 562e1051a39Sopenharmony_ci 563e1051a39Sopenharmony_ci .ALIGN 8 564e1051a39Sopenharmony_ciL\$parisc11 565e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 566e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 567e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 568e1051a39Sopenharmony_ci ldw -16($xfer),$hi0 569e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 570e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 571e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 572e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 573e1051a39Sopenharmony_ci 574e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 575e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 ; discarded 576e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 577e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 578e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 579e1051a39Sopenharmony_ci nop 580e1051a39Sopenharmony_ci 581e1051a39Sopenharmony_ciL\$1st_pa11 582e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 583e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[j,j+1] 584e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 585e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[j,j+1] 586e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 587e1051a39Sopenharmony_ci ldw 12($xfer),$nmlo1 588e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 589e1051a39Sopenharmony_ci ldw 8($xfer),$nmhi1 590e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 591e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 
592e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 593e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 594e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 595e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 596e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 597e1051a39Sopenharmony_ci ldw -16($xfer),$abhi 598e1051a39Sopenharmony_ci 599e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 600e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 601e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 602e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 603e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 604e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 605e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 606e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 607e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 608e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 609e1051a39Sopenharmony_ci addc %r0,$nmhi0,$nmhi0 610e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 611e1051a39Sopenharmony_ci add $hi1,$nmlo0,$nmlo0 612e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 613e1051a39Sopenharmony_ci stws,ma $nmlo0,8($tp) ; tp[j-1] 614e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$1st_pa11 ; j++++ 615e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 616e1051a39Sopenharmony_ci 617e1051a39Sopenharmony_ci ldw 8($xfer),$nmhi1 618e1051a39Sopenharmony_ci ldw 12($xfer),$nmlo1 619e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 620e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 621e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 622e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 623e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 624e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 625e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 626e1051a39Sopenharmony_ci ldw -16($xfer),$abhi 627e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 628e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 629e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 630e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 631e1051a39Sopenharmony_ci 
addc %r0,$nmhi1,$hi1 632e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 633e1051a39Sopenharmony_ci 634e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 635e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 636e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 637e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 638e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 639e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 640e1051a39Sopenharmony_ci addc %r0,$nmhi0,$nmhi0 641e1051a39Sopenharmony_ci ldws,mb 8($xfer),$nmhi1 642e1051a39Sopenharmony_ci add $hi1,$nmlo0,$nmlo0 643e1051a39Sopenharmony_ci ldw 4($xfer),$nmlo1 644e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 645e1051a39Sopenharmony_ci stws,ma $nmlo0,8($tp) ; tp[j-1] 646e1051a39Sopenharmony_ci 647e1051a39Sopenharmony_ci ldo -1($num),$num ; i-- 648e1051a39Sopenharmony_ci subi 0,$arrsz,$idx ; j=0 649e1051a39Sopenharmony_ci 650e1051a39Sopenharmony_ci fldws,ma 4($bp),${fbi} ; bp[1] 651e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[0,1] 652e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[0,1] 653e1051a39Sopenharmony_ci fldws 8($xfer),${fti}R ; tp[0] 654e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 655e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 656e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 657e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 658e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 659e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 660e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 661e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 662e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 663e1051a39Sopenharmony_ci fstws,mb ${fab0}L,-8($xfer) ; save high part 664e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 665e1051a39Sopenharmony_ci 666e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fti}L ; zero high part 667e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fab0}L 668e1051a39Sopenharmony_ci add $hi1,$hi0,$hi0 669e1051a39Sopenharmony_ci addc %r0,%r0,$hi1 670e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fti},${fti} ; 
32-bit unsigned int -> double 671e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fab0},${fab0} 672e1051a39Sopenharmony_ci stw $hi0,0($tp) 673e1051a39Sopenharmony_ci stw $hi1,4($tp) 674e1051a39Sopenharmony_ci 675e1051a39Sopenharmony_ci fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 676e1051a39Sopenharmony_ci fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 677e1051a39Sopenharmony_ci xmpyu ${fn0},${fab0}R,${fm0} 678e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 679e1051a39Sopenharmony_ciL\$outer_pa11 680e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 681e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 682e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) ; 33-bit value 683e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 684e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[2,3] 685e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[2,3] 686e1051a39Sopenharmony_ci ldw -16($xfer),$abhi ; carry bit actually 687e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 688e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 689e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 690e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 691e1051a39Sopenharmony_ci ldw 0($xfer),$hi0 ; high part 692e1051a39Sopenharmony_ci 693e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 694e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 695e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 696e1051a39Sopenharmony_ci addl $abhi,$hi0,$hi0 ; account carry bit 697e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 698e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 ; discarded 699e1051a39Sopenharmony_ci ldw 0($tp),$ti1 ; tp[1] 700e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 701e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 702e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 703e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 704e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 705e1051a39Sopenharmony_ci 706e1051a39Sopenharmony_ciL\$inner_pa11 707e1051a39Sopenharmony_ci xmpyu 
${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 708e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[j,j+1] 709e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 710e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[j,j+1] 711e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 712e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 713e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 714e1051a39Sopenharmony_ci ldw 12($xfer),$nmlo1 715e1051a39Sopenharmony_ci add $ti1,$ablo,$ablo 716e1051a39Sopenharmony_ci ldw 8($xfer),$nmhi1 717e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 718e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 719e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 720e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 721e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 722e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 723e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 724e1051a39Sopenharmony_ci ldw -16($xfer),$abhi 725e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 726e1051a39Sopenharmony_ci 727e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 728e1051a39Sopenharmony_ci ldw 8($tp),$ti1 ; tp[j] 729e1051a39Sopenharmony_ci xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 730e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 731e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 732e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 733e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 734e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 735e1051a39Sopenharmony_ci add $ti0,$ablo,$ablo 736e1051a39Sopenharmony_ci fstds ${fab0},-16($xfer) 737e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 738e1051a39Sopenharmony_ci fstds ${fnm0},-8($xfer) 739e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 740e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 741e1051a39Sopenharmony_ci addc %r0,$nmhi0,$nmhi0 742e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 743e1051a39Sopenharmony_ci add $hi1,$nmlo0,$nmlo0 744e1051a39Sopenharmony_ci stws,ma $nmlo0,8($tp) ; tp[j-1] 745e1051a39Sopenharmony_ci addib,<> 8,$idx,L\$inner_pa11 ; j++++ 
746e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 747e1051a39Sopenharmony_ci 748e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 749e1051a39Sopenharmony_ci ldw 12($xfer),$nmlo1 750e1051a39Sopenharmony_ci xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 751e1051a39Sopenharmony_ci ldw 8($xfer),$nmhi1 752e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 753e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 754e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 755e1051a39Sopenharmony_ci fstds ${fab1},0($xfer) 756e1051a39Sopenharmony_ci add $ti1,$ablo,$ablo 757e1051a39Sopenharmony_ci fstds ${fnm1},8($xfer) 758e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 759e1051a39Sopenharmony_ci ldw -16($xfer),$abhi 760e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 761e1051a39Sopenharmony_ci ldw -12($xfer),$ablo 762e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 763e1051a39Sopenharmony_ci ldw -8($xfer),$nmhi0 764e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 765e1051a39Sopenharmony_ci ldw -4($xfer),$nmlo0 766e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 767e1051a39Sopenharmony_ci 768e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 769e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 770e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 771e1051a39Sopenharmony_ci add $ti0,$ablo,$ablo 772e1051a39Sopenharmony_ci ldw 8($tp),$ti1 ; tp[j] 773e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 774e1051a39Sopenharmony_ci ldw 0($xfer),$abhi 775e1051a39Sopenharmony_ci add $ablo,$nmlo0,$nmlo0 776e1051a39Sopenharmony_ci ldw 4($xfer),$ablo 777e1051a39Sopenharmony_ci addc %r0,$nmhi0,$nmhi0 778e1051a39Sopenharmony_ci ldws,mb 8($xfer),$nmhi1 779e1051a39Sopenharmony_ci add $hi1,$nmlo0,$nmlo0 780e1051a39Sopenharmony_ci ldw 4($xfer),$nmlo1 781e1051a39Sopenharmony_ci addc %r0,$nmhi0,$hi1 782e1051a39Sopenharmony_ci stws,ma $nmlo0,8($tp) ; tp[j-1] 783e1051a39Sopenharmony_ci 784e1051a39Sopenharmony_ci addib,= -1,$num,L\$outerdone_pa11; i-- 785e1051a39Sopenharmony_ci subi 0,$arrsz,$idx ; j=0 786e1051a39Sopenharmony_ci 
787e1051a39Sopenharmony_ci fldws,ma 4($bp),${fbi} ; bp[i] 788e1051a39Sopenharmony_ci flddx $idx($ap),${fai} ; ap[0] 789e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 790e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 791e1051a39Sopenharmony_ci flddx $idx($np),${fni} ; np[0] 792e1051a39Sopenharmony_ci fldws 8($xfer),${fti}R ; tp[0] 793e1051a39Sopenharmony_ci add $ti1,$ablo,$ablo 794e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 795e1051a39Sopenharmony_ci 796e1051a39Sopenharmony_ci ldo 8($idx),$idx ; j++++ 797e1051a39Sopenharmony_ci xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 798e1051a39Sopenharmony_ci xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 799e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 800e1051a39Sopenharmony_ci 801e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 802e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 803e1051a39Sopenharmony_ci fstws,mb ${fab0}L,-8($xfer) ; save high part 804e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 805e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 806e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fti}L ; zero high part 807e1051a39Sopenharmony_ci fcpy,sgl %fr0,${fab0}L 808e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 809e1051a39Sopenharmony_ci 810e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 811e1051a39Sopenharmony_ci fcnvxf,dbl,dbl ${fab0},${fab0} 812e1051a39Sopenharmony_ci add $hi1,$hi0,$hi0 813e1051a39Sopenharmony_ci addc %r0,%r0,$hi1 814e1051a39Sopenharmony_ci fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 815e1051a39Sopenharmony_ci add $ti0,$hi0,$hi0 816e1051a39Sopenharmony_ci addc %r0,$hi1,$hi1 817e1051a39Sopenharmony_ci fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 818e1051a39Sopenharmony_ci stw $hi0,0($tp) 819e1051a39Sopenharmony_ci stw $hi1,4($tp) 820e1051a39Sopenharmony_ci xmpyu ${fn0},${fab0}R,${fm0} 821e1051a39Sopenharmony_ci 822e1051a39Sopenharmony_ci b L\$outer_pa11 823e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 824e1051a39Sopenharmony_ci 
825e1051a39Sopenharmony_ciL\$outerdone_pa11 826e1051a39Sopenharmony_ci add $hi0,$ablo,$ablo 827e1051a39Sopenharmony_ci addc %r0,$abhi,$abhi 828e1051a39Sopenharmony_ci add $ti1,$ablo,$ablo 829e1051a39Sopenharmony_ci addc %r0,$abhi,$hi0 830e1051a39Sopenharmony_ci 831e1051a39Sopenharmony_ci ldw 4($tp),$ti0 ; tp[j] 832e1051a39Sopenharmony_ci 833e1051a39Sopenharmony_ci add $hi1,$nmlo1,$nmlo1 834e1051a39Sopenharmony_ci addc %r0,$nmhi1,$nmhi1 835e1051a39Sopenharmony_ci add $ablo,$nmlo1,$nmlo1 836e1051a39Sopenharmony_ci addc %r0,$nmhi1,$hi1 837e1051a39Sopenharmony_ci stw $nmlo1,-4($tp) ; tp[j-1] 838e1051a39Sopenharmony_ci 839e1051a39Sopenharmony_ci add $hi1,$hi0,$hi0 840e1051a39Sopenharmony_ci addc %r0,%r0,$hi1 841e1051a39Sopenharmony_ci add $ti0,$hi0,$hi0 842e1051a39Sopenharmony_ci addc %r0,$hi1,$hi1 843e1051a39Sopenharmony_ci stw $hi0,0($tp) 844e1051a39Sopenharmony_ci stw $hi1,4($tp) 845e1051a39Sopenharmony_ci 846e1051a39Sopenharmony_ci ldo `$LOCALS+32+4`($fp),$tp 847e1051a39Sopenharmony_ci sub %r0,%r0,%r0 ; clear borrow 848e1051a39Sopenharmony_ci ldw -4($tp),$ti0 849e1051a39Sopenharmony_ci addl $tp,$arrsz,$tp 850e1051a39Sopenharmony_ciL\$sub_pa11 851e1051a39Sopenharmony_ci ldwx $idx($np),$hi0 852e1051a39Sopenharmony_ci subb $ti0,$hi0,$hi1 853e1051a39Sopenharmony_ci ldwx $idx($tp),$ti0 854e1051a39Sopenharmony_ci addib,<> 4,$idx,L\$sub_pa11 855e1051a39Sopenharmony_ci stws,ma $hi1,4($rp) 856e1051a39Sopenharmony_ci 857e1051a39Sopenharmony_ci subb $ti0,%r0,$hi1 858e1051a39Sopenharmony_ci 859e1051a39Sopenharmony_ci ldo `$LOCALS+32`($fp),$tp 860e1051a39Sopenharmony_ci sub $rp,$arrsz,$rp ; rewind rp 861e1051a39Sopenharmony_ci subi 0,$arrsz,$idx 862e1051a39Sopenharmony_ciL\$copy_pa11 863e1051a39Sopenharmony_ci ldw 0($tp),$ti0 864e1051a39Sopenharmony_ci ldw 0($rp),$hi0 865e1051a39Sopenharmony_ci stws,ma %r0,4($tp) 866e1051a39Sopenharmony_ci comiclr,= 0,$hi1,%r0 867e1051a39Sopenharmony_ci copy $ti0,$hi0 868e1051a39Sopenharmony_ci addib,<> 4,$idx,L\$copy_pa11 
869e1051a39Sopenharmony_ci stws,ma $hi0,4($rp) 870e1051a39Sopenharmony_ci 871e1051a39Sopenharmony_ci nop ; alignment 872e1051a39Sopenharmony_ciL\$done 873e1051a39Sopenharmony_ci___ 874e1051a39Sopenharmony_ci} 875e1051a39Sopenharmony_ci 876e1051a39Sopenharmony_ci$code.=<<___; 877e1051a39Sopenharmony_ci ldi 1,%r28 ; signal "handled" 878e1051a39Sopenharmony_ci ldo $FRAME($fp),%sp ; destroy tp[num+1] 879e1051a39Sopenharmony_ci 880e1051a39Sopenharmony_ci $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 881e1051a39Sopenharmony_ci $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 882e1051a39Sopenharmony_ci $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 883e1051a39Sopenharmony_ci $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 884e1051a39Sopenharmony_ci $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 885e1051a39Sopenharmony_ci $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 886e1051a39Sopenharmony_ci $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 887e1051a39Sopenharmony_ci $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 888e1051a39Sopenharmony_ciL\$abort 889e1051a39Sopenharmony_ci bv (%r2) 890e1051a39Sopenharmony_ci .EXIT 891e1051a39Sopenharmony_ci $POPMB -$FRAME(%sp),%r3 892e1051a39Sopenharmony_ci .PROCEND 893e1051a39Sopenharmony_ci .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" 894e1051a39Sopenharmony_ci___ 895e1051a39Sopenharmony_ci 896e1051a39Sopenharmony_ci# Explicitly encode PA-RISC 2.0 instructions used in this module, so 897e1051a39Sopenharmony_ci# that it can be compiled with .LEVEL 1.0. It should be noted that I 898e1051a39Sopenharmony_ci# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 899e1051a39Sopenharmony_ci# directive... 

# Hand-assemblers for the PA-RISC 2.0 instructions emitted by this module.
#
# Every encoder takes two strings:
#   $mod   completer text that directly follows the mnemonic (e.g. ",ma"),
#          possibly empty;
#   $args  the operand field of the instruction.
# and returns one line of assembler text: a ".WORD 0x%08x" directive with
# the original instruction appended as a ";" comment when the operands match
# a form the encoder knows, or the instruction itself (tab-prefixed,
# otherwise unchanged) when they do not.
#
# Capture groups are copied into named lexicals immediately after the match
# so that later regex matches (on $mod) cannot disturb them.

my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {           # format 4
        my ($idx_reg, $base_reg, $dst_reg) = ($1, $2, $3);
        my $opcode = (0x03<<26) | ($base_reg<<21) | ($idx_reg<<16)
                   | (3<<6) | $dst_reg;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {       # format 5
        my ($offset, $base_reg, $dst_reg) = ($1, $2, $3);
        my $opcode = (0x03<<26) | ($base_reg<<21) | (1<<12)
                   | (3<<6) | $dst_reg;
        # encode offset: low four bits, then bit 4 of the displacement
        $opcode |= (($offset & 0xF)<<17) | (($offset & 0x10)<<12);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);      # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);     # ",mb" only
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t" . $orig;
};

my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {          # format 6
        my ($src_reg, $offset, $base_reg) = ($1, $2, $3);
        my $opcode = (0x03<<26) | ($base_reg<<21) | ($src_reg<<16)
                   | (1<<12) | (0xB<<6);
        # encode offset: low four bits, then bit 4 of the displacement
        $opcode |= (($offset & 0xF)<<1) | (($offset & 0x10)>>4);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);      # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);     # ",mb" only
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t" . $orig;
};

my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # Only the ",u" completer is used with extrd here; it is encoded
    # implicitly, i.e. $mod is not inspected for it.
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {       # format 15
        my ($src_reg, $pos, $width, $dst_reg) = ($1, $2, $3, $4);
        my $opcode = (0x36<<26) | ($src_reg<<21) | ($dst_reg<<16);
        my $len    = 32 - $width;
        $opcode |= (($pos & 0x20)<<6) | (($pos & 0x1f)<<5);         # encode pos
        $opcode |= (($len & 0x20)<<7) | ($len & 0x1f);              # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {        # format 12
        my ($src_reg, $width, $dst_reg) = ($1, $2, $3);
        my $opcode = (0x34<<26) | ($src_reg<<21) | ($dst_reg<<16)
                   | (2<<11) | (1<<9);
        my $len    = 32 - $width;
        $opcode |= (($len & 0x20)<<3) | ($len & 0x1f);              # encode len
        $opcode |= (1<<13) if ($mod =~ /,\**=/);    # ",=" or ",*=" condition
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t" . $orig;
};

my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {     # format 14
        my ($reg1, $reg2, $shift, $dst_reg) = ($1, $2, $3, $4);
        my $opcode = (0x34<<26) | ($reg2<<21) | ($reg1<<16)
                   | (1<<10) | $dst_reg;
        my $cpos   = 63 - $shift;
        $opcode |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);       # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t" . $orig;
};

my $sub = sub {
    my ($mod, $args) = @_;
    my $orig = "sub$mod\t$args";

    # Only the ",db" form is hand-encoded; every other "sub" is passed
    # through to the assembler as text.
    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
        my ($reg1, $reg2, $dst_reg) = ($1, $2, $3);
        my $opcode = (0x02<<26) | ($reg2<<21) | ($reg1<<16) | $dst_reg;
        $opcode |= (1<<10);     # e1
        $opcode |= (1<<8);      # e2
        $opcode |= (1<<5);      # d
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t" . $orig;
};

# Mnemonic -> encoder.  An explicit table is used instead of looking the
# encoder up with string eval: the lookup then works no matter where
# assemble() is called from, and no text taken from the assembly source is
# ever evaluated as Perl.
my %encoder_for = (
    'ldd'   => $ldd,
    'std'   => $std,
    'extrd' => $extrd,
    'shrpd' => $shrpd,
    'sub'   => $sub,
);

# assemble($mnemonic, $mod, $args)
#
# Returns the replacement text for one instruction: the result of the
# matching encoder if $mnemonic has one, otherwise the instruction
# re-joined as "\t$mnemonic$mod\t$args".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE'
         ? $encoder->($mod, $args)
         : "\t$mnemonic$mod\t$args";
}

# Ask the compiler driver which assembler it runs; $gnuas is set only when
# the output identifies GNU assembler.  Without CC in the environment there
# is nothing to ask, so the probe is skipped and $gnuas stays unset.
if (defined $ENV{CC} && $ENV{CC} ne ''
    && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
       =~ /GNU assembler/) {
    $gnuas = 1;
}

# Post-process the generated code one line at a time and print it.
foreach (split("\n", $code)) {
    # evaluate `...` expressions embedded in the generated text
    s/\`([^\`]*)\`/eval $1/ge;

    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e
                                                if ($BN_SZ==8);
    # assemble 2.0 instructions in 32-bit mode...
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e
                                                if ($BN_SZ==4);

    # directive spellings rewritten for GNU assembler in 64-bit builds
    s/(\.LEVEL\s+2\.0)W/$1w/                    if ($gnuas && $SIZE_T==8);
    s/\.SPACE\s+\$TEXT\$/.text/                 if ($gnuas && $SIZE_T==8);
    s/\.SUBSPA.*//                              if ($gnuas && $SIZE_T==8);
    # 64-bit builds use bve wherever the generated text says bv
    s/\bbv\b/bve/                               if ($SIZE_T==8);

    print $_, "\n";
}

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";