#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

# The last command-line argument, if present and true, names the output
# file; STDOUT is re-opened onto it. Without such an argument the
# generated assembly goes to the inherited STDOUT. The three-argument
# open keeps the file name from being parsed as an open mode, and a
# failure is reported immediately rather than only at the final close.
$output = pop;
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register names interpolated into the assembly templates below. The
# first group mirrors the C prototype spelled out in the trailing
# comments.
# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

# Emitted verbatim as symbolic names; presumably resolved by the C
# preprocessor through the #include in the template below -- confirm
# against crypto/sparc_arch.h.
$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

# Name of the global symbol defined by the generated code.
$fname="bn_mul_mont_int";

# First template: entry point, the general multiplication path
# (.L1st/.Louter/.Linner) and the shared tail (.Ltail/.Lsub/.Lcopy).
# Control transfers to .Lbn_sqr_mont, defined in the second template,
# when the "cmp $ap,$bp" comparison finds the two pointers equal.
$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4	! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num	! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0	! bp[0]
	nop

	add	%sp,$bias,%o7	! real top of stack
	ld	[$ap],$car0	! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj	! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1	! np[0]
	sub	%o7,$bias,%sp	! alloca
	ld	[$np+4],$npj	! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj	!prologue!

	mulx	$n0,$acc0,$mul1	! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j	! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp	! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i	! i++
	ld	[$bp+4],$mul0	! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0	! ap[0]
	ld	[$ap+4],$apj	! ap[1]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	ld	[$tp],$tmp1	! tp[0]
	ld	[$tp+4],$tpj	! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj	!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj	! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j	! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp	! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj	! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]	! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i	! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0	! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7	! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1	! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2	! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1	! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]	! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%o5";

# Second template: the dedicated squaring path. It is appended to the
# first template and ends by branching back to .Ltail defined there.
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj	!prologue!

	mulx	$n0,$acc0,$mul1	! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj	!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j	! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp	! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0	! ap[1]
	ld	[$ap+8],$apj	! ap[2]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj	! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj	! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j	! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp	! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0	! ap[2]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj	! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj	! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj	! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j	! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp	! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i	! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0	! ap[j]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num	! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Buffered write errors only surface at close, so the close is checked.
close STDOUT or die "error closing STDOUT: $!";