#! /usr/bin/env perl
# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
# AIX performance
#
# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
# The following is the performance of 32-bit compiler
# generated code:
#
# OpenSSL 0.9.6c 21 dec 2001
# built on: Tue Jun 11 11:06:51 EDT 2002
# options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                   sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
# Number of operations increases by almost 75%
#
# Here are performance numbers for 64-bit compiler
# generated code:
#
# OpenSSL 0.9.6g [engine] 9 Aug 2002
# built on: Fri Apr 18 16:59:20 EDT 2003
# options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                   sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
# Again, performance increases by about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
# OpenSSL 0.9.7c 30 Sep 2003
#
# Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
# Performance increase of ~60%
# Based on submission from Suresh N. Chari of IBM

# Driver section: parse the command line, pick the 32- or 64-bit mnemonic
# set, locate ppc-xlate.pl and redirect STDOUT into it.  Everything set
# here is a package global because the assembler template below
# interpolates these variables.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Without a flavour there is nothing to select on; fail with a usable
# message instead of matching against (and interpolating) undef.
defined $flavour
    or die "nonsense: no flavour given (expected one containing 32 or 64)";

if ($flavour =~ /32/) {
    $BITS=  32;
    $BNSZ=  $BITS/8;            # size of one BN_ULONG in bytes
    $ISA=   "\"ppc\"";

    $LD=    "lwz";              # load
    $LDU=   "lwzu";             # load and update
    $ST=    "stw";              # store
    $STU=   "stwu";             # store and update
    $UMULL= "mullw";            # unsigned multiply low
    $UMULH= "mulhwu";           # unsigned multiply high
    $UDIV=  "divwu";            # unsigned divide
    $UCMPI= "cmplwi";           # unsigned compare with immediate
    $UCMP=  "cmplw";            # unsigned compare
    $CNTLZ= "cntlzw";           # count leading zeros
    $SHL=   "slw";              # shift left
    $SHR=   "srw";              # unsigned shift right
    $SHRI=  "srwi";             # unsigned shift right by immediate
    $SHLI=  "slwi";             # shift left by immediate
    $CLRU=  "clrlwi";           # clear upper bits
    $INSR=  "insrwi";           # insert right
    $ROTL=  "rotlwi";           # rotate left by immediate
    $TR=    "tw";               # conditional trap
} elsif ($flavour =~ /64/) {
    $BITS=  64;
    $BNSZ=  $BITS/8;            # size of one BN_ULONG in bytes
    $ISA=   "\"ppc64\"";

    # same as above, but 64-bit mnemonics...
    $LD=    "ld";               # load
    $LDU=   "ldu";              # load and update
    $ST=    "std";              # store
    $STU=   "stdu";             # store and update
    $UMULL= "mulld";            # unsigned multiply low
    $UMULH= "mulhdu";           # unsigned multiply high
    $UDIV=  "divdu";            # unsigned divide
    $UCMPI= "cmpldi";           # unsigned compare with immediate
    $UCMP=  "cmpld";            # unsigned compare
    $CNTLZ= "cntlzd";           # count leading zeros
    $SHL=   "sld";              # shift left
    $SHR=   "srd";              # unsigned shift right
    $SHRI=  "srdi";             # unsigned shift right by immediate
    $SHLI=  "sldi";             # shift left by immediate
    $CLRU=  "clrldi";           # clear upper bits
    $INSR=  "insrdi";           # insert right
    $ROTL=  "rotldi";           # rotate left by immediate
    $TR=    "td";               # conditional trap
} else { die "nonsense $flavour"; }

# Directory part of $0 including the trailing separator, or "" when the
# script was invoked without one.  Taken from the match result rather than
# from a bare $1, which is not reset when a match fails.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = "" unless defined $dir;

# Look for the translator next to this script first, then in the shared
# perlasm directory.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.  The
# interpreter and translator paths are quoted like the output name so that
# paths containing spaces reach the shell as single words.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
# File: ppc32.s
1751cb0ef41Sopenharmony_ci# 1761cb0ef41Sopenharmony_ci# Created by: Suresh Chari 1771cb0ef41Sopenharmony_ci# IBM Thomas J. Watson Research Library 1781cb0ef41Sopenharmony_ci# Hawthorne, NY 1791cb0ef41Sopenharmony_ci# 1801cb0ef41Sopenharmony_ci# 1811cb0ef41Sopenharmony_ci# Description: Optimized assembly routines for OpenSSL crypto 1821cb0ef41Sopenharmony_ci# on the 32 bitPowerPC platform. 1831cb0ef41Sopenharmony_ci# 1841cb0ef41Sopenharmony_ci# 1851cb0ef41Sopenharmony_ci# Version History 1861cb0ef41Sopenharmony_ci# 1871cb0ef41Sopenharmony_ci# 2. Fixed bn_add,bn_sub and bn_div_words, added comments, 1881cb0ef41Sopenharmony_ci# cleaned up code. Also made a single version which can 1891cb0ef41Sopenharmony_ci# be used for both the AIX and Linux compilers. See NOTE 1901cb0ef41Sopenharmony_ci# below. 1911cb0ef41Sopenharmony_ci# 12/05/03 Suresh Chari 1921cb0ef41Sopenharmony_ci# (with lots of help from) Andy Polyakov 1931cb0ef41Sopenharmony_ci## 1941cb0ef41Sopenharmony_ci# 1. Initial version 10/20/02 Suresh Chari 1951cb0ef41Sopenharmony_ci# 1961cb0ef41Sopenharmony_ci# 1971cb0ef41Sopenharmony_ci# The following file works for the xlc,cc 1981cb0ef41Sopenharmony_ci# and gcc compilers. 1991cb0ef41Sopenharmony_ci# 2001cb0ef41Sopenharmony_ci# NOTE: To get the file to link correctly with the gcc compiler 2011cb0ef41Sopenharmony_ci# you have to change the names of the routines and remove 2021cb0ef41Sopenharmony_ci# the first .(dot) character. This should automatically 2031cb0ef41Sopenharmony_ci# be done in the build process. 
2041cb0ef41Sopenharmony_ci# 2051cb0ef41Sopenharmony_ci# Hand optimized assembly code for the following routines 2061cb0ef41Sopenharmony_ci# 2071cb0ef41Sopenharmony_ci# bn_sqr_comba4 2081cb0ef41Sopenharmony_ci# bn_sqr_comba8 2091cb0ef41Sopenharmony_ci# bn_mul_comba4 2101cb0ef41Sopenharmony_ci# bn_mul_comba8 2111cb0ef41Sopenharmony_ci# bn_sub_words 2121cb0ef41Sopenharmony_ci# bn_add_words 2131cb0ef41Sopenharmony_ci# bn_div_words 2141cb0ef41Sopenharmony_ci# bn_sqr_words 2151cb0ef41Sopenharmony_ci# bn_mul_words 2161cb0ef41Sopenharmony_ci# bn_mul_add_words 2171cb0ef41Sopenharmony_ci# 2181cb0ef41Sopenharmony_ci# NOTE: It is possible to optimize this code more for 2191cb0ef41Sopenharmony_ci# specific PowerPC or Power architectures. On the Northstar 2201cb0ef41Sopenharmony_ci# architecture the optimizations in this file do 2211cb0ef41Sopenharmony_ci# NOT provide much improvement. 2221cb0ef41Sopenharmony_ci# 2231cb0ef41Sopenharmony_ci# If you have comments or suggestions to improve code send 2241cb0ef41Sopenharmony_ci# me a note at schari\@us.ibm.com 2251cb0ef41Sopenharmony_ci# 2261cb0ef41Sopenharmony_ci#-------------------------------------------------------------------------- 2271cb0ef41Sopenharmony_ci# 2281cb0ef41Sopenharmony_ci# Defines to be used in the assembly code. 2291cb0ef41Sopenharmony_ci# 2301cb0ef41Sopenharmony_ci#.set r0,0 # we use it as storage for value of 0 2311cb0ef41Sopenharmony_ci#.set SP,1 # preserved 2321cb0ef41Sopenharmony_ci#.set RTOC,2 # preserved 2331cb0ef41Sopenharmony_ci#.set r3,3 # 1st argument/return value 2341cb0ef41Sopenharmony_ci#.set r4,4 # 2nd argument/volatile register 2351cb0ef41Sopenharmony_ci#.set r5,5 # 3rd argument/volatile register 2361cb0ef41Sopenharmony_ci#.set r6,6 # ... 
2371cb0ef41Sopenharmony_ci#.set r7,7 2381cb0ef41Sopenharmony_ci#.set r8,8 2391cb0ef41Sopenharmony_ci#.set r9,9 2401cb0ef41Sopenharmony_ci#.set r10,10 2411cb0ef41Sopenharmony_ci#.set r11,11 2421cb0ef41Sopenharmony_ci#.set r12,12 2431cb0ef41Sopenharmony_ci#.set r13,13 # not used, nor any other "below" it... 2441cb0ef41Sopenharmony_ci 2451cb0ef41Sopenharmony_ci# Declare function names to be global 2461cb0ef41Sopenharmony_ci# NOTE: For gcc these names MUST be changed to remove 2471cb0ef41Sopenharmony_ci# the first . i.e. for example change ".bn_sqr_comba4" 2481cb0ef41Sopenharmony_ci# to "bn_sqr_comba4". This should be automatically done 2491cb0ef41Sopenharmony_ci# in the build. 2501cb0ef41Sopenharmony_ci 2511cb0ef41Sopenharmony_ci .globl .bn_sqr_comba4 2521cb0ef41Sopenharmony_ci .globl .bn_sqr_comba8 2531cb0ef41Sopenharmony_ci .globl .bn_mul_comba4 2541cb0ef41Sopenharmony_ci .globl .bn_mul_comba8 2551cb0ef41Sopenharmony_ci .globl .bn_sub_words 2561cb0ef41Sopenharmony_ci .globl .bn_add_words 2571cb0ef41Sopenharmony_ci .globl .bn_div_words 2581cb0ef41Sopenharmony_ci .globl .bn_sqr_words 2591cb0ef41Sopenharmony_ci .globl .bn_mul_words 2601cb0ef41Sopenharmony_ci .globl .bn_mul_add_words 2611cb0ef41Sopenharmony_ci 2621cb0ef41Sopenharmony_ci# .text section 2631cb0ef41Sopenharmony_ci 2641cb0ef41Sopenharmony_ci .machine "any" 2651cb0ef41Sopenharmony_ci .text 2661cb0ef41Sopenharmony_ci 2671cb0ef41Sopenharmony_ci# 2681cb0ef41Sopenharmony_ci# NOTE: The following label name should be changed to 2691cb0ef41Sopenharmony_ci# "bn_sqr_comba4" i.e. remove the first dot 2701cb0ef41Sopenharmony_ci# for the gcc compiler. This should be automatically 2711cb0ef41Sopenharmony_ci# done in the build 2721cb0ef41Sopenharmony_ci# 2731cb0ef41Sopenharmony_ci 2741cb0ef41Sopenharmony_ci.align 4 2751cb0ef41Sopenharmony_ci.bn_sqr_comba4: 2761cb0ef41Sopenharmony_ci# 2771cb0ef41Sopenharmony_ci# Optimized version of bn_sqr_comba4. 
2781cb0ef41Sopenharmony_ci# 2791cb0ef41Sopenharmony_ci# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 2801cb0ef41Sopenharmony_ci# r3 contains r 2811cb0ef41Sopenharmony_ci# r4 contains a 2821cb0ef41Sopenharmony_ci# 2831cb0ef41Sopenharmony_ci# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 2841cb0ef41Sopenharmony_ci# 2851cb0ef41Sopenharmony_ci# r5,r6 are the two BN_ULONGs being multiplied. 2861cb0ef41Sopenharmony_ci# r7,r8 are the results of the 32x32 giving 64 bit multiply. 2871cb0ef41Sopenharmony_ci# r9,r10, r11 are the equivalents of c1,c2, c3. 2881cb0ef41Sopenharmony_ci# Here's the assembly 2891cb0ef41Sopenharmony_ci# 2901cb0ef41Sopenharmony_ci# 2911cb0ef41Sopenharmony_ci xor r0,r0,r0 # set r0 = 0. Used in the addze 2921cb0ef41Sopenharmony_ci # instructions below 2931cb0ef41Sopenharmony_ci 2941cb0ef41Sopenharmony_ci #sqr_add_c(a,0,c1,c2,c3) 2951cb0ef41Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 2961cb0ef41Sopenharmony_ci $UMULL r9,r5,r5 2971cb0ef41Sopenharmony_ci $UMULH r10,r5,r5 #in first iteration. No need 2981cb0ef41Sopenharmony_ci #to add since c1=c2=c3=0. 2991cb0ef41Sopenharmony_ci # Note c3(r11) is NOT set to 0 3001cb0ef41Sopenharmony_ci # but will be. 3011cb0ef41Sopenharmony_ci 3021cb0ef41Sopenharmony_ci $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 3031cb0ef41Sopenharmony_ci # sqr_add_c2(a,1,0,c2,c3,c1); 3041cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 3051cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3061cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3071cb0ef41Sopenharmony_ci 3081cb0ef41Sopenharmony_ci addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) 3091cb0ef41Sopenharmony_ci adde r8,r8,r8 3101cb0ef41Sopenharmony_ci addze r9,r0 # catch carry if any. 3111cb0ef41Sopenharmony_ci # r9= r0(=0) and carry 3121cb0ef41Sopenharmony_ci 3131cb0ef41Sopenharmony_ci addc r10,r7,r10 # now add to temp result. 
3141cb0ef41Sopenharmony_ci addze r11,r8 # r8 added to r11 which is 0 3151cb0ef41Sopenharmony_ci addze r9,r9 3161cb0ef41Sopenharmony_ci 3171cb0ef41Sopenharmony_ci $ST r10,`1*$BNSZ`(r3) #r[1]=c2; 3181cb0ef41Sopenharmony_ci #sqr_add_c(a,1,c3,c1,c2) 3191cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 3201cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 3211cb0ef41Sopenharmony_ci addc r11,r7,r11 3221cb0ef41Sopenharmony_ci adde r9,r8,r9 3231cb0ef41Sopenharmony_ci addze r10,r0 3241cb0ef41Sopenharmony_ci #sqr_add_c2(a,2,0,c3,c1,c2) 3251cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 3261cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3271cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3281cb0ef41Sopenharmony_ci 3291cb0ef41Sopenharmony_ci addc r7,r7,r7 3301cb0ef41Sopenharmony_ci adde r8,r8,r8 3311cb0ef41Sopenharmony_ci addze r10,r10 3321cb0ef41Sopenharmony_ci 3331cb0ef41Sopenharmony_ci addc r11,r7,r11 3341cb0ef41Sopenharmony_ci adde r9,r8,r9 3351cb0ef41Sopenharmony_ci addze r10,r10 3361cb0ef41Sopenharmony_ci $ST r11,`2*$BNSZ`(r3) #r[2]=c3 3371cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,0,c1,c2,c3); 3381cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 3391cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3401cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3411cb0ef41Sopenharmony_ci addc r7,r7,r7 3421cb0ef41Sopenharmony_ci adde r8,r8,r8 3431cb0ef41Sopenharmony_ci addze r11,r0 3441cb0ef41Sopenharmony_ci 3451cb0ef41Sopenharmony_ci addc r9,r7,r9 3461cb0ef41Sopenharmony_ci adde r10,r8,r10 3471cb0ef41Sopenharmony_ci addze r11,r11 3481cb0ef41Sopenharmony_ci #sqr_add_c2(a,2,1,c1,c2,c3); 3491cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 3501cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 3511cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3521cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3531cb0ef41Sopenharmony_ci 3541cb0ef41Sopenharmony_ci addc r7,r7,r7 3551cb0ef41Sopenharmony_ci adde r8,r8,r8 3561cb0ef41Sopenharmony_ci addze r11,r11 3571cb0ef41Sopenharmony_ci addc r9,r7,r9 3581cb0ef41Sopenharmony_ci adde r10,r8,r10 3591cb0ef41Sopenharmony_ci addze r11,r11 
3601cb0ef41Sopenharmony_ci $ST r9,`3*$BNSZ`(r3) #r[3]=c1 3611cb0ef41Sopenharmony_ci #sqr_add_c(a,2,c2,c3,c1); 3621cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 3631cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 3641cb0ef41Sopenharmony_ci addc r10,r7,r10 3651cb0ef41Sopenharmony_ci adde r11,r8,r11 3661cb0ef41Sopenharmony_ci addze r9,r0 3671cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,1,c2,c3,c1); 3681cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 3691cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3701cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3711cb0ef41Sopenharmony_ci addc r7,r7,r7 3721cb0ef41Sopenharmony_ci adde r8,r8,r8 3731cb0ef41Sopenharmony_ci addze r9,r9 3741cb0ef41Sopenharmony_ci 3751cb0ef41Sopenharmony_ci addc r10,r7,r10 3761cb0ef41Sopenharmony_ci adde r11,r8,r11 3771cb0ef41Sopenharmony_ci addze r9,r9 3781cb0ef41Sopenharmony_ci $ST r10,`4*$BNSZ`(r3) #r[4]=c2 3791cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,2,c3,c1,c2); 3801cb0ef41Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 3811cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 3821cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 3831cb0ef41Sopenharmony_ci addc r7,r7,r7 3841cb0ef41Sopenharmony_ci adde r8,r8,r8 3851cb0ef41Sopenharmony_ci addze r10,r0 3861cb0ef41Sopenharmony_ci 3871cb0ef41Sopenharmony_ci addc r11,r7,r11 3881cb0ef41Sopenharmony_ci adde r9,r8,r9 3891cb0ef41Sopenharmony_ci addze r10,r10 3901cb0ef41Sopenharmony_ci $ST r11,`5*$BNSZ`(r3) #r[5] = c3 3911cb0ef41Sopenharmony_ci #sqr_add_c(a,3,c1,c2,c3); 3921cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 3931cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 3941cb0ef41Sopenharmony_ci addc r9,r7,r9 3951cb0ef41Sopenharmony_ci adde r10,r8,r10 3961cb0ef41Sopenharmony_ci 3971cb0ef41Sopenharmony_ci $ST r9,`6*$BNSZ`(r3) #r[6]=c1 3981cb0ef41Sopenharmony_ci $ST r10,`7*$BNSZ`(r3) #r[7]=c2 3991cb0ef41Sopenharmony_ci blr 4001cb0ef41Sopenharmony_ci .long 0 4011cb0ef41Sopenharmony_ci .byte 0,12,0x14,0,0,0,2,0 4021cb0ef41Sopenharmony_ci .long 0 4031cb0ef41Sopenharmony_ci.size .bn_sqr_comba4,.-.bn_sqr_comba4 4041cb0ef41Sopenharmony_ci 
4051cb0ef41Sopenharmony_ci# 4061cb0ef41Sopenharmony_ci# NOTE: The following label name should be changed to 4071cb0ef41Sopenharmony_ci# "bn_sqr_comba8" i.e. remove the first dot 4081cb0ef41Sopenharmony_ci# for the gcc compiler. This should be automatically 4091cb0ef41Sopenharmony_ci# done in the build 4101cb0ef41Sopenharmony_ci# 4111cb0ef41Sopenharmony_ci 4121cb0ef41Sopenharmony_ci.align 4 4131cb0ef41Sopenharmony_ci.bn_sqr_comba8: 4141cb0ef41Sopenharmony_ci# 4151cb0ef41Sopenharmony_ci# This is an optimized version of the bn_sqr_comba8 routine. 4161cb0ef41Sopenharmony_ci# Tightly uses the adde instruction 4171cb0ef41Sopenharmony_ci# 4181cb0ef41Sopenharmony_ci# 4191cb0ef41Sopenharmony_ci# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 4201cb0ef41Sopenharmony_ci# r3 contains r 4211cb0ef41Sopenharmony_ci# r4 contains a 4221cb0ef41Sopenharmony_ci# 4231cb0ef41Sopenharmony_ci# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 4241cb0ef41Sopenharmony_ci# 4251cb0ef41Sopenharmony_ci# r5,r6 are the two BN_ULONGs being multiplied. 4261cb0ef41Sopenharmony_ci# r7,r8 are the results of the 32x32 giving 64 bit multiply. 4271cb0ef41Sopenharmony_ci# r9,r10, r11 are the equivalents of c1,c2, c3. 4281cb0ef41Sopenharmony_ci# 4291cb0ef41Sopenharmony_ci# Possible optimization of loading all 8 longs of a into registers 4301cb0ef41Sopenharmony_ci# doesn't provide any speedup 4311cb0ef41Sopenharmony_ci# 4321cb0ef41Sopenharmony_ci 4331cb0ef41Sopenharmony_ci xor r0,r0,r0 #set r0 = 0.Used in addze 4341cb0ef41Sopenharmony_ci #instructions below. 4351cb0ef41Sopenharmony_ci 4361cb0ef41Sopenharmony_ci #sqr_add_c(a,0,c1,c2,c3); 4371cb0ef41Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 4381cb0ef41Sopenharmony_ci $UMULL r9,r5,r5 #1st iteration: no carries. 
4391cb0ef41Sopenharmony_ci $UMULH r10,r5,r5 4401cb0ef41Sopenharmony_ci $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 4411cb0ef41Sopenharmony_ci #sqr_add_c2(a,1,0,c2,c3,c1); 4421cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 4431cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 4441cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 4451cb0ef41Sopenharmony_ci 4461cb0ef41Sopenharmony_ci addc r10,r7,r10 #add the two register number 4471cb0ef41Sopenharmony_ci adde r11,r8,r0 # (r8,r7) to the three register 4481cb0ef41Sopenharmony_ci addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 4491cb0ef41Sopenharmony_ci 4501cb0ef41Sopenharmony_ci addc r10,r7,r10 #add the two register number 4511cb0ef41Sopenharmony_ci adde r11,r8,r11 # (r8,r7) to the three register 4521cb0ef41Sopenharmony_ci addze r9,r9 # number (r9,r11,r10). 4531cb0ef41Sopenharmony_ci 4541cb0ef41Sopenharmony_ci $ST r10,`1*$BNSZ`(r3) # r[1]=c2 4551cb0ef41Sopenharmony_ci 4561cb0ef41Sopenharmony_ci #sqr_add_c(a,1,c3,c1,c2); 4571cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 4581cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 4591cb0ef41Sopenharmony_ci addc r11,r7,r11 4601cb0ef41Sopenharmony_ci adde r9,r8,r9 4611cb0ef41Sopenharmony_ci addze r10,r0 4621cb0ef41Sopenharmony_ci #sqr_add_c2(a,2,0,c3,c1,c2); 4631cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 4641cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 4651cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 4661cb0ef41Sopenharmony_ci 4671cb0ef41Sopenharmony_ci addc r11,r7,r11 4681cb0ef41Sopenharmony_ci adde r9,r8,r9 4691cb0ef41Sopenharmony_ci addze r10,r10 4701cb0ef41Sopenharmony_ci 4711cb0ef41Sopenharmony_ci addc r11,r7,r11 4721cb0ef41Sopenharmony_ci adde r9,r8,r9 4731cb0ef41Sopenharmony_ci addze r10,r10 4741cb0ef41Sopenharmony_ci 4751cb0ef41Sopenharmony_ci $ST r11,`2*$BNSZ`(r3) #r[2]=c3 4761cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,0,c1,c2,c3); 4771cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
4781cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 4791cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 4801cb0ef41Sopenharmony_ci 4811cb0ef41Sopenharmony_ci addc r9,r7,r9 4821cb0ef41Sopenharmony_ci adde r10,r8,r10 4831cb0ef41Sopenharmony_ci addze r11,r0 4841cb0ef41Sopenharmony_ci 4851cb0ef41Sopenharmony_ci addc r9,r7,r9 4861cb0ef41Sopenharmony_ci adde r10,r8,r10 4871cb0ef41Sopenharmony_ci addze r11,r11 4881cb0ef41Sopenharmony_ci #sqr_add_c2(a,2,1,c1,c2,c3); 4891cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 4901cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 4911cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 4921cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 4931cb0ef41Sopenharmony_ci 4941cb0ef41Sopenharmony_ci addc r9,r7,r9 4951cb0ef41Sopenharmony_ci adde r10,r8,r10 4961cb0ef41Sopenharmony_ci addze r11,r11 4971cb0ef41Sopenharmony_ci 4981cb0ef41Sopenharmony_ci addc r9,r7,r9 4991cb0ef41Sopenharmony_ci adde r10,r8,r10 5001cb0ef41Sopenharmony_ci addze r11,r11 5011cb0ef41Sopenharmony_ci 5021cb0ef41Sopenharmony_ci $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 5031cb0ef41Sopenharmony_ci #sqr_add_c(a,2,c2,c3,c1); 5041cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 5051cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 5061cb0ef41Sopenharmony_ci 5071cb0ef41Sopenharmony_ci addc r10,r7,r10 5081cb0ef41Sopenharmony_ci adde r11,r8,r11 5091cb0ef41Sopenharmony_ci addze r9,r0 5101cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,1,c2,c3,c1); 5111cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 5121cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5131cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5141cb0ef41Sopenharmony_ci 5151cb0ef41Sopenharmony_ci addc r10,r7,r10 5161cb0ef41Sopenharmony_ci adde r11,r8,r11 5171cb0ef41Sopenharmony_ci addze r9,r9 5181cb0ef41Sopenharmony_ci 5191cb0ef41Sopenharmony_ci addc r10,r7,r10 5201cb0ef41Sopenharmony_ci adde r11,r8,r11 5211cb0ef41Sopenharmony_ci addze r9,r9 5221cb0ef41Sopenharmony_ci #sqr_add_c2(a,4,0,c2,c3,c1); 5231cb0ef41Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 5241cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 5251cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 
5261cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5271cb0ef41Sopenharmony_ci 5281cb0ef41Sopenharmony_ci addc r10,r7,r10 5291cb0ef41Sopenharmony_ci adde r11,r8,r11 5301cb0ef41Sopenharmony_ci addze r9,r9 5311cb0ef41Sopenharmony_ci 5321cb0ef41Sopenharmony_ci addc r10,r7,r10 5331cb0ef41Sopenharmony_ci adde r11,r8,r11 5341cb0ef41Sopenharmony_ci addze r9,r9 5351cb0ef41Sopenharmony_ci $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 5361cb0ef41Sopenharmony_ci #sqr_add_c2(a,5,0,c3,c1,c2); 5371cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 5381cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5391cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5401cb0ef41Sopenharmony_ci 5411cb0ef41Sopenharmony_ci addc r11,r7,r11 5421cb0ef41Sopenharmony_ci adde r9,r8,r9 5431cb0ef41Sopenharmony_ci addze r10,r0 5441cb0ef41Sopenharmony_ci 5451cb0ef41Sopenharmony_ci addc r11,r7,r11 5461cb0ef41Sopenharmony_ci adde r9,r8,r9 5471cb0ef41Sopenharmony_ci addze r10,r10 5481cb0ef41Sopenharmony_ci #sqr_add_c2(a,4,1,c3,c1,c2); 5491cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 5501cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 5511cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5521cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5531cb0ef41Sopenharmony_ci 5541cb0ef41Sopenharmony_ci addc r11,r7,r11 5551cb0ef41Sopenharmony_ci adde r9,r8,r9 5561cb0ef41Sopenharmony_ci addze r10,r10 5571cb0ef41Sopenharmony_ci 5581cb0ef41Sopenharmony_ci addc r11,r7,r11 5591cb0ef41Sopenharmony_ci adde r9,r8,r9 5601cb0ef41Sopenharmony_ci addze r10,r10 5611cb0ef41Sopenharmony_ci #sqr_add_c2(a,3,2,c3,c1,c2); 5621cb0ef41Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 5631cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 5641cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5651cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5661cb0ef41Sopenharmony_ci 5671cb0ef41Sopenharmony_ci addc r11,r7,r11 5681cb0ef41Sopenharmony_ci adde r9,r8,r9 5691cb0ef41Sopenharmony_ci addze r10,r10 5701cb0ef41Sopenharmony_ci 5711cb0ef41Sopenharmony_ci addc r11,r7,r11 5721cb0ef41Sopenharmony_ci adde r9,r8,r9 5731cb0ef41Sopenharmony_ci addze r10,r10 
5741cb0ef41Sopenharmony_ci $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 5751cb0ef41Sopenharmony_ci #sqr_add_c(a,3,c1,c2,c3); 5761cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 5771cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 5781cb0ef41Sopenharmony_ci addc r9,r7,r9 5791cb0ef41Sopenharmony_ci adde r10,r8,r10 5801cb0ef41Sopenharmony_ci addze r11,r0 5811cb0ef41Sopenharmony_ci #sqr_add_c2(a,4,2,c1,c2,c3); 5821cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 5831cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5841cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5851cb0ef41Sopenharmony_ci 5861cb0ef41Sopenharmony_ci addc r9,r7,r9 5871cb0ef41Sopenharmony_ci adde r10,r8,r10 5881cb0ef41Sopenharmony_ci addze r11,r11 5891cb0ef41Sopenharmony_ci 5901cb0ef41Sopenharmony_ci addc r9,r7,r9 5911cb0ef41Sopenharmony_ci adde r10,r8,r10 5921cb0ef41Sopenharmony_ci addze r11,r11 5931cb0ef41Sopenharmony_ci #sqr_add_c2(a,5,1,c1,c2,c3); 5941cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 5951cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 5961cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 5971cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 5981cb0ef41Sopenharmony_ci 5991cb0ef41Sopenharmony_ci addc r9,r7,r9 6001cb0ef41Sopenharmony_ci adde r10,r8,r10 6011cb0ef41Sopenharmony_ci addze r11,r11 6021cb0ef41Sopenharmony_ci 6031cb0ef41Sopenharmony_ci addc r9,r7,r9 6041cb0ef41Sopenharmony_ci adde r10,r8,r10 6051cb0ef41Sopenharmony_ci addze r11,r11 6061cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,0,c1,c2,c3); 6071cb0ef41Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 6081cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 6091cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6101cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6111cb0ef41Sopenharmony_ci addc r9,r7,r9 6121cb0ef41Sopenharmony_ci adde r10,r8,r10 6131cb0ef41Sopenharmony_ci addze r11,r11 6141cb0ef41Sopenharmony_ci addc r9,r7,r9 6151cb0ef41Sopenharmony_ci adde r10,r8,r10 6161cb0ef41Sopenharmony_ci addze r11,r11 6171cb0ef41Sopenharmony_ci $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 6181cb0ef41Sopenharmony_ci #sqr_add_c2(a,7,0,c2,c3,c1); 6191cb0ef41Sopenharmony_ci $LD 
r6,`7*$BNSZ`(r4) 6201cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6211cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6221cb0ef41Sopenharmony_ci 6231cb0ef41Sopenharmony_ci addc r10,r7,r10 6241cb0ef41Sopenharmony_ci adde r11,r8,r11 6251cb0ef41Sopenharmony_ci addze r9,r0 6261cb0ef41Sopenharmony_ci addc r10,r7,r10 6271cb0ef41Sopenharmony_ci adde r11,r8,r11 6281cb0ef41Sopenharmony_ci addze r9,r9 6291cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,1,c2,c3,c1); 6301cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 6311cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 6321cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6331cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6341cb0ef41Sopenharmony_ci 6351cb0ef41Sopenharmony_ci addc r10,r7,r10 6361cb0ef41Sopenharmony_ci adde r11,r8,r11 6371cb0ef41Sopenharmony_ci addze r9,r9 6381cb0ef41Sopenharmony_ci addc r10,r7,r10 6391cb0ef41Sopenharmony_ci adde r11,r8,r11 6401cb0ef41Sopenharmony_ci addze r9,r9 6411cb0ef41Sopenharmony_ci #sqr_add_c2(a,5,2,c2,c3,c1); 6421cb0ef41Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 6431cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 6441cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6451cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6461cb0ef41Sopenharmony_ci addc r10,r7,r10 6471cb0ef41Sopenharmony_ci adde r11,r8,r11 6481cb0ef41Sopenharmony_ci addze r9,r9 6491cb0ef41Sopenharmony_ci addc r10,r7,r10 6501cb0ef41Sopenharmony_ci adde r11,r8,r11 6511cb0ef41Sopenharmony_ci addze r9,r9 6521cb0ef41Sopenharmony_ci #sqr_add_c2(a,4,3,c2,c3,c1); 6531cb0ef41Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 6541cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 6551cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6561cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6571cb0ef41Sopenharmony_ci 6581cb0ef41Sopenharmony_ci addc r10,r7,r10 6591cb0ef41Sopenharmony_ci adde r11,r8,r11 6601cb0ef41Sopenharmony_ci addze r9,r9 6611cb0ef41Sopenharmony_ci addc r10,r7,r10 6621cb0ef41Sopenharmony_ci adde r11,r8,r11 6631cb0ef41Sopenharmony_ci addze r9,r9 6641cb0ef41Sopenharmony_ci $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 6651cb0ef41Sopenharmony_ci 
#sqr_add_c(a,4,c3,c1,c2); 6661cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 6671cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 6681cb0ef41Sopenharmony_ci addc r11,r7,r11 6691cb0ef41Sopenharmony_ci adde r9,r8,r9 6701cb0ef41Sopenharmony_ci addze r10,r0 6711cb0ef41Sopenharmony_ci #sqr_add_c2(a,5,3,c3,c1,c2); 6721cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 6731cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6741cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6751cb0ef41Sopenharmony_ci addc r11,r7,r11 6761cb0ef41Sopenharmony_ci adde r9,r8,r9 6771cb0ef41Sopenharmony_ci addze r10,r10 6781cb0ef41Sopenharmony_ci addc r11,r7,r11 6791cb0ef41Sopenharmony_ci adde r9,r8,r9 6801cb0ef41Sopenharmony_ci addze r10,r10 6811cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,2,c3,c1,c2); 6821cb0ef41Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 6831cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 6841cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6851cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6861cb0ef41Sopenharmony_ci addc r11,r7,r11 6871cb0ef41Sopenharmony_ci adde r9,r8,r9 6881cb0ef41Sopenharmony_ci addze r10,r10 6891cb0ef41Sopenharmony_ci 6901cb0ef41Sopenharmony_ci addc r11,r7,r11 6911cb0ef41Sopenharmony_ci adde r9,r8,r9 6921cb0ef41Sopenharmony_ci addze r10,r10 6931cb0ef41Sopenharmony_ci #sqr_add_c2(a,7,1,c3,c1,c2); 6941cb0ef41Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 6951cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 6961cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 6971cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 6981cb0ef41Sopenharmony_ci addc r11,r7,r11 6991cb0ef41Sopenharmony_ci adde r9,r8,r9 7001cb0ef41Sopenharmony_ci addze r10,r10 7011cb0ef41Sopenharmony_ci addc r11,r7,r11 7021cb0ef41Sopenharmony_ci adde r9,r8,r9 7031cb0ef41Sopenharmony_ci addze r10,r10 7041cb0ef41Sopenharmony_ci $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 7051cb0ef41Sopenharmony_ci #sqr_add_c2(a,7,2,c1,c2,c3); 7061cb0ef41Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 7071cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7081cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7091cb0ef41Sopenharmony_ci 7101cb0ef41Sopenharmony_ci addc r9,r7,r9 
7111cb0ef41Sopenharmony_ci adde r10,r8,r10 7121cb0ef41Sopenharmony_ci addze r11,r0 7131cb0ef41Sopenharmony_ci addc r9,r7,r9 7141cb0ef41Sopenharmony_ci adde r10,r8,r10 7151cb0ef41Sopenharmony_ci addze r11,r11 7161cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,3,c1,c2,c3); 7171cb0ef41Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 7181cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 7191cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7201cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7211cb0ef41Sopenharmony_ci addc r9,r7,r9 7221cb0ef41Sopenharmony_ci adde r10,r8,r10 7231cb0ef41Sopenharmony_ci addze r11,r11 7241cb0ef41Sopenharmony_ci addc r9,r7,r9 7251cb0ef41Sopenharmony_ci adde r10,r8,r10 7261cb0ef41Sopenharmony_ci addze r11,r11 7271cb0ef41Sopenharmony_ci #sqr_add_c2(a,5,4,c1,c2,c3); 7281cb0ef41Sopenharmony_ci $LD r5,`4*$BNSZ`(r4) 7291cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 7301cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7311cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7321cb0ef41Sopenharmony_ci addc r9,r7,r9 7331cb0ef41Sopenharmony_ci adde r10,r8,r10 7341cb0ef41Sopenharmony_ci addze r11,r11 7351cb0ef41Sopenharmony_ci addc r9,r7,r9 7361cb0ef41Sopenharmony_ci adde r10,r8,r10 7371cb0ef41Sopenharmony_ci addze r11,r11 7381cb0ef41Sopenharmony_ci $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 7391cb0ef41Sopenharmony_ci #sqr_add_c(a,5,c2,c3,c1); 7401cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 7411cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 7421cb0ef41Sopenharmony_ci addc r10,r7,r10 7431cb0ef41Sopenharmony_ci adde r11,r8,r11 7441cb0ef41Sopenharmony_ci addze r9,r0 7451cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,4,c2,c3,c1); 7461cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 7471cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7481cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7491cb0ef41Sopenharmony_ci addc r10,r7,r10 7501cb0ef41Sopenharmony_ci adde r11,r8,r11 7511cb0ef41Sopenharmony_ci addze r9,r9 7521cb0ef41Sopenharmony_ci addc r10,r7,r10 7531cb0ef41Sopenharmony_ci adde r11,r8,r11 7541cb0ef41Sopenharmony_ci addze r9,r9 7551cb0ef41Sopenharmony_ci 
#sqr_add_c2(a,7,3,c2,c3,c1); 7561cb0ef41Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 7571cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 7581cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7591cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7601cb0ef41Sopenharmony_ci addc r10,r7,r10 7611cb0ef41Sopenharmony_ci adde r11,r8,r11 7621cb0ef41Sopenharmony_ci addze r9,r9 7631cb0ef41Sopenharmony_ci addc r10,r7,r10 7641cb0ef41Sopenharmony_ci adde r11,r8,r11 7651cb0ef41Sopenharmony_ci addze r9,r9 7661cb0ef41Sopenharmony_ci $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 7671cb0ef41Sopenharmony_ci #sqr_add_c2(a,7,4,c3,c1,c2); 7681cb0ef41Sopenharmony_ci $LD r5,`4*$BNSZ`(r4) 7691cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7701cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7711cb0ef41Sopenharmony_ci addc r11,r7,r11 7721cb0ef41Sopenharmony_ci adde r9,r8,r9 7731cb0ef41Sopenharmony_ci addze r10,r0 7741cb0ef41Sopenharmony_ci addc r11,r7,r11 7751cb0ef41Sopenharmony_ci adde r9,r8,r9 7761cb0ef41Sopenharmony_ci addze r10,r10 7771cb0ef41Sopenharmony_ci #sqr_add_c2(a,6,5,c3,c1,c2); 7781cb0ef41Sopenharmony_ci $LD r5,`5*$BNSZ`(r4) 7791cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 7801cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7811cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7821cb0ef41Sopenharmony_ci addc r11,r7,r11 7831cb0ef41Sopenharmony_ci adde r9,r8,r9 7841cb0ef41Sopenharmony_ci addze r10,r10 7851cb0ef41Sopenharmony_ci addc r11,r7,r11 7861cb0ef41Sopenharmony_ci adde r9,r8,r9 7871cb0ef41Sopenharmony_ci addze r10,r10 7881cb0ef41Sopenharmony_ci $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 7891cb0ef41Sopenharmony_ci #sqr_add_c(a,6,c1,c2,c3); 7901cb0ef41Sopenharmony_ci $UMULL r7,r6,r6 7911cb0ef41Sopenharmony_ci $UMULH r8,r6,r6 7921cb0ef41Sopenharmony_ci addc r9,r7,r9 7931cb0ef41Sopenharmony_ci adde r10,r8,r10 7941cb0ef41Sopenharmony_ci addze r11,r0 7951cb0ef41Sopenharmony_ci #sqr_add_c2(a,7,5,c1,c2,c3) 7961cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 7971cb0ef41Sopenharmony_ci $UMULL r7,r5,r6 7981cb0ef41Sopenharmony_ci $UMULH r8,r5,r6 7991cb0ef41Sopenharmony_ci addc 
r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

					#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
					#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9, `15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba8,.-.bn_sqr_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
#
# Computes the full 8-word product r[0..7] = a[0..3] * b[0..3] column
# by column: for each result word r[k] every partial product a[i]*b[j]
# with i+j == k is added into a three-word accumulator, the lowest
# accumulator word is stored to r[k], and the three accumulator
# registers then rotate roles for the next column.
#
# Register usage:
# r0		constant zero, so that "addze rX,r0" yields just the carry.
# r3		r (result, words 0..7 are written)
# r4		a (words 0..3 are read)
# r5		b (words 0..3 are read)
# r6, r7	the 2 BN_ULONGs being multiplied.
# r8, r9	low and high word of the double-width product, i.e. the
#		results of the UMULL and UMULH instructions.
# r10, r11, r12	the accumulator; they take turns as c1, c2, and c3.
#
# Every mul_add_c step is the same carry chain:
#	addc	adds the low product word and produces a carry,
#	adde	adds the high product word plus that carry,
#	addze	folds the resulting carry into the top accumulator word.
# The first step of a column uses "addze rX,r0" instead, which at the
# same time clears the top word left over from the previous column.
#
# A load is omitted whenever r6 or r7 still holds the required operand
# from the preceding step.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
	# Column 0: the accumulator is empty, so the product is taken as is.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	# Column 1. r12 has not been written yet, hence "adde r12,r9,r0".
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6, `1*$BNSZ`(r4)
	$LD	r7, `0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	# Column 2. r7 still holds b[0].
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	# Column 3. r6 still holds a[0].
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	# Column 4. r6 still holds a[3].
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	# Column 5. r7 still holds b[3].
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	# Column 6. r6 still holds a[3]. There is no addze step here:
	# r[7] is the last result word, so no third accumulator word
	# is kept.
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba4,.-.bn_mul_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.
10081cb0ef41Sopenharmony_ci 10091cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[0],c1,c2,c3); 10101cb0ef41Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) #a[0] 10111cb0ef41Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) #b[0] 10121cb0ef41Sopenharmony_ci $UMULL r10,r6,r7 10131cb0ef41Sopenharmony_ci $UMULH r11,r6,r7 10141cb0ef41Sopenharmony_ci $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 10151cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[1],c2,c3,c1); 10161cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 10171cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10181cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10191cb0ef41Sopenharmony_ci addc r11,r11,r8 10201cb0ef41Sopenharmony_ci addze r12,r9 # since we didn't set r12 to zero before. 10211cb0ef41Sopenharmony_ci addze r10,r0 10221cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[0],c2,c3,c1); 10231cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 10241cb0ef41Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 10251cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10261cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10271cb0ef41Sopenharmony_ci addc r11,r11,r8 10281cb0ef41Sopenharmony_ci adde r12,r12,r9 10291cb0ef41Sopenharmony_ci addze r10,r10 10301cb0ef41Sopenharmony_ci $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 10311cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[0],c3,c1,c2); 10321cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 10331cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10341cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10351cb0ef41Sopenharmony_ci addc r12,r12,r8 10361cb0ef41Sopenharmony_ci adde r10,r10,r9 10371cb0ef41Sopenharmony_ci addze r11,r0 10381cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[1],c3,c1,c2); 10391cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 10401cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 10411cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10421cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10431cb0ef41Sopenharmony_ci addc r12,r12,r8 10441cb0ef41Sopenharmony_ci adde r10,r10,r9 10451cb0ef41Sopenharmony_ci addze r11,r11 10461cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[2],c3,c1,c2); 10471cb0ef41Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 10481cb0ef41Sopenharmony_ci $LD 
r7,`2*$BNSZ`(r5) 10491cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10501cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10511cb0ef41Sopenharmony_ci addc r12,r12,r8 10521cb0ef41Sopenharmony_ci adde r10,r10,r9 10531cb0ef41Sopenharmony_ci addze r11,r11 10541cb0ef41Sopenharmony_ci $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 10551cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[3],c1,c2,c3); 10561cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 10571cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10581cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10591cb0ef41Sopenharmony_ci addc r10,r10,r8 10601cb0ef41Sopenharmony_ci adde r11,r11,r9 10611cb0ef41Sopenharmony_ci addze r12,r0 10621cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[2],c1,c2,c3); 10631cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 10641cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 10651cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10661cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10671cb0ef41Sopenharmony_ci addc r10,r10,r8 10681cb0ef41Sopenharmony_ci adde r11,r11,r9 10691cb0ef41Sopenharmony_ci addze r12,r12 10701cb0ef41Sopenharmony_ci 10711cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[1],c1,c2,c3); 10721cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 10731cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 10741cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10751cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10761cb0ef41Sopenharmony_ci addc r10,r10,r8 10771cb0ef41Sopenharmony_ci adde r11,r11,r9 10781cb0ef41Sopenharmony_ci addze r12,r12 10791cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[0],c1,c2,c3); 10801cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 10811cb0ef41Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 10821cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10831cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10841cb0ef41Sopenharmony_ci addc r10,r10,r8 10851cb0ef41Sopenharmony_ci adde r11,r11,r9 10861cb0ef41Sopenharmony_ci addze r12,r12 10871cb0ef41Sopenharmony_ci $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 10881cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[0],c2,c3,c1); 10891cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 10901cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 
10911cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 10921cb0ef41Sopenharmony_ci addc r11,r11,r8 10931cb0ef41Sopenharmony_ci adde r12,r12,r9 10941cb0ef41Sopenharmony_ci addze r10,r0 10951cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[1],c2,c3,c1); 10961cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 10971cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 10981cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 10991cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11001cb0ef41Sopenharmony_ci addc r11,r11,r8 11011cb0ef41Sopenharmony_ci adde r12,r12,r9 11021cb0ef41Sopenharmony_ci addze r10,r10 11031cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[2],c2,c3,c1); 11041cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 11051cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 11061cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11071cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11081cb0ef41Sopenharmony_ci addc r11,r11,r8 11091cb0ef41Sopenharmony_ci adde r12,r12,r9 11101cb0ef41Sopenharmony_ci addze r10,r10 11111cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[3],c2,c3,c1); 11121cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 11131cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 11141cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11151cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11161cb0ef41Sopenharmony_ci addc r11,r11,r8 11171cb0ef41Sopenharmony_ci adde r12,r12,r9 11181cb0ef41Sopenharmony_ci addze r10,r10 11191cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[4],c2,c3,c1); 11201cb0ef41Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 11211cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 11221cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11231cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11241cb0ef41Sopenharmony_ci addc r11,r11,r8 11251cb0ef41Sopenharmony_ci adde r12,r12,r9 11261cb0ef41Sopenharmony_ci addze r10,r10 11271cb0ef41Sopenharmony_ci $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 11281cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[5],c3,c1,c2); 11291cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 11301cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11311cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11321cb0ef41Sopenharmony_ci addc r12,r12,r8 
11331cb0ef41Sopenharmony_ci adde r10,r10,r9 11341cb0ef41Sopenharmony_ci addze r11,r0 11351cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[4],c3,c1,c2); 11361cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 11371cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 11381cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11391cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11401cb0ef41Sopenharmony_ci addc r12,r12,r8 11411cb0ef41Sopenharmony_ci adde r10,r10,r9 11421cb0ef41Sopenharmony_ci addze r11,r11 11431cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[3],c3,c1,c2); 11441cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 11451cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 11461cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11471cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11481cb0ef41Sopenharmony_ci addc r12,r12,r8 11491cb0ef41Sopenharmony_ci adde r10,r10,r9 11501cb0ef41Sopenharmony_ci addze r11,r11 11511cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[2],c3,c1,c2); 11521cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 11531cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 11541cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11551cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11561cb0ef41Sopenharmony_ci addc r12,r12,r8 11571cb0ef41Sopenharmony_ci adde r10,r10,r9 11581cb0ef41Sopenharmony_ci addze r11,r11 11591cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[1],c3,c1,c2); 11601cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 11611cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 11621cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11631cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11641cb0ef41Sopenharmony_ci addc r12,r12,r8 11651cb0ef41Sopenharmony_ci adde r10,r10,r9 11661cb0ef41Sopenharmony_ci addze r11,r11 11671cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[0],c3,c1,c2); 11681cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 11691cb0ef41Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 11701cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11711cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11721cb0ef41Sopenharmony_ci addc r12,r12,r8 11731cb0ef41Sopenharmony_ci adde r10,r10,r9 11741cb0ef41Sopenharmony_ci addze r11,r11 11751cb0ef41Sopenharmony_ci $ST 
r12,`5*$BNSZ`(r3) #r[5]=c3; 11761cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[0],c1,c2,c3); 11771cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 11781cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11791cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11801cb0ef41Sopenharmony_ci addc r10,r10,r8 11811cb0ef41Sopenharmony_ci adde r11,r11,r9 11821cb0ef41Sopenharmony_ci addze r12,r0 11831cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[1],c1,c2,c3); 11841cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 11851cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 11861cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11871cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11881cb0ef41Sopenharmony_ci addc r10,r10,r8 11891cb0ef41Sopenharmony_ci adde r11,r11,r9 11901cb0ef41Sopenharmony_ci addze r12,r12 11911cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[2],c1,c2,c3); 11921cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 11931cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 11941cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 11951cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 11961cb0ef41Sopenharmony_ci addc r10,r10,r8 11971cb0ef41Sopenharmony_ci adde r11,r11,r9 11981cb0ef41Sopenharmony_ci addze r12,r12 11991cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[3],c1,c2,c3); 12001cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 12011cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 12021cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12031cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12041cb0ef41Sopenharmony_ci addc r10,r10,r8 12051cb0ef41Sopenharmony_ci adde r11,r11,r9 12061cb0ef41Sopenharmony_ci addze r12,r12 12071cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[4],c1,c2,c3); 12081cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 12091cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 12101cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12111cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12121cb0ef41Sopenharmony_ci addc r10,r10,r8 12131cb0ef41Sopenharmony_ci adde r11,r11,r9 12141cb0ef41Sopenharmony_ci addze r12,r12 12151cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[5],c1,c2,c3); 12161cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 12171cb0ef41Sopenharmony_ci $LD 
r7,`5*$BNSZ`(r5) 12181cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12191cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12201cb0ef41Sopenharmony_ci addc r10,r10,r8 12211cb0ef41Sopenharmony_ci adde r11,r11,r9 12221cb0ef41Sopenharmony_ci addze r12,r12 12231cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[6],c1,c2,c3); 12241cb0ef41Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 12251cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 12261cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12271cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12281cb0ef41Sopenharmony_ci addc r10,r10,r8 12291cb0ef41Sopenharmony_ci adde r11,r11,r9 12301cb0ef41Sopenharmony_ci addze r12,r12 12311cb0ef41Sopenharmony_ci $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 12321cb0ef41Sopenharmony_ci #mul_add_c(a[0],b[7],c2,c3,c1); 12331cb0ef41Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 12341cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12351cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12361cb0ef41Sopenharmony_ci addc r11,r11,r8 12371cb0ef41Sopenharmony_ci adde r12,r12,r9 12381cb0ef41Sopenharmony_ci addze r10,r0 12391cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[6],c2,c3,c1); 12401cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 12411cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 12421cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12431cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12441cb0ef41Sopenharmony_ci addc r11,r11,r8 12451cb0ef41Sopenharmony_ci adde r12,r12,r9 12461cb0ef41Sopenharmony_ci addze r10,r10 12471cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[5],c2,c3,c1); 12481cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 12491cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 12501cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12511cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12521cb0ef41Sopenharmony_ci addc r11,r11,r8 12531cb0ef41Sopenharmony_ci adde r12,r12,r9 12541cb0ef41Sopenharmony_ci addze r10,r10 12551cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[4],c2,c3,c1); 12561cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 12571cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 12581cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12591cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 
12601cb0ef41Sopenharmony_ci addc r11,r11,r8 12611cb0ef41Sopenharmony_ci adde r12,r12,r9 12621cb0ef41Sopenharmony_ci addze r10,r10 12631cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[3],c2,c3,c1); 12641cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 12651cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 12661cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12671cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12681cb0ef41Sopenharmony_ci addc r11,r11,r8 12691cb0ef41Sopenharmony_ci adde r12,r12,r9 12701cb0ef41Sopenharmony_ci addze r10,r10 12711cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[2],c2,c3,c1); 12721cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 12731cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 12741cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12751cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12761cb0ef41Sopenharmony_ci addc r11,r11,r8 12771cb0ef41Sopenharmony_ci adde r12,r12,r9 12781cb0ef41Sopenharmony_ci addze r10,r10 12791cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[1],c2,c3,c1); 12801cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 12811cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 12821cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12831cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12841cb0ef41Sopenharmony_ci addc r11,r11,r8 12851cb0ef41Sopenharmony_ci adde r12,r12,r9 12861cb0ef41Sopenharmony_ci addze r10,r10 12871cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[0],c2,c3,c1); 12881cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 12891cb0ef41Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 12901cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12911cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 12921cb0ef41Sopenharmony_ci addc r11,r11,r8 12931cb0ef41Sopenharmony_ci adde r12,r12,r9 12941cb0ef41Sopenharmony_ci addze r10,r10 12951cb0ef41Sopenharmony_ci $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 12961cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[1],c3,c1,c2); 12971cb0ef41Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 12981cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 12991cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13001cb0ef41Sopenharmony_ci addc r12,r12,r8 13011cb0ef41Sopenharmony_ci adde r10,r10,r9 
13021cb0ef41Sopenharmony_ci addze r11,r0 13031cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[2],c3,c1,c2); 13041cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 13051cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 13061cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13071cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13081cb0ef41Sopenharmony_ci addc r12,r12,r8 13091cb0ef41Sopenharmony_ci adde r10,r10,r9 13101cb0ef41Sopenharmony_ci addze r11,r11 13111cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[3],c3,c1,c2); 13121cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 13131cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 13141cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13151cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13161cb0ef41Sopenharmony_ci addc r12,r12,r8 13171cb0ef41Sopenharmony_ci adde r10,r10,r9 13181cb0ef41Sopenharmony_ci addze r11,r11 13191cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[4],c3,c1,c2); 13201cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 13211cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 13221cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13231cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13241cb0ef41Sopenharmony_ci addc r12,r12,r8 13251cb0ef41Sopenharmony_ci adde r10,r10,r9 13261cb0ef41Sopenharmony_ci addze r11,r11 13271cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[5],c3,c1,c2); 13281cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 13291cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 13301cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13311cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13321cb0ef41Sopenharmony_ci addc r12,r12,r8 13331cb0ef41Sopenharmony_ci adde r10,r10,r9 13341cb0ef41Sopenharmony_ci addze r11,r11 13351cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[6],c3,c1,c2); 13361cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 13371cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 13381cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13391cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13401cb0ef41Sopenharmony_ci addc r12,r12,r8 13411cb0ef41Sopenharmony_ci adde r10,r10,r9 13421cb0ef41Sopenharmony_ci addze r11,r11 13431cb0ef41Sopenharmony_ci #mul_add_c(a[1],b[7],c3,c1,c2); 
13441cb0ef41Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 13451cb0ef41Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 13461cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13471cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13481cb0ef41Sopenharmony_ci addc r12,r12,r8 13491cb0ef41Sopenharmony_ci adde r10,r10,r9 13501cb0ef41Sopenharmony_ci addze r11,r11 13511cb0ef41Sopenharmony_ci $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 13521cb0ef41Sopenharmony_ci #mul_add_c(a[2],b[7],c1,c2,c3); 13531cb0ef41Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 13541cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13551cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13561cb0ef41Sopenharmony_ci addc r10,r10,r8 13571cb0ef41Sopenharmony_ci adde r11,r11,r9 13581cb0ef41Sopenharmony_ci addze r12,r0 13591cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[6],c1,c2,c3); 13601cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 13611cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 13621cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13631cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13641cb0ef41Sopenharmony_ci addc r10,r10,r8 13651cb0ef41Sopenharmony_ci adde r11,r11,r9 13661cb0ef41Sopenharmony_ci addze r12,r12 13671cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[5],c1,c2,c3); 13681cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 13691cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 13701cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13711cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13721cb0ef41Sopenharmony_ci addc r10,r10,r8 13731cb0ef41Sopenharmony_ci adde r11,r11,r9 13741cb0ef41Sopenharmony_ci addze r12,r12 13751cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[4],c1,c2,c3); 13761cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 13771cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 13781cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13791cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13801cb0ef41Sopenharmony_ci addc r10,r10,r8 13811cb0ef41Sopenharmony_ci adde r11,r11,r9 13821cb0ef41Sopenharmony_ci addze r12,r12 13831cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[3],c1,c2,c3); 13841cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 13851cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 
13861cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13871cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13881cb0ef41Sopenharmony_ci addc r10,r10,r8 13891cb0ef41Sopenharmony_ci adde r11,r11,r9 13901cb0ef41Sopenharmony_ci addze r12,r12 13911cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[2],c1,c2,c3); 13921cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 13931cb0ef41Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 13941cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 13951cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 13961cb0ef41Sopenharmony_ci addc r10,r10,r8 13971cb0ef41Sopenharmony_ci adde r11,r11,r9 13981cb0ef41Sopenharmony_ci addze r12,r12 13991cb0ef41Sopenharmony_ci $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 14001cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[3],c2,c3,c1); 14011cb0ef41Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 14021cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14031cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14041cb0ef41Sopenharmony_ci addc r11,r11,r8 14051cb0ef41Sopenharmony_ci adde r12,r12,r9 14061cb0ef41Sopenharmony_ci addze r10,r0 14071cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[4],c2,c3,c1); 14081cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 14091cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 14101cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14111cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14121cb0ef41Sopenharmony_ci addc r11,r11,r8 14131cb0ef41Sopenharmony_ci adde r12,r12,r9 14141cb0ef41Sopenharmony_ci addze r10,r10 14151cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[5],c2,c3,c1); 14161cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 14171cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 14181cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14191cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14201cb0ef41Sopenharmony_ci addc r11,r11,r8 14211cb0ef41Sopenharmony_ci adde r12,r12,r9 14221cb0ef41Sopenharmony_ci addze r10,r10 14231cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[6],c2,c3,c1); 14241cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 14251cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 14261cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14271cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 
14281cb0ef41Sopenharmony_ci addc r11,r11,r8 14291cb0ef41Sopenharmony_ci adde r12,r12,r9 14301cb0ef41Sopenharmony_ci addze r10,r10 14311cb0ef41Sopenharmony_ci #mul_add_c(a[3],b[7],c2,c3,c1); 14321cb0ef41Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 14331cb0ef41Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 14341cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14351cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14361cb0ef41Sopenharmony_ci addc r11,r11,r8 14371cb0ef41Sopenharmony_ci adde r12,r12,r9 14381cb0ef41Sopenharmony_ci addze r10,r10 14391cb0ef41Sopenharmony_ci $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 14401cb0ef41Sopenharmony_ci #mul_add_c(a[4],b[7],c3,c1,c2); 14411cb0ef41Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 14421cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14431cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14441cb0ef41Sopenharmony_ci addc r12,r12,r8 14451cb0ef41Sopenharmony_ci adde r10,r10,r9 14461cb0ef41Sopenharmony_ci addze r11,r0 14471cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[6],c3,c1,c2); 14481cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 14491cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 14501cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14511cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14521cb0ef41Sopenharmony_ci addc r12,r12,r8 14531cb0ef41Sopenharmony_ci adde r10,r10,r9 14541cb0ef41Sopenharmony_ci addze r11,r11 14551cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[5],c3,c1,c2); 14561cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 14571cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 14581cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14591cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14601cb0ef41Sopenharmony_ci addc r12,r12,r8 14611cb0ef41Sopenharmony_ci adde r10,r10,r9 14621cb0ef41Sopenharmony_ci addze r11,r11 14631cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[4],c3,c1,c2); 14641cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 14651cb0ef41Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 14661cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14671cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14681cb0ef41Sopenharmony_ci addc r12,r12,r8 14691cb0ef41Sopenharmony_ci adde r10,r10,r9 
14701cb0ef41Sopenharmony_ci addze r11,r11 14711cb0ef41Sopenharmony_ci $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 14721cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[5],c1,c2,c3); 14731cb0ef41Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 14741cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14751cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14761cb0ef41Sopenharmony_ci addc r10,r10,r8 14771cb0ef41Sopenharmony_ci adde r11,r11,r9 14781cb0ef41Sopenharmony_ci addze r12,r0 14791cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[6],c1,c2,c3); 14801cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 14811cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 14821cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14831cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14841cb0ef41Sopenharmony_ci addc r10,r10,r8 14851cb0ef41Sopenharmony_ci adde r11,r11,r9 14861cb0ef41Sopenharmony_ci addze r12,r12 14871cb0ef41Sopenharmony_ci #mul_add_c(a[5],b[7],c1,c2,c3); 14881cb0ef41Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 14891cb0ef41Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 14901cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14911cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 14921cb0ef41Sopenharmony_ci addc r10,r10,r8 14931cb0ef41Sopenharmony_ci adde r11,r11,r9 14941cb0ef41Sopenharmony_ci addze r12,r12 14951cb0ef41Sopenharmony_ci $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 14961cb0ef41Sopenharmony_ci #mul_add_c(a[6],b[7],c2,c3,c1); 14971cb0ef41Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 14981cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 14991cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 15001cb0ef41Sopenharmony_ci addc r11,r11,r8 15011cb0ef41Sopenharmony_ci adde r12,r12,r9 15021cb0ef41Sopenharmony_ci addze r10,r0 15031cb0ef41Sopenharmony_ci #mul_add_c(a[7],b[6],c2,c3,c1); 15041cb0ef41Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 15051cb0ef41Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 15061cb0ef41Sopenharmony_ci $UMULL r8,r6,r7 15071cb0ef41Sopenharmony_ci $UMULH r9,r6,r7 15081cb0ef41Sopenharmony_ci addc r11,r11,r8 15091cb0ef41Sopenharmony_ci adde r12,r12,r9 15101cb0ef41Sopenharmony_ci addze r10,r10 15111cb0ef41Sopenharmony_ci $ST r11,`13*$BNSZ`(r3) 
#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba8,.-.bn_mul_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Computes r[i] = a[i] - b[i] - borrow for i = 0..n-1 and
#	returns the final borrow (0 or 1) in r3. The borrow is kept
#	in the carry bit in inverted form (carry set = no borrow).
#
#	Note:	No loop unrolling done since this is not a performance
#		critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ	# pre-decrement the three pointers: the
	addi	r3,r3,-$BNSZ	# update-form loads/stores below advance
	addi	r5,r5,-$BNSZ	# them before each access.
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_sub_words,.-.bn_sub_words

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Computes r[i] = a[i] + b[i] + carry for i = 0..n-1 and
#	returns the final carry (0 or 1) in r3.
#
#	Note:	No loop unrolling done since this is not a performance
#		critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#	(Yes: bdnz decrements the count register before testing it,
#	so entering the loop with a count of zero would not stop
#	after zero iterations.)
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8	# r8 = a[i] + b[i] + carry
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0		#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_add_words,.-.bn_add_words

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of call to num_bits_word. Since this was compiled
#	only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d
#
#	Returns in r3 the quotient of the double word (h,l) by d,
#	assembled from two half-word quotient digits (one per pass
#	of the outer loop). Returns -1 when d is 0.

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	# NOTE(review): a TO field of 16 selects the signed less-than
	# trap condition, and r9 was just shifted right by at least one
	# bit, so r9 < r0 (= 0) looks unreachable here -- confirm that
	# this check can fire as the comment below intends.
	$TR	16,r9,r0		#if there're, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = leading zeros of d, i.e. the
					#normalization shift count i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit	# cr0 still holds the addic. test above
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0		# ret |= q (low half-word digit)
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_div_words,.-.bn_div_words

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	For each of the n input words two result words are stored:
#	first r7 (the UMULL result), then r8 (the UMULH result).
#
#	No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_sqr_words,.-.bn_sqr_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Stores rp[i] = low word of (ap[i]*w + carry) and returns the final
# carry word in r3. The main loop handles four words per iteration;
# the code after Lppcasm_mw_REM handles the remaining num & 3 words.
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0		# return the carry word
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_words,.-.bn_mul_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Stores rp[i] = low word of (rp[i] + ap[i]*w + carry) and returns the
# final carry word in r3. The main loop handles four words per
# iteration; the code after Lppcasm_maw_leftover handles the remaining
# num & 3 words.
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0		# return the carry word
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_add_words,.-.bn_mul_add_words
	.align	4
EOF
# Replace every backtick-quoted span in the template with the result of
# evaluating it as Perl, so the arithmetic written between backticks
# becomes a literal number in the emitted assembly.
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";