1e1051a39Sopenharmony_ci#! /usr/bin/env perl 2e1051a39Sopenharmony_ci# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci# 4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License"). You may not use 5e1051a39Sopenharmony_ci# this file except in compliance with the License. You can obtain a copy 6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at 7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html 8e1051a39Sopenharmony_ci 9e1051a39Sopenharmony_ci# Implemented as a Perl wrapper as we want to support several different 10e1051a39Sopenharmony_ci# architectures with a single file. We pick up the target based on the 11e1051a39Sopenharmony_ci# flavour argument we are given. 12e1051a39Sopenharmony_ci# 13e1051a39Sopenharmony_ci# It should be noted though that this perl code is nothing like 14e1051a39Sopenharmony_ci# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much 15e1051a39Sopenharmony_ci# as a pre-processor to cover for platform differences in name decoration, 16e1051a39Sopenharmony_ci# linker tables, 32-/64-bit instruction sets... 17e1051a39Sopenharmony_ci# 18e1051a39Sopenharmony_ci# As you might know there are several PowerPC ABIs in use. Most notably 19e1051a39Sopenharmony_ci# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs 20e1051a39Sopenharmony_ci# are similar enough to implement leaf(!) functions, which would be ABI 21e1051a39Sopenharmony_ci# neutral. And that's what you find here: ABI neutral leaf functions. 22e1051a39Sopenharmony_ci# In case you wonder what that is... 23e1051a39Sopenharmony_ci# 24e1051a39Sopenharmony_ci# AIX performance 25e1051a39Sopenharmony_ci# 26e1051a39Sopenharmony_ci# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e. 
 27e1051a39Sopenharmony_ci# 28e1051a39Sopenharmony_ci# The following is the performance of 32-bit compiler 29e1051a39Sopenharmony_ci# generated code: 30e1051a39Sopenharmony_ci# 31e1051a39Sopenharmony_ci# OpenSSL 0.9.6c 21 dec 2001 32e1051a39Sopenharmony_ci# built on: Tue Jun 11 11:06:51 EDT 2002 33e1051a39Sopenharmony_ci# options:bn(64,32) ... 34e1051a39Sopenharmony_ci#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3 35e1051a39Sopenharmony_ci# sign verify sign/s verify/s 36e1051a39Sopenharmony_ci#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6 37e1051a39Sopenharmony_ci#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5 38e1051a39Sopenharmony_ci#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1 39e1051a39Sopenharmony_ci#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4 40e1051a39Sopenharmony_ci#dsa 512 bits 0.0087s 0.0106s 114.3 94.5 41e1051a39Sopenharmony_ci#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0 42e1051a39Sopenharmony_ci# 43e1051a39Sopenharmony_ci# Same benchmark with this assembler code: 44e1051a39Sopenharmony_ci# 45e1051a39Sopenharmony_ci#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2 46e1051a39Sopenharmony_ci#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1 47e1051a39Sopenharmony_ci#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2 48e1051a39Sopenharmony_ci#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7 49e1051a39Sopenharmony_ci#dsa 512 bits 0.0052s 0.0062s 191.6 162.0 50e1051a39Sopenharmony_ci#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5 51e1051a39Sopenharmony_ci# 52e1051a39Sopenharmony_ci# Number of operations increases by almost 75% 53e1051a39Sopenharmony_ci# 54e1051a39Sopenharmony_ci# Here are performance numbers for 64-bit compiler 55e1051a39Sopenharmony_ci# generated code: 56e1051a39Sopenharmony_ci# 57e1051a39Sopenharmony_ci# OpenSSL 0.9.6g [engine] 9 Aug 2002 58e1051a39Sopenharmony_ci# built on: Fri Apr 18 16:59:20 EDT 2003 59e1051a39Sopenharmony_ci# options:bn(64,64) ... 
 60e1051a39Sopenharmony_ci# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3 61e1051a39Sopenharmony_ci# sign verify sign/s verify/s 62e1051a39Sopenharmony_ci#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4 63e1051a39Sopenharmony_ci#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7 64e1051a39Sopenharmony_ci#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0 65e1051a39Sopenharmony_ci#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1 66e1051a39Sopenharmony_ci#dsa 512 bits 0.0026s 0.0032s 382.5 313.7 67e1051a39Sopenharmony_ci#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6 68e1051a39Sopenharmony_ci# 69e1051a39Sopenharmony_ci# Same benchmark with this assembler code: 70e1051a39Sopenharmony_ci# 71e1051a39Sopenharmony_ci#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7 72e1051a39Sopenharmony_ci#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3 73e1051a39Sopenharmony_ci#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5 74e1051a39Sopenharmony_ci#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0 75e1051a39Sopenharmony_ci#dsa 512 bits 0.0016s 0.0020s 610.7 507.1 76e1051a39Sopenharmony_ci#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2 77e1051a39Sopenharmony_ci# 78e1051a39Sopenharmony_ci# Again, performance increases by about 75% 79e1051a39Sopenharmony_ci# 80e1051a39Sopenharmony_ci# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code) 81e1051a39Sopenharmony_ci# OpenSSL 0.9.7c 30 Sep 2003 82e1051a39Sopenharmony_ci# 83e1051a39Sopenharmony_ci# Original code. 
84e1051a39Sopenharmony_ci# 85e1051a39Sopenharmony_ci#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5 86e1051a39Sopenharmony_ci#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1 87e1051a39Sopenharmony_ci#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4 88e1051a39Sopenharmony_ci#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4 89e1051a39Sopenharmony_ci#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5 90e1051a39Sopenharmony_ci#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7 91e1051a39Sopenharmony_ci#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6 92e1051a39Sopenharmony_ci# 93e1051a39Sopenharmony_ci# Same benchmark with this assembler code: 94e1051a39Sopenharmony_ci# 95e1051a39Sopenharmony_ci#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9 96e1051a39Sopenharmony_ci#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6 97e1051a39Sopenharmony_ci#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5 98e1051a39Sopenharmony_ci#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6 99e1051a39Sopenharmony_ci#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2 100e1051a39Sopenharmony_ci#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2 101e1051a39Sopenharmony_ci#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 102e1051a39Sopenharmony_ci# 103e1051a39Sopenharmony_ci# Performance increase of ~60% 104e1051a39Sopenharmony_ci# Based on submission from Suresh N. Chari of IBM 105e1051a39Sopenharmony_ci 106e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension) 107e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file 108e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 109e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; 110e1051a39Sopenharmony_ci 111e1051a39Sopenharmony_ciif ($flavour =~ /32/) { 112e1051a39Sopenharmony_ci $BITS= 32; 113e1051a39Sopenharmony_ci $BNSZ= $BITS/8; 114e1051a39Sopenharmony_ci $ISA= "\"ppc\""; 115e1051a39Sopenharmony_ci 116e1051a39Sopenharmony_ci $LD= "lwz"; # load 117e1051a39Sopenharmony_ci $LDU= "lwzu"; # load and update 118e1051a39Sopenharmony_ci $ST= "stw"; # store 119e1051a39Sopenharmony_ci $STU= "stwu"; # store and update 120e1051a39Sopenharmony_ci $UMULL= "mullw"; # unsigned multiply low 121e1051a39Sopenharmony_ci $UMULH= "mulhwu"; # unsigned multiply high 122e1051a39Sopenharmony_ci $UDIV= "divwu"; # unsigned divide 123e1051a39Sopenharmony_ci $UCMPI= "cmplwi"; # unsigned compare with immediate 124e1051a39Sopenharmony_ci $UCMP= "cmplw"; # unsigned compare 125e1051a39Sopenharmony_ci $CNTLZ= "cntlzw"; # count leading zeros 126e1051a39Sopenharmony_ci $SHL= "slw"; # shift left 127e1051a39Sopenharmony_ci $SHR= "srw"; # unsigned shift right 128e1051a39Sopenharmony_ci $SHRI= "srwi"; # unsigned shift right by immediate 129e1051a39Sopenharmony_ci $SHLI= "slwi"; # shift left by immediate 130e1051a39Sopenharmony_ci $CLRU= "clrlwi"; # clear upper bits 131e1051a39Sopenharmony_ci $INSR= "insrwi"; # insert right 132e1051a39Sopenharmony_ci $ROTL= "rotlwi"; # rotate left by immediate 133e1051a39Sopenharmony_ci $TR= "tw"; # conditional trap 134e1051a39Sopenharmony_ci} elsif ($flavour =~ /64/) { 135e1051a39Sopenharmony_ci $BITS= 64; 136e1051a39Sopenharmony_ci $BNSZ= $BITS/8; 137e1051a39Sopenharmony_ci $ISA= "\"ppc64\""; 138e1051a39Sopenharmony_ci 139e1051a39Sopenharmony_ci # same as above, but 64-bit mnemonics... 
140e1051a39Sopenharmony_ci $LD= "ld"; # load 141e1051a39Sopenharmony_ci $LDU= "ldu"; # load and update 142e1051a39Sopenharmony_ci $ST= "std"; # store 143e1051a39Sopenharmony_ci $STU= "stdu"; # store and update 144e1051a39Sopenharmony_ci $UMULL= "mulld"; # unsigned multiply low 145e1051a39Sopenharmony_ci $UMULH= "mulhdu"; # unsigned multiply high 146e1051a39Sopenharmony_ci $UDIV= "divdu"; # unsigned divide 147e1051a39Sopenharmony_ci $UCMPI= "cmpldi"; # unsigned compare with immediate 148e1051a39Sopenharmony_ci $UCMP= "cmpld"; # unsigned compare 149e1051a39Sopenharmony_ci $CNTLZ= "cntlzd"; # count leading zeros 150e1051a39Sopenharmony_ci $SHL= "sld"; # shift left 151e1051a39Sopenharmony_ci $SHR= "srd"; # unsigned shift right 152e1051a39Sopenharmony_ci $SHRI= "srdi"; # unsigned shift right by immediate 153e1051a39Sopenharmony_ci $SHLI= "sldi"; # shift left by immediate 154e1051a39Sopenharmony_ci $CLRU= "clrldi"; # clear upper bits 155e1051a39Sopenharmony_ci $INSR= "insrdi"; # insert right 156e1051a39Sopenharmony_ci $ROTL= "rotldi"; # rotate left by immediate 157e1051a39Sopenharmony_ci $TR= "td"; # conditional trap 158e1051a39Sopenharmony_ci} else { die "nonsense $flavour"; } 159e1051a39Sopenharmony_ci 160e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 161e1051a39Sopenharmony_ci( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 162e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 163e1051a39Sopenharmony_cidie "can't locate ppc-xlate.pl"; 164e1051a39Sopenharmony_ci 165e1051a39Sopenharmony_ciopen STDOUT,"| $^X $xlate $flavour \"$output\"" 166e1051a39Sopenharmony_ci or die "can't call $xlate: $!"; 167e1051a39Sopenharmony_ci 168e1051a39Sopenharmony_ci$data=<<EOF; 169e1051a39Sopenharmony_ci#-------------------------------------------------------------------- 170e1051a39Sopenharmony_ci# 171e1051a39Sopenharmony_ci# 172e1051a39Sopenharmony_ci# 173e1051a39Sopenharmony_ci# 174e1051a39Sopenharmony_ci# File: ppc32.s 
175e1051a39Sopenharmony_ci# 176e1051a39Sopenharmony_ci# Created by: Suresh Chari 177e1051a39Sopenharmony_ci# IBM Thomas J. Watson Research Library 178e1051a39Sopenharmony_ci# Hawthorne, NY 179e1051a39Sopenharmony_ci# 180e1051a39Sopenharmony_ci# 181e1051a39Sopenharmony_ci# Description: Optimized assembly routines for OpenSSL crypto 182e1051a39Sopenharmony_ci# on the 32 bitPowerPC platform. 183e1051a39Sopenharmony_ci# 184e1051a39Sopenharmony_ci# 185e1051a39Sopenharmony_ci# Version History 186e1051a39Sopenharmony_ci# 187e1051a39Sopenharmony_ci# 2. Fixed bn_add,bn_sub and bn_div_words, added comments, 188e1051a39Sopenharmony_ci# cleaned up code. Also made a single version which can 189e1051a39Sopenharmony_ci# be used for both the AIX and Linux compilers. See NOTE 190e1051a39Sopenharmony_ci# below. 191e1051a39Sopenharmony_ci# 12/05/03 Suresh Chari 192e1051a39Sopenharmony_ci# (with lots of help from) Andy Polyakov 193e1051a39Sopenharmony_ci## 194e1051a39Sopenharmony_ci# 1. Initial version 10/20/02 Suresh Chari 195e1051a39Sopenharmony_ci# 196e1051a39Sopenharmony_ci# 197e1051a39Sopenharmony_ci# The following file works for the xlc,cc 198e1051a39Sopenharmony_ci# and gcc compilers. 199e1051a39Sopenharmony_ci# 200e1051a39Sopenharmony_ci# NOTE: To get the file to link correctly with the gcc compiler 201e1051a39Sopenharmony_ci# you have to change the names of the routines and remove 202e1051a39Sopenharmony_ci# the first .(dot) character. This should automatically 203e1051a39Sopenharmony_ci# be done in the build process. 
204e1051a39Sopenharmony_ci# 205e1051a39Sopenharmony_ci# Hand optimized assembly code for the following routines 206e1051a39Sopenharmony_ci# 207e1051a39Sopenharmony_ci# bn_sqr_comba4 208e1051a39Sopenharmony_ci# bn_sqr_comba8 209e1051a39Sopenharmony_ci# bn_mul_comba4 210e1051a39Sopenharmony_ci# bn_mul_comba8 211e1051a39Sopenharmony_ci# bn_sub_words 212e1051a39Sopenharmony_ci# bn_add_words 213e1051a39Sopenharmony_ci# bn_div_words 214e1051a39Sopenharmony_ci# bn_sqr_words 215e1051a39Sopenharmony_ci# bn_mul_words 216e1051a39Sopenharmony_ci# bn_mul_add_words 217e1051a39Sopenharmony_ci# 218e1051a39Sopenharmony_ci# NOTE: It is possible to optimize this code more for 219e1051a39Sopenharmony_ci# specific PowerPC or Power architectures. On the Northstar 220e1051a39Sopenharmony_ci# architecture the optimizations in this file do 221e1051a39Sopenharmony_ci# NOT provide much improvement. 222e1051a39Sopenharmony_ci# 223e1051a39Sopenharmony_ci# If you have comments or suggestions to improve code send 224e1051a39Sopenharmony_ci# me a note at schari\@us.ibm.com 225e1051a39Sopenharmony_ci# 226e1051a39Sopenharmony_ci#-------------------------------------------------------------------------- 227e1051a39Sopenharmony_ci# 228e1051a39Sopenharmony_ci# Defines to be used in the assembly code. 229e1051a39Sopenharmony_ci# 230e1051a39Sopenharmony_ci#.set r0,0 # we use it as storage for value of 0 231e1051a39Sopenharmony_ci#.set SP,1 # preserved 232e1051a39Sopenharmony_ci#.set RTOC,2 # preserved 233e1051a39Sopenharmony_ci#.set r3,3 # 1st argument/return value 234e1051a39Sopenharmony_ci#.set r4,4 # 2nd argument/volatile register 235e1051a39Sopenharmony_ci#.set r5,5 # 3rd argument/volatile register 236e1051a39Sopenharmony_ci#.set r6,6 # ... 
 #.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13 # not used, nor any other "below" it...

# Declare function names to be global
# NOTE: For gcc these names MUST be changed to remove
# the first . i.e. for example change ".bn_sqr_comba4"
# to "bn_sqr_comba4". This should be automatically done
# in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"
	.text

#
# NOTE: The following label name should be changed to
# "bn_sqr_comba4" i.e. remove the first dot
# for the gcc compiler. This should be automatically
# done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Squares the 4-word number a[0..3] and writes the 8-word result to
# r[0..7]. The result is produced one word (column) at a time, lowest
# word first. Each column is the sum of the products a[i]*a[j] with
# i+j equal to the column index. A product with i != j occurs twice
# in the square, so it is doubled before being added (sqr_add_c2).
# A product with i == j is added once (sqr_add_c).
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r0 is kept at 0 so that "addze rX,r0" yields just the carry.
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the low and high word of their double-width product.
# r9,r10,r11 are the equivalents of c1,c2,c3: a three-word running
# sum whose roles rotate by one register after every stored column.
# The tag in front of each group names the roles for that group.
#
# The instruction order inside each group matters: every adde/addze
# consumes the carry left by the instruction just before it.
#
	xor	r0,r0,r0		# set r0 = 0. Used in the addze
					# instructions below

	# ---- column 0 ----
					#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)	# r5 = a[0]
	$UMULL	r9,r5,r5		# c1 = low word of a[0]*a[0]
	$UMULH	r10,r5,r5		# c2 = high word. First product, so
					# nothing to add: c1=c2=c3=0.
					# Note c3(r11) is NOT cleared here,
					# it gets its first value from the
					# addze r11,r8 further down.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;

	# ---- column 1 ----
					# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)	# r6 = a[1]
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9= r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
	addze	r11,r8			# r11 = r8 + carry. This is the
					# first write to c3.
	addze	r9,r9			# carry out of c3 goes to r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;

	# ---- column 2 ----
					#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6		# a[1]*a[1], added once
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0			# restart r10 as the new top word
					#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)	# r6 = a[2]. r5 is still a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# double the product
	adde	r8,r8,r8
	addze	r10,r10			# bit shifted out by the doubling

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3

	# ---- column 3 ----
					#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	# r6 = a[3]. r5 is still a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0			# restart r11 as the new top word

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)	# r5 = a[1]
	$LD	r6,`2*$BNSZ`(r4)	# r6 = a[2]
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1

	# ---- column 4 ----
					#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6		# a[2]*a[2], added once
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0			# restart r9 as the new top word
					#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)	# r6 = a[3]. r5 is still a[1].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2

	# ---- column 5 ----
					#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)	# r5 = a[2]. r6 is still a[3].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0			# restart r10 as the new top word

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5] = c3

	# ---- columns 6 and 7 ----
					#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6		# a[3]*a[3], the last product
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10		# no third word is kept here

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	# NOTE(review): the data words after the return are not explained
	# anywhere in this file. They look like a per-function descriptor
	# (traceback table) - confirm before changing them.
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba4,.-.bn_sqr_comba4

405e1051a39Sopenharmony_ci# 406e1051a39Sopenharmony_ci# NOTE: The following label name should be changed to 407e1051a39Sopenharmony_ci# "bn_sqr_comba8" i.e. remove the first dot 408e1051a39Sopenharmony_ci# for the gcc compiler. This should be automatically 409e1051a39Sopenharmony_ci# done in the build 410e1051a39Sopenharmony_ci# 411e1051a39Sopenharmony_ci 412e1051a39Sopenharmony_ci.align 4 413e1051a39Sopenharmony_ci.bn_sqr_comba8: 414e1051a39Sopenharmony_ci# 415e1051a39Sopenharmony_ci# This is an optimized version of the bn_sqr_comba8 routine. 416e1051a39Sopenharmony_ci# Tightly uses the adde instruction 417e1051a39Sopenharmony_ci# 418e1051a39Sopenharmony_ci# 419e1051a39Sopenharmony_ci# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 420e1051a39Sopenharmony_ci# r3 contains r 421e1051a39Sopenharmony_ci# r4 contains a 422e1051a39Sopenharmony_ci# 423e1051a39Sopenharmony_ci# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: 424e1051a39Sopenharmony_ci# 425e1051a39Sopenharmony_ci# r5,r6 are the two BN_ULONGs being multiplied. 426e1051a39Sopenharmony_ci# r7,r8 are the results of the 32x32 giving 64 bit multiply. 427e1051a39Sopenharmony_ci# r9,r10, r11 are the equivalents of c1,c2, c3. 428e1051a39Sopenharmony_ci# 429e1051a39Sopenharmony_ci# Possible optimization of loading all 8 longs of a into registers 430e1051a39Sopenharmony_ci# doesn't provide any speedup 431e1051a39Sopenharmony_ci# 432e1051a39Sopenharmony_ci 433e1051a39Sopenharmony_ci xor r0,r0,r0 #set r0 = 0.Used in addze 434e1051a39Sopenharmony_ci #instructions below. 435e1051a39Sopenharmony_ci 436e1051a39Sopenharmony_ci #sqr_add_c(a,0,c1,c2,c3); 437e1051a39Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 438e1051a39Sopenharmony_ci $UMULL r9,r5,r5 #1st iteration: no carries. 
439e1051a39Sopenharmony_ci $UMULH r10,r5,r5 440e1051a39Sopenharmony_ci $ST r9,`0*$BNSZ`(r3) # r[0]=c1; 441e1051a39Sopenharmony_ci #sqr_add_c2(a,1,0,c2,c3,c1); 442e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 443e1051a39Sopenharmony_ci $UMULL r7,r5,r6 444e1051a39Sopenharmony_ci $UMULH r8,r5,r6 445e1051a39Sopenharmony_ci 446e1051a39Sopenharmony_ci addc r10,r7,r10 #add the two register number 447e1051a39Sopenharmony_ci adde r11,r8,r0 # (r8,r7) to the three register 448e1051a39Sopenharmony_ci addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 449e1051a39Sopenharmony_ci 450e1051a39Sopenharmony_ci addc r10,r7,r10 #add the two register number 451e1051a39Sopenharmony_ci adde r11,r8,r11 # (r8,r7) to the three register 452e1051a39Sopenharmony_ci addze r9,r9 # number (r9,r11,r10). 453e1051a39Sopenharmony_ci 454e1051a39Sopenharmony_ci $ST r10,`1*$BNSZ`(r3) # r[1]=c2 455e1051a39Sopenharmony_ci 456e1051a39Sopenharmony_ci #sqr_add_c(a,1,c3,c1,c2); 457e1051a39Sopenharmony_ci $UMULL r7,r6,r6 458e1051a39Sopenharmony_ci $UMULH r8,r6,r6 459e1051a39Sopenharmony_ci addc r11,r7,r11 460e1051a39Sopenharmony_ci adde r9,r8,r9 461e1051a39Sopenharmony_ci addze r10,r0 462e1051a39Sopenharmony_ci #sqr_add_c2(a,2,0,c3,c1,c2); 463e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 464e1051a39Sopenharmony_ci $UMULL r7,r5,r6 465e1051a39Sopenharmony_ci $UMULH r8,r5,r6 466e1051a39Sopenharmony_ci 467e1051a39Sopenharmony_ci addc r11,r7,r11 468e1051a39Sopenharmony_ci adde r9,r8,r9 469e1051a39Sopenharmony_ci addze r10,r10 470e1051a39Sopenharmony_ci 471e1051a39Sopenharmony_ci addc r11,r7,r11 472e1051a39Sopenharmony_ci adde r9,r8,r9 473e1051a39Sopenharmony_ci addze r10,r10 474e1051a39Sopenharmony_ci 475e1051a39Sopenharmony_ci $ST r11,`2*$BNSZ`(r3) #r[2]=c3 476e1051a39Sopenharmony_ci #sqr_add_c2(a,3,0,c1,c2,c3); 477e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
478e1051a39Sopenharmony_ci $UMULL r7,r5,r6 479e1051a39Sopenharmony_ci $UMULH r8,r5,r6 480e1051a39Sopenharmony_ci 481e1051a39Sopenharmony_ci addc r9,r7,r9 482e1051a39Sopenharmony_ci adde r10,r8,r10 483e1051a39Sopenharmony_ci addze r11,r0 484e1051a39Sopenharmony_ci 485e1051a39Sopenharmony_ci addc r9,r7,r9 486e1051a39Sopenharmony_ci adde r10,r8,r10 487e1051a39Sopenharmony_ci addze r11,r11 488e1051a39Sopenharmony_ci #sqr_add_c2(a,2,1,c1,c2,c3); 489e1051a39Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 490e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 491e1051a39Sopenharmony_ci $UMULL r7,r5,r6 492e1051a39Sopenharmony_ci $UMULH r8,r5,r6 493e1051a39Sopenharmony_ci 494e1051a39Sopenharmony_ci addc r9,r7,r9 495e1051a39Sopenharmony_ci adde r10,r8,r10 496e1051a39Sopenharmony_ci addze r11,r11 497e1051a39Sopenharmony_ci 498e1051a39Sopenharmony_ci addc r9,r7,r9 499e1051a39Sopenharmony_ci adde r10,r8,r10 500e1051a39Sopenharmony_ci addze r11,r11 501e1051a39Sopenharmony_ci 502e1051a39Sopenharmony_ci $ST r9,`3*$BNSZ`(r3) #r[3]=c1; 503e1051a39Sopenharmony_ci #sqr_add_c(a,2,c2,c3,c1); 504e1051a39Sopenharmony_ci $UMULL r7,r6,r6 505e1051a39Sopenharmony_ci $UMULH r8,r6,r6 506e1051a39Sopenharmony_ci 507e1051a39Sopenharmony_ci addc r10,r7,r10 508e1051a39Sopenharmony_ci adde r11,r8,r11 509e1051a39Sopenharmony_ci addze r9,r0 510e1051a39Sopenharmony_ci #sqr_add_c2(a,3,1,c2,c3,c1); 511e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 512e1051a39Sopenharmony_ci $UMULL r7,r5,r6 513e1051a39Sopenharmony_ci $UMULH r8,r5,r6 514e1051a39Sopenharmony_ci 515e1051a39Sopenharmony_ci addc r10,r7,r10 516e1051a39Sopenharmony_ci adde r11,r8,r11 517e1051a39Sopenharmony_ci addze r9,r9 518e1051a39Sopenharmony_ci 519e1051a39Sopenharmony_ci addc r10,r7,r10 520e1051a39Sopenharmony_ci adde r11,r8,r11 521e1051a39Sopenharmony_ci addze r9,r9 522e1051a39Sopenharmony_ci #sqr_add_c2(a,4,0,c2,c3,c1); 523e1051a39Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 524e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 525e1051a39Sopenharmony_ci $UMULL r7,r5,r6 
526e1051a39Sopenharmony_ci $UMULH r8,r5,r6 527e1051a39Sopenharmony_ci 528e1051a39Sopenharmony_ci addc r10,r7,r10 529e1051a39Sopenharmony_ci adde r11,r8,r11 530e1051a39Sopenharmony_ci addze r9,r9 531e1051a39Sopenharmony_ci 532e1051a39Sopenharmony_ci addc r10,r7,r10 533e1051a39Sopenharmony_ci adde r11,r8,r11 534e1051a39Sopenharmony_ci addze r9,r9 535e1051a39Sopenharmony_ci $ST r10,`4*$BNSZ`(r3) #r[4]=c2; 536e1051a39Sopenharmony_ci #sqr_add_c2(a,5,0,c3,c1,c2); 537e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 538e1051a39Sopenharmony_ci $UMULL r7,r5,r6 539e1051a39Sopenharmony_ci $UMULH r8,r5,r6 540e1051a39Sopenharmony_ci 541e1051a39Sopenharmony_ci addc r11,r7,r11 542e1051a39Sopenharmony_ci adde r9,r8,r9 543e1051a39Sopenharmony_ci addze r10,r0 544e1051a39Sopenharmony_ci 545e1051a39Sopenharmony_ci addc r11,r7,r11 546e1051a39Sopenharmony_ci adde r9,r8,r9 547e1051a39Sopenharmony_ci addze r10,r10 548e1051a39Sopenharmony_ci #sqr_add_c2(a,4,1,c3,c1,c2); 549e1051a39Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 550e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 551e1051a39Sopenharmony_ci $UMULL r7,r5,r6 552e1051a39Sopenharmony_ci $UMULH r8,r5,r6 553e1051a39Sopenharmony_ci 554e1051a39Sopenharmony_ci addc r11,r7,r11 555e1051a39Sopenharmony_ci adde r9,r8,r9 556e1051a39Sopenharmony_ci addze r10,r10 557e1051a39Sopenharmony_ci 558e1051a39Sopenharmony_ci addc r11,r7,r11 559e1051a39Sopenharmony_ci adde r9,r8,r9 560e1051a39Sopenharmony_ci addze r10,r10 561e1051a39Sopenharmony_ci #sqr_add_c2(a,3,2,c3,c1,c2); 562e1051a39Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 563e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 564e1051a39Sopenharmony_ci $UMULL r7,r5,r6 565e1051a39Sopenharmony_ci $UMULH r8,r5,r6 566e1051a39Sopenharmony_ci 567e1051a39Sopenharmony_ci addc r11,r7,r11 568e1051a39Sopenharmony_ci adde r9,r8,r9 569e1051a39Sopenharmony_ci addze r10,r10 570e1051a39Sopenharmony_ci 571e1051a39Sopenharmony_ci addc r11,r7,r11 572e1051a39Sopenharmony_ci adde r9,r8,r9 573e1051a39Sopenharmony_ci addze r10,r10 
574e1051a39Sopenharmony_ci $ST r11,`5*$BNSZ`(r3) #r[5]=c3; 575e1051a39Sopenharmony_ci #sqr_add_c(a,3,c1,c2,c3); 576e1051a39Sopenharmony_ci $UMULL r7,r6,r6 577e1051a39Sopenharmony_ci $UMULH r8,r6,r6 578e1051a39Sopenharmony_ci addc r9,r7,r9 579e1051a39Sopenharmony_ci adde r10,r8,r10 580e1051a39Sopenharmony_ci addze r11,r0 581e1051a39Sopenharmony_ci #sqr_add_c2(a,4,2,c1,c2,c3); 582e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 583e1051a39Sopenharmony_ci $UMULL r7,r5,r6 584e1051a39Sopenharmony_ci $UMULH r8,r5,r6 585e1051a39Sopenharmony_ci 586e1051a39Sopenharmony_ci addc r9,r7,r9 587e1051a39Sopenharmony_ci adde r10,r8,r10 588e1051a39Sopenharmony_ci addze r11,r11 589e1051a39Sopenharmony_ci 590e1051a39Sopenharmony_ci addc r9,r7,r9 591e1051a39Sopenharmony_ci adde r10,r8,r10 592e1051a39Sopenharmony_ci addze r11,r11 593e1051a39Sopenharmony_ci #sqr_add_c2(a,5,1,c1,c2,c3); 594e1051a39Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 595e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 596e1051a39Sopenharmony_ci $UMULL r7,r5,r6 597e1051a39Sopenharmony_ci $UMULH r8,r5,r6 598e1051a39Sopenharmony_ci 599e1051a39Sopenharmony_ci addc r9,r7,r9 600e1051a39Sopenharmony_ci adde r10,r8,r10 601e1051a39Sopenharmony_ci addze r11,r11 602e1051a39Sopenharmony_ci 603e1051a39Sopenharmony_ci addc r9,r7,r9 604e1051a39Sopenharmony_ci adde r10,r8,r10 605e1051a39Sopenharmony_ci addze r11,r11 606e1051a39Sopenharmony_ci #sqr_add_c2(a,6,0,c1,c2,c3); 607e1051a39Sopenharmony_ci $LD r5,`0*$BNSZ`(r4) 608e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 609e1051a39Sopenharmony_ci $UMULL r7,r5,r6 610e1051a39Sopenharmony_ci $UMULH r8,r5,r6 611e1051a39Sopenharmony_ci addc r9,r7,r9 612e1051a39Sopenharmony_ci adde r10,r8,r10 613e1051a39Sopenharmony_ci addze r11,r11 614e1051a39Sopenharmony_ci addc r9,r7,r9 615e1051a39Sopenharmony_ci adde r10,r8,r10 616e1051a39Sopenharmony_ci addze r11,r11 617e1051a39Sopenharmony_ci $ST r9,`6*$BNSZ`(r3) #r[6]=c1; 618e1051a39Sopenharmony_ci #sqr_add_c2(a,7,0,c2,c3,c1); 619e1051a39Sopenharmony_ci $LD 
r6,`7*$BNSZ`(r4) 620e1051a39Sopenharmony_ci $UMULL r7,r5,r6 621e1051a39Sopenharmony_ci $UMULH r8,r5,r6 622e1051a39Sopenharmony_ci 623e1051a39Sopenharmony_ci addc r10,r7,r10 624e1051a39Sopenharmony_ci adde r11,r8,r11 625e1051a39Sopenharmony_ci addze r9,r0 626e1051a39Sopenharmony_ci addc r10,r7,r10 627e1051a39Sopenharmony_ci adde r11,r8,r11 628e1051a39Sopenharmony_ci addze r9,r9 629e1051a39Sopenharmony_ci #sqr_add_c2(a,6,1,c2,c3,c1); 630e1051a39Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 631e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 632e1051a39Sopenharmony_ci $UMULL r7,r5,r6 633e1051a39Sopenharmony_ci $UMULH r8,r5,r6 634e1051a39Sopenharmony_ci 635e1051a39Sopenharmony_ci addc r10,r7,r10 636e1051a39Sopenharmony_ci adde r11,r8,r11 637e1051a39Sopenharmony_ci addze r9,r9 638e1051a39Sopenharmony_ci addc r10,r7,r10 639e1051a39Sopenharmony_ci adde r11,r8,r11 640e1051a39Sopenharmony_ci addze r9,r9 641e1051a39Sopenharmony_ci #sqr_add_c2(a,5,2,c2,c3,c1); 642e1051a39Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 643e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 644e1051a39Sopenharmony_ci $UMULL r7,r5,r6 645e1051a39Sopenharmony_ci $UMULH r8,r5,r6 646e1051a39Sopenharmony_ci addc r10,r7,r10 647e1051a39Sopenharmony_ci adde r11,r8,r11 648e1051a39Sopenharmony_ci addze r9,r9 649e1051a39Sopenharmony_ci addc r10,r7,r10 650e1051a39Sopenharmony_ci adde r11,r8,r11 651e1051a39Sopenharmony_ci addze r9,r9 652e1051a39Sopenharmony_ci #sqr_add_c2(a,4,3,c2,c3,c1); 653e1051a39Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 654e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 655e1051a39Sopenharmony_ci $UMULL r7,r5,r6 656e1051a39Sopenharmony_ci $UMULH r8,r5,r6 657e1051a39Sopenharmony_ci 658e1051a39Sopenharmony_ci addc r10,r7,r10 659e1051a39Sopenharmony_ci adde r11,r8,r11 660e1051a39Sopenharmony_ci addze r9,r9 661e1051a39Sopenharmony_ci addc r10,r7,r10 662e1051a39Sopenharmony_ci adde r11,r8,r11 663e1051a39Sopenharmony_ci addze r9,r9 664e1051a39Sopenharmony_ci $ST r10,`7*$BNSZ`(r3) #r[7]=c2; 665e1051a39Sopenharmony_ci 
#sqr_add_c(a,4,c3,c1,c2); 666e1051a39Sopenharmony_ci $UMULL r7,r6,r6 667e1051a39Sopenharmony_ci $UMULH r8,r6,r6 668e1051a39Sopenharmony_ci addc r11,r7,r11 669e1051a39Sopenharmony_ci adde r9,r8,r9 670e1051a39Sopenharmony_ci addze r10,r0 671e1051a39Sopenharmony_ci #sqr_add_c2(a,5,3,c3,c1,c2); 672e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 673e1051a39Sopenharmony_ci $UMULL r7,r5,r6 674e1051a39Sopenharmony_ci $UMULH r8,r5,r6 675e1051a39Sopenharmony_ci addc r11,r7,r11 676e1051a39Sopenharmony_ci adde r9,r8,r9 677e1051a39Sopenharmony_ci addze r10,r10 678e1051a39Sopenharmony_ci addc r11,r7,r11 679e1051a39Sopenharmony_ci adde r9,r8,r9 680e1051a39Sopenharmony_ci addze r10,r10 681e1051a39Sopenharmony_ci #sqr_add_c2(a,6,2,c3,c1,c2); 682e1051a39Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 683e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 684e1051a39Sopenharmony_ci $UMULL r7,r5,r6 685e1051a39Sopenharmony_ci $UMULH r8,r5,r6 686e1051a39Sopenharmony_ci addc r11,r7,r11 687e1051a39Sopenharmony_ci adde r9,r8,r9 688e1051a39Sopenharmony_ci addze r10,r10 689e1051a39Sopenharmony_ci 690e1051a39Sopenharmony_ci addc r11,r7,r11 691e1051a39Sopenharmony_ci adde r9,r8,r9 692e1051a39Sopenharmony_ci addze r10,r10 693e1051a39Sopenharmony_ci #sqr_add_c2(a,7,1,c3,c1,c2); 694e1051a39Sopenharmony_ci $LD r5,`1*$BNSZ`(r4) 695e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 696e1051a39Sopenharmony_ci $UMULL r7,r5,r6 697e1051a39Sopenharmony_ci $UMULH r8,r5,r6 698e1051a39Sopenharmony_ci addc r11,r7,r11 699e1051a39Sopenharmony_ci adde r9,r8,r9 700e1051a39Sopenharmony_ci addze r10,r10 701e1051a39Sopenharmony_ci addc r11,r7,r11 702e1051a39Sopenharmony_ci adde r9,r8,r9 703e1051a39Sopenharmony_ci addze r10,r10 704e1051a39Sopenharmony_ci $ST r11,`8*$BNSZ`(r3) #r[8]=c3; 705e1051a39Sopenharmony_ci #sqr_add_c2(a,7,2,c1,c2,c3); 706e1051a39Sopenharmony_ci $LD r5,`2*$BNSZ`(r4) 707e1051a39Sopenharmony_ci $UMULL r7,r5,r6 708e1051a39Sopenharmony_ci $UMULH r8,r5,r6 709e1051a39Sopenharmony_ci 710e1051a39Sopenharmony_ci addc r9,r7,r9 
711e1051a39Sopenharmony_ci adde r10,r8,r10 712e1051a39Sopenharmony_ci addze r11,r0 713e1051a39Sopenharmony_ci addc r9,r7,r9 714e1051a39Sopenharmony_ci adde r10,r8,r10 715e1051a39Sopenharmony_ci addze r11,r11 716e1051a39Sopenharmony_ci #sqr_add_c2(a,6,3,c1,c2,c3); 717e1051a39Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 718e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 719e1051a39Sopenharmony_ci $UMULL r7,r5,r6 720e1051a39Sopenharmony_ci $UMULH r8,r5,r6 721e1051a39Sopenharmony_ci addc r9,r7,r9 722e1051a39Sopenharmony_ci adde r10,r8,r10 723e1051a39Sopenharmony_ci addze r11,r11 724e1051a39Sopenharmony_ci addc r9,r7,r9 725e1051a39Sopenharmony_ci adde r10,r8,r10 726e1051a39Sopenharmony_ci addze r11,r11 727e1051a39Sopenharmony_ci #sqr_add_c2(a,5,4,c1,c2,c3); 728e1051a39Sopenharmony_ci $LD r5,`4*$BNSZ`(r4) 729e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 730e1051a39Sopenharmony_ci $UMULL r7,r5,r6 731e1051a39Sopenharmony_ci $UMULH r8,r5,r6 732e1051a39Sopenharmony_ci addc r9,r7,r9 733e1051a39Sopenharmony_ci adde r10,r8,r10 734e1051a39Sopenharmony_ci addze r11,r11 735e1051a39Sopenharmony_ci addc r9,r7,r9 736e1051a39Sopenharmony_ci adde r10,r8,r10 737e1051a39Sopenharmony_ci addze r11,r11 738e1051a39Sopenharmony_ci $ST r9,`9*$BNSZ`(r3) #r[9]=c1; 739e1051a39Sopenharmony_ci #sqr_add_c(a,5,c2,c3,c1); 740e1051a39Sopenharmony_ci $UMULL r7,r6,r6 741e1051a39Sopenharmony_ci $UMULH r8,r6,r6 742e1051a39Sopenharmony_ci addc r10,r7,r10 743e1051a39Sopenharmony_ci adde r11,r8,r11 744e1051a39Sopenharmony_ci addze r9,r0 745e1051a39Sopenharmony_ci #sqr_add_c2(a,6,4,c2,c3,c1); 746e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 747e1051a39Sopenharmony_ci $UMULL r7,r5,r6 748e1051a39Sopenharmony_ci $UMULH r8,r5,r6 749e1051a39Sopenharmony_ci addc r10,r7,r10 750e1051a39Sopenharmony_ci adde r11,r8,r11 751e1051a39Sopenharmony_ci addze r9,r9 752e1051a39Sopenharmony_ci addc r10,r7,r10 753e1051a39Sopenharmony_ci adde r11,r8,r11 754e1051a39Sopenharmony_ci addze r9,r9 755e1051a39Sopenharmony_ci 
#sqr_add_c2(a,7,3,c2,c3,c1); 756e1051a39Sopenharmony_ci $LD r5,`3*$BNSZ`(r4) 757e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 758e1051a39Sopenharmony_ci $UMULL r7,r5,r6 759e1051a39Sopenharmony_ci $UMULH r8,r5,r6 760e1051a39Sopenharmony_ci addc r10,r7,r10 761e1051a39Sopenharmony_ci adde r11,r8,r11 762e1051a39Sopenharmony_ci addze r9,r9 763e1051a39Sopenharmony_ci addc r10,r7,r10 764e1051a39Sopenharmony_ci adde r11,r8,r11 765e1051a39Sopenharmony_ci addze r9,r9 766e1051a39Sopenharmony_ci $ST r10,`10*$BNSZ`(r3) #r[10]=c2; 767e1051a39Sopenharmony_ci #sqr_add_c2(a,7,4,c3,c1,c2); 768e1051a39Sopenharmony_ci $LD r5,`4*$BNSZ`(r4) 769e1051a39Sopenharmony_ci $UMULL r7,r5,r6 770e1051a39Sopenharmony_ci $UMULH r8,r5,r6 771e1051a39Sopenharmony_ci addc r11,r7,r11 772e1051a39Sopenharmony_ci adde r9,r8,r9 773e1051a39Sopenharmony_ci addze r10,r0 774e1051a39Sopenharmony_ci addc r11,r7,r11 775e1051a39Sopenharmony_ci adde r9,r8,r9 776e1051a39Sopenharmony_ci addze r10,r10 777e1051a39Sopenharmony_ci #sqr_add_c2(a,6,5,c3,c1,c2); 778e1051a39Sopenharmony_ci $LD r5,`5*$BNSZ`(r4) 779e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 780e1051a39Sopenharmony_ci $UMULL r7,r5,r6 781e1051a39Sopenharmony_ci $UMULH r8,r5,r6 782e1051a39Sopenharmony_ci addc r11,r7,r11 783e1051a39Sopenharmony_ci adde r9,r8,r9 784e1051a39Sopenharmony_ci addze r10,r10 785e1051a39Sopenharmony_ci addc r11,r7,r11 786e1051a39Sopenharmony_ci adde r9,r8,r9 787e1051a39Sopenharmony_ci addze r10,r10 788e1051a39Sopenharmony_ci $ST r11,`11*$BNSZ`(r3) #r[11]=c3; 789e1051a39Sopenharmony_ci #sqr_add_c(a,6,c1,c2,c3); 790e1051a39Sopenharmony_ci $UMULL r7,r6,r6 791e1051a39Sopenharmony_ci $UMULH r8,r6,r6 792e1051a39Sopenharmony_ci addc r9,r7,r9 793e1051a39Sopenharmony_ci adde r10,r8,r10 794e1051a39Sopenharmony_ci addze r11,r0 795e1051a39Sopenharmony_ci #sqr_add_c2(a,7,5,c1,c2,c3) 796e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 797e1051a39Sopenharmony_ci $UMULL r7,r5,r6 798e1051a39Sopenharmony_ci $UMULH r8,r5,r6 799e1051a39Sopenharmony_ci addc 
r9,r7,r9 800e1051a39Sopenharmony_ci adde r10,r8,r10 801e1051a39Sopenharmony_ci addze r11,r11 802e1051a39Sopenharmony_ci addc r9,r7,r9 803e1051a39Sopenharmony_ci adde r10,r8,r10 804e1051a39Sopenharmony_ci addze r11,r11 805e1051a39Sopenharmony_ci $ST r9,`12*$BNSZ`(r3) #r[12]=c1; 806e1051a39Sopenharmony_ci 807e1051a39Sopenharmony_ci #sqr_add_c2(a,7,6,c2,c3,c1) 808e1051a39Sopenharmony_ci $LD r5,`6*$BNSZ`(r4) 809e1051a39Sopenharmony_ci $UMULL r7,r5,r6 810e1051a39Sopenharmony_ci $UMULH r8,r5,r6 811e1051a39Sopenharmony_ci addc r10,r7,r10 812e1051a39Sopenharmony_ci adde r11,r8,r11 813e1051a39Sopenharmony_ci addze r9,r0 814e1051a39Sopenharmony_ci addc r10,r7,r10 815e1051a39Sopenharmony_ci adde r11,r8,r11 816e1051a39Sopenharmony_ci addze r9,r9 817e1051a39Sopenharmony_ci $ST r10,`13*$BNSZ`(r3) #r[13]=c2; 818e1051a39Sopenharmony_ci #sqr_add_c(a,7,c3,c1,c2); 819e1051a39Sopenharmony_ci $UMULL r7,r6,r6 820e1051a39Sopenharmony_ci $UMULH r8,r6,r6 821e1051a39Sopenharmony_ci addc r11,r7,r11 822e1051a39Sopenharmony_ci adde r9,r8,r9 823e1051a39Sopenharmony_ci $ST r11,`14*$BNSZ`(r3) #r[14]=c3; 824e1051a39Sopenharmony_ci $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 825e1051a39Sopenharmony_ci 826e1051a39Sopenharmony_ci 827e1051a39Sopenharmony_ci blr 828e1051a39Sopenharmony_ci .long 0 829e1051a39Sopenharmony_ci .byte 0,12,0x14,0,0,0,2,0 830e1051a39Sopenharmony_ci .long 0 831e1051a39Sopenharmony_ci.size .bn_sqr_comba8,.-.bn_sqr_comba8 832e1051a39Sopenharmony_ci 833e1051a39Sopenharmony_ci# 834e1051a39Sopenharmony_ci# NOTE: The following label name should be changed to 835e1051a39Sopenharmony_ci# "bn_mul_comba4" i.e. remove the first dot 836e1051a39Sopenharmony_ci# for the gcc compiler. This should be automatically 837e1051a39Sopenharmony_ci# done in the build 838e1051a39Sopenharmony_ci# 839e1051a39Sopenharmony_ci 840e1051a39Sopenharmony_ci.align 4 841e1051a39Sopenharmony_ci.bn_mul_comba4: 842e1051a39Sopenharmony_ci# 843e1051a39Sopenharmony_ci# This is an optimized version of the bn_mul_comba4 routine: it stores the eight-word product of the four-word operands a and b in r[0..7].
844e1051a39Sopenharmony_ci# 845e1051a39Sopenharmony_ci# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 846e1051a39Sopenharmony_ci# r3 contains r 847e1051a39Sopenharmony_ci# r4 contains a 848e1051a39Sopenharmony_ci# r5 contains b 849e1051a39Sopenharmony_ci# r6, r7 are the 2 BN_ULONGs being multiplied. 850e1051a39Sopenharmony_ci# r8, r9 receive the UMULL and UMULH results of r6*r7, i.e. the two halves of the double-width product (32x32 giving 64 in the 32-bit flavour). 851e1051a39Sopenharmony_ci# r10, r11, r12 are the equivalents of c1, c2, and c3. 852e1051a39Sopenharmony_ci# 853e1051a39Sopenharmony_ci xor r0,r0,r0 #r0=0. Used in addze below. 854e1051a39Sopenharmony_ci #mul_add_c(a[0],b[0],c1,c2,c3); 855e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 856e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 857e1051a39Sopenharmony_ci $UMULL r10,r6,r7 858e1051a39Sopenharmony_ci $UMULH r11,r6,r7 #a[0]*b[0] lands directly in c1 and c2, nothing to add yet 859e1051a39Sopenharmony_ci $ST r10,`0*$BNSZ`(r3) #r[0]=c1 860e1051a39Sopenharmony_ci #mul_add_c(a[0],b[1],c2,c3,c1); r6 still holds a[0] 861e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 862e1051a39Sopenharmony_ci $UMULL r8,r6,r7 863e1051a39Sopenharmony_ci $UMULH r9,r6,r7 864e1051a39Sopenharmony_ci addc r11,r8,r11 865e1051a39Sopenharmony_ci adde r12,r9,r0 #r12 (c3) has no value yet, so r0 (zero) stands in for it 866e1051a39Sopenharmony_ci addze r10,r0 #r[0] is stored, so r10 (c1) restarts as 0 + carry 867e1051a39Sopenharmony_ci #mul_add_c(a[1],b[0],c2,c3,c1); 868e1051a39Sopenharmony_ci $LD r6, `1*$BNSZ`(r4) 869e1051a39Sopenharmony_ci $LD r7, `0*$BNSZ`(r5) 870e1051a39Sopenharmony_ci $UMULL r8,r6,r7 871e1051a39Sopenharmony_ci $UMULH r9,r6,r7 872e1051a39Sopenharmony_ci addc r11,r8,r11 873e1051a39Sopenharmony_ci adde r12,r9,r12 874e1051a39Sopenharmony_ci addze r10,r10 875e1051a39Sopenharmony_ci $ST r11,`1*$BNSZ`(r3) #r[1]=c2 876e1051a39Sopenharmony_ci #mul_add_c(a[2],b[0],c3,c1,c2); r7 still holds b[0] 877e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 878e1051a39Sopenharmony_ci $UMULL r8,r6,r7 879e1051a39Sopenharmony_ci $UMULH r9,r6,r7 880e1051a39Sopenharmony_ci addc r12,r8,r12 881e1051a39Sopenharmony_ci adde r10,r9,r10 882e1051a39Sopenharmony_ci addze r11,r0 883e1051a39Sopenharmony_ci #mul_add_c(a[1],b[1],c3,c1,c2);
884e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 885e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 886e1051a39Sopenharmony_ci $UMULL r8,r6,r7 887e1051a39Sopenharmony_ci $UMULH r9,r6,r7 888e1051a39Sopenharmony_ci addc r12,r8,r12 889e1051a39Sopenharmony_ci adde r10,r9,r10 890e1051a39Sopenharmony_ci addze r11,r11 891e1051a39Sopenharmony_ci #mul_add_c(a[0],b[2],c3,c1,c2); 892e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 893e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 894e1051a39Sopenharmony_ci $UMULL r8,r6,r7 895e1051a39Sopenharmony_ci $UMULH r9,r6,r7 896e1051a39Sopenharmony_ci addc r12,r8,r12 897e1051a39Sopenharmony_ci adde r10,r9,r10 898e1051a39Sopenharmony_ci addze r11,r11 899e1051a39Sopenharmony_ci $ST r12,`2*$BNSZ`(r3) #r[2]=c3 900e1051a39Sopenharmony_ci #mul_add_c(a[0],b[3],c1,c2,c3); r6 still holds a[0] 901e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 902e1051a39Sopenharmony_ci $UMULL r8,r6,r7 903e1051a39Sopenharmony_ci $UMULH r9,r6,r7 904e1051a39Sopenharmony_ci addc r10,r8,r10 905e1051a39Sopenharmony_ci adde r11,r9,r11 906e1051a39Sopenharmony_ci addze r12,r0 907e1051a39Sopenharmony_ci #mul_add_c(a[1],b[2],c1,c2,c3); 908e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 909e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 910e1051a39Sopenharmony_ci $UMULL r8,r6,r7 911e1051a39Sopenharmony_ci $UMULH r9,r6,r7 912e1051a39Sopenharmony_ci addc r10,r8,r10 913e1051a39Sopenharmony_ci adde r11,r9,r11 914e1051a39Sopenharmony_ci addze r12,r12 915e1051a39Sopenharmony_ci #mul_add_c(a[2],b[1],c1,c2,c3); 916e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 917e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 918e1051a39Sopenharmony_ci $UMULL r8,r6,r7 919e1051a39Sopenharmony_ci $UMULH r9,r6,r7 920e1051a39Sopenharmony_ci addc r10,r8,r10 921e1051a39Sopenharmony_ci adde r11,r9,r11 922e1051a39Sopenharmony_ci addze r12,r12 923e1051a39Sopenharmony_ci #mul_add_c(a[3],b[0],c1,c2,c3); 924e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 925e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 926e1051a39Sopenharmony_ci $UMULL r8,r6,r7
927e1051a39Sopenharmony_ci $UMULH r9,r6,r7 928e1051a39Sopenharmony_ci addc r10,r8,r10 929e1051a39Sopenharmony_ci adde r11,r9,r11 930e1051a39Sopenharmony_ci addze r12,r12 931e1051a39Sopenharmony_ci $ST r10,`3*$BNSZ`(r3) #r[3]=c1 932e1051a39Sopenharmony_ci #mul_add_c(a[3],b[1],c2,c3,c1); r6 still holds a[3] 933e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 934e1051a39Sopenharmony_ci $UMULL r8,r6,r7 935e1051a39Sopenharmony_ci $UMULH r9,r6,r7 936e1051a39Sopenharmony_ci addc r11,r8,r11 937e1051a39Sopenharmony_ci adde r12,r9,r12 938e1051a39Sopenharmony_ci addze r10,r0 939e1051a39Sopenharmony_ci #mul_add_c(a[2],b[2],c2,c3,c1); 940e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 941e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 942e1051a39Sopenharmony_ci $UMULL r8,r6,r7 943e1051a39Sopenharmony_ci $UMULH r9,r6,r7 944e1051a39Sopenharmony_ci addc r11,r8,r11 945e1051a39Sopenharmony_ci adde r12,r9,r12 946e1051a39Sopenharmony_ci addze r10,r10 947e1051a39Sopenharmony_ci #mul_add_c(a[1],b[3],c2,c3,c1); 948e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 949e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 950e1051a39Sopenharmony_ci $UMULL r8,r6,r7 951e1051a39Sopenharmony_ci $UMULH r9,r6,r7 952e1051a39Sopenharmony_ci addc r11,r8,r11 953e1051a39Sopenharmony_ci adde r12,r9,r12 954e1051a39Sopenharmony_ci addze r10,r10 955e1051a39Sopenharmony_ci $ST r11,`4*$BNSZ`(r3) #r[4]=c2 956e1051a39Sopenharmony_ci #mul_add_c(a[2],b[3],c3,c1,c2); r7 still holds b[3] 957e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 958e1051a39Sopenharmony_ci $UMULL r8,r6,r7 959e1051a39Sopenharmony_ci $UMULH r9,r6,r7 960e1051a39Sopenharmony_ci addc r12,r8,r12 961e1051a39Sopenharmony_ci adde r10,r9,r10 962e1051a39Sopenharmony_ci addze r11,r0 963e1051a39Sopenharmony_ci #mul_add_c(a[3],b[2],c3,c1,c2); 964e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 965e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 966e1051a39Sopenharmony_ci $UMULL r8,r6,r7 967e1051a39Sopenharmony_ci $UMULH r9,r6,r7 968e1051a39Sopenharmony_ci addc r12,r8,r12 969e1051a39Sopenharmony_ci adde r10,r9,r10
970e1051a39Sopenharmony_ci addze r11,r11 971e1051a39Sopenharmony_ci $ST r12,`5*$BNSZ`(r3) #r[5]=c3 972e1051a39Sopenharmony_ci #mul_add_c(a[3],b[3],c1,c2,c3); r6 still holds a[3] 973e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 974e1051a39Sopenharmony_ci $UMULL r8,r6,r7 975e1051a39Sopenharmony_ci $UMULH r9,r6,r7 976e1051a39Sopenharmony_ci addc r10,r8,r10 977e1051a39Sopenharmony_ci adde r11,r9,r11 #no addze here: only r[6] and r[7] remain to be stored 978e1051a39Sopenharmony_ci 979e1051a39Sopenharmony_ci $ST r10,`6*$BNSZ`(r3) #r[6]=c1 980e1051a39Sopenharmony_ci $ST r11,`7*$BNSZ`(r3) #r[7]=c2 981e1051a39Sopenharmony_ci blr 982e1051a39Sopenharmony_ci .long 0 983e1051a39Sopenharmony_ci .byte 0,12,0x14,0,0,0,3,0 984e1051a39Sopenharmony_ci .long 0 985e1051a39Sopenharmony_ci.size .bn_mul_comba4,.-.bn_mul_comba4 986e1051a39Sopenharmony_ci 987e1051a39Sopenharmony_ci# 988e1051a39Sopenharmony_ci# NOTE: The following label name should be changed to 989e1051a39Sopenharmony_ci# "bn_mul_comba8" i.e. remove the first dot 990e1051a39Sopenharmony_ci# for the gcc compiler. This should be automatically 991e1051a39Sopenharmony_ci# done in the build 992e1051a39Sopenharmony_ci# 993e1051a39Sopenharmony_ci 994e1051a39Sopenharmony_ci.align 4 995e1051a39Sopenharmony_ci.bn_mul_comba8: 996e1051a39Sopenharmony_ci# 997e1051a39Sopenharmony_ci# Optimized version of the bn_mul_comba8 routine. 998e1051a39Sopenharmony_ci# 999e1051a39Sopenharmony_ci# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1000e1051a39Sopenharmony_ci# r3 contains r 1001e1051a39Sopenharmony_ci# r4 contains a 1002e1051a39Sopenharmony_ci# r5 contains b 1003e1051a39Sopenharmony_ci# r6, r7 are the 2 BN_ULONGs being multiplied. 1004e1051a39Sopenharmony_ci# r8, r9 are the results of the 32x32 giving 64 multiply. 1005e1051a39Sopenharmony_ci# r10, r11, r12 are the equivalents of c1, c2, and c3. 1006e1051a39Sopenharmony_ci# 1007e1051a39Sopenharmony_ci xor r0,r0,r0 #r0=0. Used in addze below.
1008e1051a39Sopenharmony_ci 1009e1051a39Sopenharmony_ci #mul_add_c(a[0],b[0],c1,c2,c3); 1010e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) #a[0] 1011e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) #b[0] 1012e1051a39Sopenharmony_ci $UMULL r10,r6,r7 1013e1051a39Sopenharmony_ci $UMULH r11,r6,r7 1014e1051a39Sopenharmony_ci $ST r10,`0*$BNSZ`(r3) #r[0]=c1; 1015e1051a39Sopenharmony_ci #mul_add_c(a[0],b[1],c2,c3,c1); 1016e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1017e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1018e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1019e1051a39Sopenharmony_ci addc r11,r11,r8 1020e1051a39Sopenharmony_ci addze r12,r9 # since we didn't set r12 to zero before. 1021e1051a39Sopenharmony_ci addze r10,r0 1022e1051a39Sopenharmony_ci #mul_add_c(a[1],b[0],c2,c3,c1); 1023e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1024e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 1025e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1026e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1027e1051a39Sopenharmony_ci addc r11,r11,r8 1028e1051a39Sopenharmony_ci adde r12,r12,r9 1029e1051a39Sopenharmony_ci addze r10,r10 1030e1051a39Sopenharmony_ci $ST r11,`1*$BNSZ`(r3) #r[1]=c2; 1031e1051a39Sopenharmony_ci #mul_add_c(a[2],b[0],c3,c1,c2); 1032e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1033e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1034e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1035e1051a39Sopenharmony_ci addc r12,r12,r8 1036e1051a39Sopenharmony_ci adde r10,r10,r9 1037e1051a39Sopenharmony_ci addze r11,r0 1038e1051a39Sopenharmony_ci #mul_add_c(a[1],b[1],c3,c1,c2); 1039e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1040e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1041e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1042e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1043e1051a39Sopenharmony_ci addc r12,r12,r8 1044e1051a39Sopenharmony_ci adde r10,r10,r9 1045e1051a39Sopenharmony_ci addze r11,r11 1046e1051a39Sopenharmony_ci #mul_add_c(a[0],b[2],c3,c1,c2); 1047e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 1048e1051a39Sopenharmony_ci $LD 
r7,`2*$BNSZ`(r5) 1049e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1050e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1051e1051a39Sopenharmony_ci addc r12,r12,r8 1052e1051a39Sopenharmony_ci adde r10,r10,r9 1053e1051a39Sopenharmony_ci addze r11,r11 1054e1051a39Sopenharmony_ci $ST r12,`2*$BNSZ`(r3) #r[2]=c3; 1055e1051a39Sopenharmony_ci #mul_add_c(a[0],b[3],c1,c2,c3); 1056e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1057e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1058e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1059e1051a39Sopenharmony_ci addc r10,r10,r8 1060e1051a39Sopenharmony_ci adde r11,r11,r9 1061e1051a39Sopenharmony_ci addze r12,r0 1062e1051a39Sopenharmony_ci #mul_add_c(a[1],b[2],c1,c2,c3); 1063e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1064e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1065e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1066e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1067e1051a39Sopenharmony_ci addc r10,r10,r8 1068e1051a39Sopenharmony_ci adde r11,r11,r9 1069e1051a39Sopenharmony_ci addze r12,r12 1070e1051a39Sopenharmony_ci 1071e1051a39Sopenharmony_ci #mul_add_c(a[2],b[1],c1,c2,c3); 1072e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1073e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1074e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1075e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1076e1051a39Sopenharmony_ci addc r10,r10,r8 1077e1051a39Sopenharmony_ci adde r11,r11,r9 1078e1051a39Sopenharmony_ci addze r12,r12 1079e1051a39Sopenharmony_ci #mul_add_c(a[3],b[0],c1,c2,c3); 1080e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1081e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 1082e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1083e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1084e1051a39Sopenharmony_ci addc r10,r10,r8 1085e1051a39Sopenharmony_ci adde r11,r11,r9 1086e1051a39Sopenharmony_ci addze r12,r12 1087e1051a39Sopenharmony_ci $ST r10,`3*$BNSZ`(r3) #r[3]=c1; 1088e1051a39Sopenharmony_ci #mul_add_c(a[4],b[0],c2,c3,c1); 1089e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1090e1051a39Sopenharmony_ci $UMULL r8,r6,r7 
1091e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1092e1051a39Sopenharmony_ci addc r11,r11,r8 1093e1051a39Sopenharmony_ci adde r12,r12,r9 1094e1051a39Sopenharmony_ci addze r10,r0 1095e1051a39Sopenharmony_ci #mul_add_c(a[3],b[1],c2,c3,c1); 1096e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1097e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1098e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1099e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1100e1051a39Sopenharmony_ci addc r11,r11,r8 1101e1051a39Sopenharmony_ci adde r12,r12,r9 1102e1051a39Sopenharmony_ci addze r10,r10 1103e1051a39Sopenharmony_ci #mul_add_c(a[2],b[2],c2,c3,c1); 1104e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1105e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1106e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1107e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1108e1051a39Sopenharmony_ci addc r11,r11,r8 1109e1051a39Sopenharmony_ci adde r12,r12,r9 1110e1051a39Sopenharmony_ci addze r10,r10 1111e1051a39Sopenharmony_ci #mul_add_c(a[1],b[3],c2,c3,c1); 1112e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1113e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1114e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1115e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1116e1051a39Sopenharmony_ci addc r11,r11,r8 1117e1051a39Sopenharmony_ci adde r12,r12,r9 1118e1051a39Sopenharmony_ci addze r10,r10 1119e1051a39Sopenharmony_ci #mul_add_c(a[0],b[4],c2,c3,c1); 1120e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 1121e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1122e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1123e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1124e1051a39Sopenharmony_ci addc r11,r11,r8 1125e1051a39Sopenharmony_ci adde r12,r12,r9 1126e1051a39Sopenharmony_ci addze r10,r10 1127e1051a39Sopenharmony_ci $ST r11,`4*$BNSZ`(r3) #r[4]=c2; 1128e1051a39Sopenharmony_ci #mul_add_c(a[0],b[5],c3,c1,c2); 1129e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1130e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1131e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1132e1051a39Sopenharmony_ci addc r12,r12,r8 
1133e1051a39Sopenharmony_ci adde r10,r10,r9 1134e1051a39Sopenharmony_ci addze r11,r0 1135e1051a39Sopenharmony_ci #mul_add_c(a[1],b[4],c3,c1,c2); 1136e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1137e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1138e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1139e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1140e1051a39Sopenharmony_ci addc r12,r12,r8 1141e1051a39Sopenharmony_ci adde r10,r10,r9 1142e1051a39Sopenharmony_ci addze r11,r11 1143e1051a39Sopenharmony_ci #mul_add_c(a[2],b[3],c3,c1,c2); 1144e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1145e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1146e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1147e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1148e1051a39Sopenharmony_ci addc r12,r12,r8 1149e1051a39Sopenharmony_ci adde r10,r10,r9 1150e1051a39Sopenharmony_ci addze r11,r11 1151e1051a39Sopenharmony_ci #mul_add_c(a[3],b[2],c3,c1,c2); 1152e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1153e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1154e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1155e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1156e1051a39Sopenharmony_ci addc r12,r12,r8 1157e1051a39Sopenharmony_ci adde r10,r10,r9 1158e1051a39Sopenharmony_ci addze r11,r11 1159e1051a39Sopenharmony_ci #mul_add_c(a[4],b[1],c3,c1,c2); 1160e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1161e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1162e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1163e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1164e1051a39Sopenharmony_ci addc r12,r12,r8 1165e1051a39Sopenharmony_ci adde r10,r10,r9 1166e1051a39Sopenharmony_ci addze r11,r11 1167e1051a39Sopenharmony_ci #mul_add_c(a[5],b[0],c3,c1,c2); 1168e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1169e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 1170e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1171e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1172e1051a39Sopenharmony_ci addc r12,r12,r8 1173e1051a39Sopenharmony_ci adde r10,r10,r9 1174e1051a39Sopenharmony_ci addze r11,r11 1175e1051a39Sopenharmony_ci $ST 
r12,`5*$BNSZ`(r3) #r[5]=c3; 1176e1051a39Sopenharmony_ci #mul_add_c(a[6],b[0],c1,c2,c3); 1177e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1178e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1179e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1180e1051a39Sopenharmony_ci addc r10,r10,r8 1181e1051a39Sopenharmony_ci adde r11,r11,r9 1182e1051a39Sopenharmony_ci addze r12,r0 1183e1051a39Sopenharmony_ci #mul_add_c(a[5],b[1],c1,c2,c3); 1184e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1185e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1186e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1187e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1188e1051a39Sopenharmony_ci addc r10,r10,r8 1189e1051a39Sopenharmony_ci adde r11,r11,r9 1190e1051a39Sopenharmony_ci addze r12,r12 1191e1051a39Sopenharmony_ci #mul_add_c(a[4],b[2],c1,c2,c3); 1192e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1193e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1194e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1195e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1196e1051a39Sopenharmony_ci addc r10,r10,r8 1197e1051a39Sopenharmony_ci adde r11,r11,r9 1198e1051a39Sopenharmony_ci addze r12,r12 1199e1051a39Sopenharmony_ci #mul_add_c(a[3],b[3],c1,c2,c3); 1200e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1201e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1202e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1203e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1204e1051a39Sopenharmony_ci addc r10,r10,r8 1205e1051a39Sopenharmony_ci adde r11,r11,r9 1206e1051a39Sopenharmony_ci addze r12,r12 1207e1051a39Sopenharmony_ci #mul_add_c(a[2],b[4],c1,c2,c3); 1208e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1209e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1210e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1211e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1212e1051a39Sopenharmony_ci addc r10,r10,r8 1213e1051a39Sopenharmony_ci adde r11,r11,r9 1214e1051a39Sopenharmony_ci addze r12,r12 1215e1051a39Sopenharmony_ci #mul_add_c(a[1],b[5],c1,c2,c3); 1216e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1217e1051a39Sopenharmony_ci $LD 
r7,`5*$BNSZ`(r5) 1218e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1219e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1220e1051a39Sopenharmony_ci addc r10,r10,r8 1221e1051a39Sopenharmony_ci adde r11,r11,r9 1222e1051a39Sopenharmony_ci addze r12,r12 1223e1051a39Sopenharmony_ci #mul_add_c(a[0],b[6],c1,c2,c3); 1224e1051a39Sopenharmony_ci $LD r6,`0*$BNSZ`(r4) 1225e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1226e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1227e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1228e1051a39Sopenharmony_ci addc r10,r10,r8 1229e1051a39Sopenharmony_ci adde r11,r11,r9 1230e1051a39Sopenharmony_ci addze r12,r12 1231e1051a39Sopenharmony_ci $ST r10,`6*$BNSZ`(r3) #r[6]=c1; 1232e1051a39Sopenharmony_ci #mul_add_c(a[0],b[7],c2,c3,c1); 1233e1051a39Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 1234e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1235e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1236e1051a39Sopenharmony_ci addc r11,r11,r8 1237e1051a39Sopenharmony_ci adde r12,r12,r9 1238e1051a39Sopenharmony_ci addze r10,r0 1239e1051a39Sopenharmony_ci #mul_add_c(a[1],b[6],c2,c3,c1); 1240e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1241e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1242e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1243e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1244e1051a39Sopenharmony_ci addc r11,r11,r8 1245e1051a39Sopenharmony_ci adde r12,r12,r9 1246e1051a39Sopenharmony_ci addze r10,r10 1247e1051a39Sopenharmony_ci #mul_add_c(a[2],b[5],c2,c3,c1); 1248e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1249e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1250e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1251e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1252e1051a39Sopenharmony_ci addc r11,r11,r8 1253e1051a39Sopenharmony_ci adde r12,r12,r9 1254e1051a39Sopenharmony_ci addze r10,r10 1255e1051a39Sopenharmony_ci #mul_add_c(a[3],b[4],c2,c3,c1); 1256e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1257e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1258e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1259e1051a39Sopenharmony_ci $UMULH r9,r6,r7 
1260e1051a39Sopenharmony_ci addc r11,r11,r8 1261e1051a39Sopenharmony_ci adde r12,r12,r9 1262e1051a39Sopenharmony_ci addze r10,r10 1263e1051a39Sopenharmony_ci #mul_add_c(a[4],b[3],c2,c3,c1); 1264e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1265e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1266e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1267e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1268e1051a39Sopenharmony_ci addc r11,r11,r8 1269e1051a39Sopenharmony_ci adde r12,r12,r9 1270e1051a39Sopenharmony_ci addze r10,r10 1271e1051a39Sopenharmony_ci #mul_add_c(a[5],b[2],c2,c3,c1); 1272e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1273e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1274e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1275e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1276e1051a39Sopenharmony_ci addc r11,r11,r8 1277e1051a39Sopenharmony_ci adde r12,r12,r9 1278e1051a39Sopenharmony_ci addze r10,r10 1279e1051a39Sopenharmony_ci #mul_add_c(a[6],b[1],c2,c3,c1); 1280e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1281e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1282e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1283e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1284e1051a39Sopenharmony_ci addc r11,r11,r8 1285e1051a39Sopenharmony_ci adde r12,r12,r9 1286e1051a39Sopenharmony_ci addze r10,r10 1287e1051a39Sopenharmony_ci #mul_add_c(a[7],b[0],c2,c3,c1); 1288e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 1289e1051a39Sopenharmony_ci $LD r7,`0*$BNSZ`(r5) 1290e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1291e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1292e1051a39Sopenharmony_ci addc r11,r11,r8 1293e1051a39Sopenharmony_ci adde r12,r12,r9 1294e1051a39Sopenharmony_ci addze r10,r10 1295e1051a39Sopenharmony_ci $ST r11,`7*$BNSZ`(r3) #r[7]=c2; 1296e1051a39Sopenharmony_ci #mul_add_c(a[7],b[1],c3,c1,c2); 1297e1051a39Sopenharmony_ci $LD r7,`1*$BNSZ`(r5) 1298e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1299e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1300e1051a39Sopenharmony_ci addc r12,r12,r8 1301e1051a39Sopenharmony_ci adde r10,r10,r9 
1302e1051a39Sopenharmony_ci addze r11,r0 1303e1051a39Sopenharmony_ci #mul_add_c(a[6],b[2],c3,c1,c2); 1304e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1305e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1306e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1307e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1308e1051a39Sopenharmony_ci addc r12,r12,r8 1309e1051a39Sopenharmony_ci adde r10,r10,r9 1310e1051a39Sopenharmony_ci addze r11,r11 1311e1051a39Sopenharmony_ci #mul_add_c(a[5],b[3],c3,c1,c2); 1312e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1313e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1314e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1315e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1316e1051a39Sopenharmony_ci addc r12,r12,r8 1317e1051a39Sopenharmony_ci adde r10,r10,r9 1318e1051a39Sopenharmony_ci addze r11,r11 1319e1051a39Sopenharmony_ci #mul_add_c(a[4],b[4],c3,c1,c2); 1320e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1321e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1322e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1323e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1324e1051a39Sopenharmony_ci addc r12,r12,r8 1325e1051a39Sopenharmony_ci adde r10,r10,r9 1326e1051a39Sopenharmony_ci addze r11,r11 1327e1051a39Sopenharmony_ci #mul_add_c(a[3],b[5],c3,c1,c2); 1328e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1329e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1330e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1331e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1332e1051a39Sopenharmony_ci addc r12,r12,r8 1333e1051a39Sopenharmony_ci adde r10,r10,r9 1334e1051a39Sopenharmony_ci addze r11,r11 1335e1051a39Sopenharmony_ci #mul_add_c(a[2],b[6],c3,c1,c2); 1336e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1337e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1338e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1339e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1340e1051a39Sopenharmony_ci addc r12,r12,r8 1341e1051a39Sopenharmony_ci adde r10,r10,r9 1342e1051a39Sopenharmony_ci addze r11,r11 1343e1051a39Sopenharmony_ci #mul_add_c(a[1],b[7],c3,c1,c2); 
1344e1051a39Sopenharmony_ci $LD r6,`1*$BNSZ`(r4) 1345e1051a39Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 1346e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1347e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1348e1051a39Sopenharmony_ci addc r12,r12,r8 1349e1051a39Sopenharmony_ci adde r10,r10,r9 1350e1051a39Sopenharmony_ci addze r11,r11 1351e1051a39Sopenharmony_ci $ST r12,`8*$BNSZ`(r3) #r[8]=c3; 1352e1051a39Sopenharmony_ci #mul_add_c(a[2],b[7],c1,c2,c3); 1353e1051a39Sopenharmony_ci $LD r6,`2*$BNSZ`(r4) 1354e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1355e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1356e1051a39Sopenharmony_ci addc r10,r10,r8 1357e1051a39Sopenharmony_ci adde r11,r11,r9 1358e1051a39Sopenharmony_ci addze r12,r0 1359e1051a39Sopenharmony_ci #mul_add_c(a[3],b[6],c1,c2,c3); 1360e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1361e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1362e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1363e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1364e1051a39Sopenharmony_ci addc r10,r10,r8 1365e1051a39Sopenharmony_ci adde r11,r11,r9 1366e1051a39Sopenharmony_ci addze r12,r12 1367e1051a39Sopenharmony_ci #mul_add_c(a[4],b[5],c1,c2,c3); 1368e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1369e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1370e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1371e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1372e1051a39Sopenharmony_ci addc r10,r10,r8 1373e1051a39Sopenharmony_ci adde r11,r11,r9 1374e1051a39Sopenharmony_ci addze r12,r12 1375e1051a39Sopenharmony_ci #mul_add_c(a[5],b[4],c1,c2,c3); 1376e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1377e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1378e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1379e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1380e1051a39Sopenharmony_ci addc r10,r10,r8 1381e1051a39Sopenharmony_ci adde r11,r11,r9 1382e1051a39Sopenharmony_ci addze r12,r12 1383e1051a39Sopenharmony_ci #mul_add_c(a[6],b[3],c1,c2,c3); 1384e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1385e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 
1386e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1387e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1388e1051a39Sopenharmony_ci addc r10,r10,r8 1389e1051a39Sopenharmony_ci adde r11,r11,r9 1390e1051a39Sopenharmony_ci addze r12,r12 1391e1051a39Sopenharmony_ci #mul_add_c(a[7],b[2],c1,c2,c3); 1392e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 1393e1051a39Sopenharmony_ci $LD r7,`2*$BNSZ`(r5) 1394e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1395e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1396e1051a39Sopenharmony_ci addc r10,r10,r8 1397e1051a39Sopenharmony_ci adde r11,r11,r9 1398e1051a39Sopenharmony_ci addze r12,r12 1399e1051a39Sopenharmony_ci $ST r10,`9*$BNSZ`(r3) #r[9]=c1; 1400e1051a39Sopenharmony_ci #mul_add_c(a[7],b[3],c2,c3,c1); 1401e1051a39Sopenharmony_ci $LD r7,`3*$BNSZ`(r5) 1402e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1403e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1404e1051a39Sopenharmony_ci addc r11,r11,r8 1405e1051a39Sopenharmony_ci adde r12,r12,r9 1406e1051a39Sopenharmony_ci addze r10,r0 1407e1051a39Sopenharmony_ci #mul_add_c(a[6],b[4],c2,c3,c1); 1408e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1409e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1410e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1411e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1412e1051a39Sopenharmony_ci addc r11,r11,r8 1413e1051a39Sopenharmony_ci adde r12,r12,r9 1414e1051a39Sopenharmony_ci addze r10,r10 1415e1051a39Sopenharmony_ci #mul_add_c(a[5],b[5],c2,c3,c1); 1416e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1417e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1418e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1419e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1420e1051a39Sopenharmony_ci addc r11,r11,r8 1421e1051a39Sopenharmony_ci adde r12,r12,r9 1422e1051a39Sopenharmony_ci addze r10,r10 1423e1051a39Sopenharmony_ci #mul_add_c(a[4],b[6],c2,c3,c1); 1424e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1425e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1426e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1427e1051a39Sopenharmony_ci $UMULH r9,r6,r7 
1428e1051a39Sopenharmony_ci addc r11,r11,r8 1429e1051a39Sopenharmony_ci adde r12,r12,r9 1430e1051a39Sopenharmony_ci addze r10,r10 1431e1051a39Sopenharmony_ci #mul_add_c(a[3],b[7],c2,c3,c1); 1432e1051a39Sopenharmony_ci $LD r6,`3*$BNSZ`(r4) 1433e1051a39Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 1434e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1435e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1436e1051a39Sopenharmony_ci addc r11,r11,r8 1437e1051a39Sopenharmony_ci adde r12,r12,r9 1438e1051a39Sopenharmony_ci addze r10,r10 1439e1051a39Sopenharmony_ci $ST r11,`10*$BNSZ`(r3) #r[10]=c2; 1440e1051a39Sopenharmony_ci #mul_add_c(a[4],b[7],c3,c1,c2); 1441e1051a39Sopenharmony_ci $LD r6,`4*$BNSZ`(r4) 1442e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1443e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1444e1051a39Sopenharmony_ci addc r12,r12,r8 1445e1051a39Sopenharmony_ci adde r10,r10,r9 1446e1051a39Sopenharmony_ci addze r11,r0 1447e1051a39Sopenharmony_ci #mul_add_c(a[5],b[6],c3,c1,c2); 1448e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1449e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1450e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1451e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1452e1051a39Sopenharmony_ci addc r12,r12,r8 1453e1051a39Sopenharmony_ci adde r10,r10,r9 1454e1051a39Sopenharmony_ci addze r11,r11 1455e1051a39Sopenharmony_ci #mul_add_c(a[6],b[5],c3,c1,c2); 1456e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1457e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1458e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1459e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1460e1051a39Sopenharmony_ci addc r12,r12,r8 1461e1051a39Sopenharmony_ci adde r10,r10,r9 1462e1051a39Sopenharmony_ci addze r11,r11 1463e1051a39Sopenharmony_ci #mul_add_c(a[7],b[4],c3,c1,c2); 1464e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 1465e1051a39Sopenharmony_ci $LD r7,`4*$BNSZ`(r5) 1466e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1467e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1468e1051a39Sopenharmony_ci addc r12,r12,r8 1469e1051a39Sopenharmony_ci adde r10,r10,r9 
1470e1051a39Sopenharmony_ci addze r11,r11 1471e1051a39Sopenharmony_ci $ST r12,`11*$BNSZ`(r3) #r[11]=c3; 1472e1051a39Sopenharmony_ci #mul_add_c(a[7],b[5],c1,c2,c3); 1473e1051a39Sopenharmony_ci $LD r7,`5*$BNSZ`(r5) 1474e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1475e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1476e1051a39Sopenharmony_ci addc r10,r10,r8 1477e1051a39Sopenharmony_ci adde r11,r11,r9 1478e1051a39Sopenharmony_ci addze r12,r0 1479e1051a39Sopenharmony_ci #mul_add_c(a[6],b[6],c1,c2,c3); 1480e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1481e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1482e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1483e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1484e1051a39Sopenharmony_ci addc r10,r10,r8 1485e1051a39Sopenharmony_ci adde r11,r11,r9 1486e1051a39Sopenharmony_ci addze r12,r12 1487e1051a39Sopenharmony_ci #mul_add_c(a[5],b[7],c1,c2,c3); 1488e1051a39Sopenharmony_ci $LD r6,`5*$BNSZ`(r4) 1489e1051a39Sopenharmony_ci $LD r7,`7*$BNSZ`(r5) 1490e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1491e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1492e1051a39Sopenharmony_ci addc r10,r10,r8 1493e1051a39Sopenharmony_ci adde r11,r11,r9 1494e1051a39Sopenharmony_ci addze r12,r12 1495e1051a39Sopenharmony_ci $ST r10,`12*$BNSZ`(r3) #r[12]=c1; 1496e1051a39Sopenharmony_ci #mul_add_c(a[6],b[7],c2,c3,c1); 1497e1051a39Sopenharmony_ci $LD r6,`6*$BNSZ`(r4) 1498e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1499e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1500e1051a39Sopenharmony_ci addc r11,r11,r8 1501e1051a39Sopenharmony_ci adde r12,r12,r9 1502e1051a39Sopenharmony_ci addze r10,r0 1503e1051a39Sopenharmony_ci #mul_add_c(a[7],b[6],c2,c3,c1); 1504e1051a39Sopenharmony_ci $LD r6,`7*$BNSZ`(r4) 1505e1051a39Sopenharmony_ci $LD r7,`6*$BNSZ`(r5) 1506e1051a39Sopenharmony_ci $UMULL r8,r6,r7 1507e1051a39Sopenharmony_ci $UMULH r9,r6,r7 1508e1051a39Sopenharmony_ci addc r11,r11,r8 1509e1051a39Sopenharmony_ci adde r12,r12,r9 1510e1051a39Sopenharmony_ci addze r10,r10 1511e1051a39Sopenharmony_ci $ST r11,`13*$BNSZ`(r3) 
#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba8,.-.bn_mul_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Returns the final borrow (0 or 1) in r3.
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ	# pre-decrement the pointers: the loop
	addi	r3,r3,-$BNSZ	# below uses update-form loads/stores
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1         # keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_sub_words,.-.bn_sub_words

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Returns the final carry (0 or 1) in r3.
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ	# pre-decrement the pointers: the loop
	addi	r3,r3,-$BNSZ	# below uses update-form loads/stores
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_add_words,.-.bn_add_words

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of call to num_bits_word. Since this was compiled
#	only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d
#
#	Returns -1 in r3 when d == 0; otherwise r3 holds the
#	quotient, built from two half-word digits (the counter
#	register is loaded with 2 below).

	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there're, signal to dump core...
					# NOTE(review): TO=16 selects the signed
					# "less than" trap condition in the Power
					# ISA, so this fires only when r9 < 0 as a
					# signed value -- confirm that matches the
					# "any bits set" intent stated above.
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_div_words,.-.bn_div_words

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	Each iteration stores the low word and then the high word
#	of a[i]*a[i], so r receives two words per input word.
#
#	No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
					#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH  r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_sqr_words,.-.bn_sqr_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Returns the final carry word (kept in r12) in r3.
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_words,.-.bn_mul_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Returns the final carry word (kept in r12) in r3.
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12  		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
					#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

					#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH  r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH  r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH  r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH  r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_add_words,.-.bn_mul_add_words
	.align	4
EOF
# Replace every backtick-delimited span in the template with the result of
# evaluating it as a Perl expression, then emit the finished assembly.
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
# Buffered write errors only surface at close, so check it explicitly.
close STDOUT or die "error closing STDOUT: $!";