11cb0ef41Sopenharmony_ci#! /usr/bin/env perl
21cb0ef41Sopenharmony_ci# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
31cb0ef41Sopenharmony_ci#
41cb0ef41Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
51cb0ef41Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
61cb0ef41Sopenharmony_ci# in the file LICENSE in the source distribution or at
71cb0ef41Sopenharmony_ci# https://www.openssl.org/source/license.html
81cb0ef41Sopenharmony_ci
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
121cb0ef41Sopenharmony_ci#
131cb0ef41Sopenharmony_ci# It should be noted though that this perl code is nothing like
141cb0ef41Sopenharmony_ci# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
151cb0ef41Sopenharmony_ci# as pre-processor to cover for platform differences in name decoration,
161cb0ef41Sopenharmony_ci# linker tables, 32-/64-bit instruction sets...
171cb0ef41Sopenharmony_ci#
# As you might know, there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
231cb0ef41Sopenharmony_ci#
241cb0ef41Sopenharmony_ci#       AIX performance
251cb0ef41Sopenharmony_ci#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
271cb0ef41Sopenharmony_ci#
281cb0ef41Sopenharmony_ci#	The following is the performance of 32-bit compiler
291cb0ef41Sopenharmony_ci#	generated code:
301cb0ef41Sopenharmony_ci#
311cb0ef41Sopenharmony_ci#	OpenSSL 0.9.6c 21 dec 2001
321cb0ef41Sopenharmony_ci#	built on: Tue Jun 11 11:06:51 EDT 2002
331cb0ef41Sopenharmony_ci#	options:bn(64,32) ...
341cb0ef41Sopenharmony_ci#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
351cb0ef41Sopenharmony_ci#                  sign    verify    sign/s verify/s
361cb0ef41Sopenharmony_ci#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
371cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
381cb0ef41Sopenharmony_ci#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
391cb0ef41Sopenharmony_ci#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
401cb0ef41Sopenharmony_ci#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
411cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
421cb0ef41Sopenharmony_ci#
431cb0ef41Sopenharmony_ci#	Same benchmark with this assembler code:
441cb0ef41Sopenharmony_ci#
451cb0ef41Sopenharmony_ci#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
461cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
471cb0ef41Sopenharmony_ci#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
481cb0ef41Sopenharmony_ci#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
491cb0ef41Sopenharmony_ci#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
501cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
511cb0ef41Sopenharmony_ci#
#	Number of operations increases by almost 75%
531cb0ef41Sopenharmony_ci#
541cb0ef41Sopenharmony_ci#	Here are performance numbers for 64-bit compiler
551cb0ef41Sopenharmony_ci#	generated code:
561cb0ef41Sopenharmony_ci#
571cb0ef41Sopenharmony_ci#	OpenSSL 0.9.6g [engine] 9 Aug 2002
581cb0ef41Sopenharmony_ci#	built on: Fri Apr 18 16:59:20 EDT 2003
591cb0ef41Sopenharmony_ci#	options:bn(64,64) ...
601cb0ef41Sopenharmony_ci#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
611cb0ef41Sopenharmony_ci#                  sign    verify    sign/s verify/s
621cb0ef41Sopenharmony_ci#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
631cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
641cb0ef41Sopenharmony_ci#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
651cb0ef41Sopenharmony_ci#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
661cb0ef41Sopenharmony_ci#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
671cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
681cb0ef41Sopenharmony_ci#
691cb0ef41Sopenharmony_ci#	Same benchmark with this assembler code:
701cb0ef41Sopenharmony_ci#
711cb0ef41Sopenharmony_ci#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
721cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
731cb0ef41Sopenharmony_ci#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
741cb0ef41Sopenharmony_ci#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
751cb0ef41Sopenharmony_ci#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
761cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
771cb0ef41Sopenharmony_ci#
#	Again, performance increases by about 75%
791cb0ef41Sopenharmony_ci#
801cb0ef41Sopenharmony_ci#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
811cb0ef41Sopenharmony_ci#       OpenSSL 0.9.7c 30 Sep 2003
821cb0ef41Sopenharmony_ci#
831cb0ef41Sopenharmony_ci#       Original code.
841cb0ef41Sopenharmony_ci#
851cb0ef41Sopenharmony_ci#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
861cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
871cb0ef41Sopenharmony_ci#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
881cb0ef41Sopenharmony_ci#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
891cb0ef41Sopenharmony_ci#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
901cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
911cb0ef41Sopenharmony_ci#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
921cb0ef41Sopenharmony_ci#
931cb0ef41Sopenharmony_ci#       Same benchmark with this assembler code:
941cb0ef41Sopenharmony_ci#
951cb0ef41Sopenharmony_ci#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
961cb0ef41Sopenharmony_ci#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
971cb0ef41Sopenharmony_ci#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
981cb0ef41Sopenharmony_ci#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
991cb0ef41Sopenharmony_ci#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
1001cb0ef41Sopenharmony_ci#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
1011cb0ef41Sopenharmony_ci#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
1021cb0ef41Sopenharmony_ci#
1031cb0ef41Sopenharmony_ci#        Performance increase of ~60%
1041cb0ef41Sopenharmony_ci#        Based on submission from Suresh N. Chari of IBM
1051cb0ef41Sopenharmony_ci
# Command-line handling:
#   - $output  is the LAST argument, but only when it ends in a file
#     extension (a dot followed by word characters); it is then removed
#     from @ARGV.
#   - $flavour is the FIRST remaining argument, but only when it contains
#     no dot at all; it is then removed from @ARGV.
# Either variable is left undefined when no argument fits.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;
1101cb0ef41Sopenharmony_ci
# Pick the word size from $flavour.  A flavour containing "32" is checked
# first, then one containing "64"; anything else aborts generation before
# any of the variables below are set.
if    ($flavour =~ /32/) { $BITS = 32; }
elsif ($flavour =~ /64/) { $BITS = 64; }
else                     { die "nonsense $flavour"; }

$BNSZ  = $BITS/8;	# word size in bytes; scales the load/store
			# offsets used in the assembly template
$ISA   = $BITS == 32 ? "\"ppc\""  : "\"ppc64\"";

# Width-specific instruction mnemonics interpolated into the assembly
# template: the 32-bit (word) form on the left, the 64-bit (doubleword)
# form on the right.
$LD    = $BITS == 32 ? "lwz"     : "ld";	# load
$LDU   = $BITS == 32 ? "lwzu"    : "ldu";	# load and update
$ST    = $BITS == 32 ? "stw"     : "std";	# store
$STU   = $BITS == 32 ? "stwu"    : "stdu";	# store and update
$UMULL = $BITS == 32 ? "mullw"   : "mulld";	# unsigned multiply low
$UMULH = $BITS == 32 ? "mulhwu"  : "mulhdu";	# unsigned multiply high
$UDIV  = $BITS == 32 ? "divwu"   : "divdu";	# unsigned divide
$UCMPI = $BITS == 32 ? "cmplwi"  : "cmpldi";	# unsigned compare with immediate
$UCMP  = $BITS == 32 ? "cmplw"   : "cmpld";	# unsigned compare
$CNTLZ = $BITS == 32 ? "cntlzw"  : "cntlzd";	# count leading zeros
$SHL   = $BITS == 32 ? "slw"     : "sld";	# shift left
$SHR   = $BITS == 32 ? "srw"     : "srd";	# unsigned shift right
$SHRI  = $BITS == 32 ? "srwi"    : "srdi";	# unsigned shift right by immediate
$SHLI  = $BITS == 32 ? "slwi"    : "sldi";	# shift left by immediate
$CLRU  = $BITS == 32 ? "clrlwi"  : "clrldi";	# clear upper bits
$INSR  = $BITS == 32 ? "insrwi"  : "insrdi";	# insert right
$ROTL  = $BITS == 32 ? "rotlwi"  : "rotldi";	# rotate left by immediate
$TR    = $BITS == 32 ? "tw"      : "td";	# conditional trap
1591cb0ef41Sopenharmony_ci
# Find the ppc-xlate.pl translator relative to this script.
# $dir is the directory part of $0, up to and including the last '/' or
# '\'.  The match result is not checked, so when $0 has no separator
# $dir simply takes whatever $1 currently holds.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;

# Look next to this script first, then in ../../perlasm/ relative to it.
$xlate = "${dir}ppc-xlate.pl";
unless (-f $xlate) {
    $xlate = "${dir}../../perlasm/ppc-xlate.pl";
    -f $xlate or die "can't locate ppc-xlate.pl";
}
1641cb0ef41Sopenharmony_ci
# Re-open STDOUT as a pipe into the translator: everything printed from
# here on is fed to a child perl ($^X) running $xlate, which receives
# $flavour and the quoted $output as its arguments.
unless (open(STDOUT, "| $^X $xlate $flavour \"$output\"")) {
    die "can't call $xlate: $!";
}
1671cb0ef41Sopenharmony_ci
1681cb0ef41Sopenharmony_ci$data=<<EOF;
1691cb0ef41Sopenharmony_ci#--------------------------------------------------------------------
1701cb0ef41Sopenharmony_ci#
1711cb0ef41Sopenharmony_ci#
1721cb0ef41Sopenharmony_ci#
1731cb0ef41Sopenharmony_ci#
1741cb0ef41Sopenharmony_ci#	File:		ppc32.s
1751cb0ef41Sopenharmony_ci#
1761cb0ef41Sopenharmony_ci#	Created by:	Suresh Chari
1771cb0ef41Sopenharmony_ci#			IBM Thomas J. Watson Research Library
1781cb0ef41Sopenharmony_ci#			Hawthorne, NY
1791cb0ef41Sopenharmony_ci#
1801cb0ef41Sopenharmony_ci#
1811cb0ef41Sopenharmony_ci#	Description:	Optimized assembly routines for OpenSSL crypto
1821cb0ef41Sopenharmony_ci#			on the 32 bitPowerPC platform.
1831cb0ef41Sopenharmony_ci#
1841cb0ef41Sopenharmony_ci#
1851cb0ef41Sopenharmony_ci#	Version History
1861cb0ef41Sopenharmony_ci#
1871cb0ef41Sopenharmony_ci#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
1881cb0ef41Sopenharmony_ci#	   cleaned up code. Also made a single version which can
1891cb0ef41Sopenharmony_ci#	   be used for both the AIX and Linux compilers. See NOTE
1901cb0ef41Sopenharmony_ci#	   below.
1911cb0ef41Sopenharmony_ci#				12/05/03		Suresh Chari
1921cb0ef41Sopenharmony_ci#			(with lots of help from)        Andy Polyakov
1931cb0ef41Sopenharmony_ci##
1941cb0ef41Sopenharmony_ci#	1. Initial version	10/20/02		Suresh Chari
1951cb0ef41Sopenharmony_ci#
1961cb0ef41Sopenharmony_ci#
1971cb0ef41Sopenharmony_ci#	The following file works for the xlc,cc
1981cb0ef41Sopenharmony_ci#	and gcc compilers.
1991cb0ef41Sopenharmony_ci#
2001cb0ef41Sopenharmony_ci#	NOTE:	To get the file to link correctly with the gcc compiler
2011cb0ef41Sopenharmony_ci#	        you have to change the names of the routines and remove
2021cb0ef41Sopenharmony_ci#		the first .(dot) character. This should automatically
2031cb0ef41Sopenharmony_ci#		be done in the build process.
2041cb0ef41Sopenharmony_ci#
2051cb0ef41Sopenharmony_ci#	Hand optimized assembly code for the following routines
2061cb0ef41Sopenharmony_ci#
2071cb0ef41Sopenharmony_ci#	bn_sqr_comba4
2081cb0ef41Sopenharmony_ci#	bn_sqr_comba8
2091cb0ef41Sopenharmony_ci#	bn_mul_comba4
2101cb0ef41Sopenharmony_ci#	bn_mul_comba8
2111cb0ef41Sopenharmony_ci#	bn_sub_words
2121cb0ef41Sopenharmony_ci#	bn_add_words
2131cb0ef41Sopenharmony_ci#	bn_div_words
2141cb0ef41Sopenharmony_ci#	bn_sqr_words
2151cb0ef41Sopenharmony_ci#	bn_mul_words
2161cb0ef41Sopenharmony_ci#	bn_mul_add_words
2171cb0ef41Sopenharmony_ci#
2181cb0ef41Sopenharmony_ci#	NOTE:	It is possible to optimize this code more for
2191cb0ef41Sopenharmony_ci#	specific PowerPC or Power architectures. On the Northstar
2201cb0ef41Sopenharmony_ci#	architecture the optimizations in this file do
2211cb0ef41Sopenharmony_ci#	 NOT provide much improvement.
2221cb0ef41Sopenharmony_ci#
2231cb0ef41Sopenharmony_ci#	If you have comments or suggestions to improve code send
2241cb0ef41Sopenharmony_ci#	me a note at schari\@us.ibm.com
2251cb0ef41Sopenharmony_ci#
2261cb0ef41Sopenharmony_ci#--------------------------------------------------------------------------
2271cb0ef41Sopenharmony_ci#
2281cb0ef41Sopenharmony_ci#	Defines to be used in the assembly code.
2291cb0ef41Sopenharmony_ci#
2301cb0ef41Sopenharmony_ci#.set r0,0	# we use it as storage for value of 0
2311cb0ef41Sopenharmony_ci#.set SP,1	# preserved
2321cb0ef41Sopenharmony_ci#.set RTOC,2	# preserved
2331cb0ef41Sopenharmony_ci#.set r3,3	# 1st argument/return value
2341cb0ef41Sopenharmony_ci#.set r4,4	# 2nd argument/volatile register
2351cb0ef41Sopenharmony_ci#.set r5,5	# 3rd argument/volatile register
2361cb0ef41Sopenharmony_ci#.set r6,6	# ...
2371cb0ef41Sopenharmony_ci#.set r7,7
2381cb0ef41Sopenharmony_ci#.set r8,8
2391cb0ef41Sopenharmony_ci#.set r9,9
2401cb0ef41Sopenharmony_ci#.set r10,10
2411cb0ef41Sopenharmony_ci#.set r11,11
2421cb0ef41Sopenharmony_ci#.set r12,12
2431cb0ef41Sopenharmony_ci#.set r13,13	# not used, nor any other "below" it...
2441cb0ef41Sopenharmony_ci
2451cb0ef41Sopenharmony_ci#	Declare function names to be global
2461cb0ef41Sopenharmony_ci#	NOTE:	For gcc these names MUST be changed to remove
2471cb0ef41Sopenharmony_ci#	        the first . i.e. for example change ".bn_sqr_comba4"
2481cb0ef41Sopenharmony_ci#		to "bn_sqr_comba4". This should be automatically done
2491cb0ef41Sopenharmony_ci#		in the build.
2501cb0ef41Sopenharmony_ci
2511cb0ef41Sopenharmony_ci	.globl	.bn_sqr_comba4
2521cb0ef41Sopenharmony_ci	.globl	.bn_sqr_comba8
2531cb0ef41Sopenharmony_ci	.globl	.bn_mul_comba4
2541cb0ef41Sopenharmony_ci	.globl	.bn_mul_comba8
2551cb0ef41Sopenharmony_ci	.globl	.bn_sub_words
2561cb0ef41Sopenharmony_ci	.globl	.bn_add_words
2571cb0ef41Sopenharmony_ci	.globl	.bn_div_words
2581cb0ef41Sopenharmony_ci	.globl	.bn_sqr_words
2591cb0ef41Sopenharmony_ci	.globl	.bn_mul_words
2601cb0ef41Sopenharmony_ci	.globl	.bn_mul_add_words
2611cb0ef41Sopenharmony_ci
2621cb0ef41Sopenharmony_ci# .text section
2631cb0ef41Sopenharmony_ci
2641cb0ef41Sopenharmony_ci	.machine	"any"
2651cb0ef41Sopenharmony_ci	.text
2661cb0ef41Sopenharmony_ci
2671cb0ef41Sopenharmony_ci#
2681cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
2691cb0ef41Sopenharmony_ci#		"bn_sqr_comba4" i.e. remove the first dot
2701cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
2711cb0ef41Sopenharmony_ci#		done in the build
2721cb0ef41Sopenharmony_ci#
2731cb0ef41Sopenharmony_ci
2741cb0ef41Sopenharmony_ci.align	4
2751cb0ef41Sopenharmony_ci.bn_sqr_comba4:
2761cb0ef41Sopenharmony_ci#
2771cb0ef41Sopenharmony_ci# Optimized version of bn_sqr_comba4.
2781cb0ef41Sopenharmony_ci#
2791cb0ef41Sopenharmony_ci# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
2801cb0ef41Sopenharmony_ci# r3 contains r
2811cb0ef41Sopenharmony_ci# r4 contains a
2821cb0ef41Sopenharmony_ci#
2831cb0ef41Sopenharmony_ci# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
2841cb0ef41Sopenharmony_ci#
2851cb0ef41Sopenharmony_ci# r5,r6 are the two BN_ULONGs being multiplied.
2861cb0ef41Sopenharmony_ci# r7,r8 are the results of the 32x32 giving 64 bit multiply.
2871cb0ef41Sopenharmony_ci# r9,r10, r11 are the equivalents of c1,c2, c3.
2881cb0ef41Sopenharmony_ci# Here's the assembly
2891cb0ef41Sopenharmony_ci#
2901cb0ef41Sopenharmony_ci#
2911cb0ef41Sopenharmony_ci	xor		r0,r0,r0		# set r0 = 0. Used in the addze
2921cb0ef41Sopenharmony_ci						# instructions below
2931cb0ef41Sopenharmony_ci
2941cb0ef41Sopenharmony_ci						#sqr_add_c(a,0,c1,c2,c3)
2951cb0ef41Sopenharmony_ci	$LD		r5,`0*$BNSZ`(r4)
2961cb0ef41Sopenharmony_ci	$UMULL		r9,r5,r5
2971cb0ef41Sopenharmony_ci	$UMULH		r10,r5,r5		#in first iteration. No need
2981cb0ef41Sopenharmony_ci						#to add since c1=c2=c3=0.
2991cb0ef41Sopenharmony_ci						# Note c3(r11) is NOT set to 0
3001cb0ef41Sopenharmony_ci						# but will be.
3011cb0ef41Sopenharmony_ci
3021cb0ef41Sopenharmony_ci	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
3031cb0ef41Sopenharmony_ci						# sqr_add_c2(a,1,0,c2,c3,c1);
3041cb0ef41Sopenharmony_ci	$LD		r6,`1*$BNSZ`(r4)
3051cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3061cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3071cb0ef41Sopenharmony_ci
3081cb0ef41Sopenharmony_ci	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
3091cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3101cb0ef41Sopenharmony_ci	addze		r9,r0			# catch carry if any.
3111cb0ef41Sopenharmony_ci						# r9= r0(=0) and carry
3121cb0ef41Sopenharmony_ci
3131cb0ef41Sopenharmony_ci	addc		r10,r7,r10		# now add to temp result.
3141cb0ef41Sopenharmony_ci	addze		r11,r8                  # r8 added to r11 which is 0
3151cb0ef41Sopenharmony_ci	addze		r9,r9
3161cb0ef41Sopenharmony_ci
3171cb0ef41Sopenharmony_ci	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
3181cb0ef41Sopenharmony_ci						#sqr_add_c(a,1,c3,c1,c2)
3191cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
3201cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
3211cb0ef41Sopenharmony_ci	addc		r11,r7,r11
3221cb0ef41Sopenharmony_ci	adde		r9,r8,r9
3231cb0ef41Sopenharmony_ci	addze		r10,r0
3241cb0ef41Sopenharmony_ci						#sqr_add_c2(a,2,0,c3,c1,c2)
3251cb0ef41Sopenharmony_ci	$LD		r6,`2*$BNSZ`(r4)
3261cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3271cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3281cb0ef41Sopenharmony_ci
3291cb0ef41Sopenharmony_ci	addc		r7,r7,r7
3301cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3311cb0ef41Sopenharmony_ci	addze		r10,r10
3321cb0ef41Sopenharmony_ci
3331cb0ef41Sopenharmony_ci	addc		r11,r7,r11
3341cb0ef41Sopenharmony_ci	adde		r9,r8,r9
3351cb0ef41Sopenharmony_ci	addze		r10,r10
3361cb0ef41Sopenharmony_ci	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
3371cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,0,c1,c2,c3);
3381cb0ef41Sopenharmony_ci	$LD		r6,`3*$BNSZ`(r4)
3391cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3401cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3411cb0ef41Sopenharmony_ci	addc		r7,r7,r7
3421cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3431cb0ef41Sopenharmony_ci	addze		r11,r0
3441cb0ef41Sopenharmony_ci
3451cb0ef41Sopenharmony_ci	addc		r9,r7,r9
3461cb0ef41Sopenharmony_ci	adde		r10,r8,r10
3471cb0ef41Sopenharmony_ci	addze		r11,r11
3481cb0ef41Sopenharmony_ci						#sqr_add_c2(a,2,1,c1,c2,c3);
3491cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
3501cb0ef41Sopenharmony_ci	$LD		r6,`2*$BNSZ`(r4)
3511cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3521cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3531cb0ef41Sopenharmony_ci
3541cb0ef41Sopenharmony_ci	addc		r7,r7,r7
3551cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3561cb0ef41Sopenharmony_ci	addze		r11,r11
3571cb0ef41Sopenharmony_ci	addc		r9,r7,r9
3581cb0ef41Sopenharmony_ci	adde		r10,r8,r10
3591cb0ef41Sopenharmony_ci	addze		r11,r11
3601cb0ef41Sopenharmony_ci	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
3611cb0ef41Sopenharmony_ci						#sqr_add_c(a,2,c2,c3,c1);
3621cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
3631cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
3641cb0ef41Sopenharmony_ci	addc		r10,r7,r10
3651cb0ef41Sopenharmony_ci	adde		r11,r8,r11
3661cb0ef41Sopenharmony_ci	addze		r9,r0
3671cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,1,c2,c3,c1);
3681cb0ef41Sopenharmony_ci	$LD		r6,`3*$BNSZ`(r4)
3691cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3701cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3711cb0ef41Sopenharmony_ci	addc		r7,r7,r7
3721cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3731cb0ef41Sopenharmony_ci	addze		r9,r9
3741cb0ef41Sopenharmony_ci
3751cb0ef41Sopenharmony_ci	addc		r10,r7,r10
3761cb0ef41Sopenharmony_ci	adde		r11,r8,r11
3771cb0ef41Sopenharmony_ci	addze		r9,r9
3781cb0ef41Sopenharmony_ci	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
3791cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,2,c3,c1,c2);
3801cb0ef41Sopenharmony_ci	$LD		r5,`2*$BNSZ`(r4)
3811cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
3821cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
3831cb0ef41Sopenharmony_ci	addc		r7,r7,r7
3841cb0ef41Sopenharmony_ci	adde		r8,r8,r8
3851cb0ef41Sopenharmony_ci	addze		r10,r0
3861cb0ef41Sopenharmony_ci
3871cb0ef41Sopenharmony_ci	addc		r11,r7,r11
3881cb0ef41Sopenharmony_ci	adde		r9,r8,r9
3891cb0ef41Sopenharmony_ci	addze		r10,r10
3901cb0ef41Sopenharmony_ci	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
3911cb0ef41Sopenharmony_ci						#sqr_add_c(a,3,c1,c2,c3);
3921cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
3931cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
3941cb0ef41Sopenharmony_ci	addc		r9,r7,r9
3951cb0ef41Sopenharmony_ci	adde		r10,r8,r10
3961cb0ef41Sopenharmony_ci
3971cb0ef41Sopenharmony_ci	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
3981cb0ef41Sopenharmony_ci	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
3991cb0ef41Sopenharmony_ci	blr
4001cb0ef41Sopenharmony_ci	.long	0
4011cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,2,0
4021cb0ef41Sopenharmony_ci	.long	0
4031cb0ef41Sopenharmony_ci.size	.bn_sqr_comba4,.-.bn_sqr_comba4
4041cb0ef41Sopenharmony_ci
4051cb0ef41Sopenharmony_ci#
4061cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
4071cb0ef41Sopenharmony_ci#		"bn_sqr_comba8" i.e. remove the first dot
4081cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
4091cb0ef41Sopenharmony_ci#		done in the build
4101cb0ef41Sopenharmony_ci#
4111cb0ef41Sopenharmony_ci
4121cb0ef41Sopenharmony_ci.align	4
4131cb0ef41Sopenharmony_ci.bn_sqr_comba8:
4141cb0ef41Sopenharmony_ci#
4151cb0ef41Sopenharmony_ci# This is an optimized version of the bn_sqr_comba8 routine.
4161cb0ef41Sopenharmony_ci# Tightly uses the adde instruction
4171cb0ef41Sopenharmony_ci#
4181cb0ef41Sopenharmony_ci#
4191cb0ef41Sopenharmony_ci# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
4201cb0ef41Sopenharmony_ci# r3 contains r
4211cb0ef41Sopenharmony_ci# r4 contains a
4221cb0ef41Sopenharmony_ci#
4231cb0ef41Sopenharmony_ci# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
4241cb0ef41Sopenharmony_ci#
4251cb0ef41Sopenharmony_ci# r5,r6 are the two BN_ULONGs being multiplied.
4261cb0ef41Sopenharmony_ci# r7,r8 are the results of the 32x32 giving 64 bit multiply.
4271cb0ef41Sopenharmony_ci# r9,r10, r11 are the equivalents of c1,c2, c3.
4281cb0ef41Sopenharmony_ci#
4291cb0ef41Sopenharmony_ci# Possible optimization of loading all 8 longs of a into registers
4301cb0ef41Sopenharmony_ci# doesn't provide any speedup
4311cb0ef41Sopenharmony_ci#
4321cb0ef41Sopenharmony_ci
4331cb0ef41Sopenharmony_ci	xor		r0,r0,r0		#set r0 = 0.Used in addze
4341cb0ef41Sopenharmony_ci						#instructions below.
4351cb0ef41Sopenharmony_ci
4361cb0ef41Sopenharmony_ci						#sqr_add_c(a,0,c1,c2,c3);
4371cb0ef41Sopenharmony_ci	$LD		r5,`0*$BNSZ`(r4)
4381cb0ef41Sopenharmony_ci	$UMULL		r9,r5,r5		#1st iteration:	no carries.
4391cb0ef41Sopenharmony_ci	$UMULH		r10,r5,r5
4401cb0ef41Sopenharmony_ci	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
4411cb0ef41Sopenharmony_ci						#sqr_add_c2(a,1,0,c2,c3,c1);
4421cb0ef41Sopenharmony_ci	$LD		r6,`1*$BNSZ`(r4)
4431cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
4441cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
4451cb0ef41Sopenharmony_ci
4461cb0ef41Sopenharmony_ci	addc		r10,r7,r10		#add the two register number
4471cb0ef41Sopenharmony_ci	adde		r11,r8,r0 		# (r8,r7) to the three register
4481cb0ef41Sopenharmony_ci	addze		r9,r0			# number (r9,r11,r10).NOTE:r0=0
4491cb0ef41Sopenharmony_ci
4501cb0ef41Sopenharmony_ci	addc		r10,r7,r10		#add the two register number
4511cb0ef41Sopenharmony_ci	adde		r11,r8,r11 		# (r8,r7) to the three register
4521cb0ef41Sopenharmony_ci	addze		r9,r9			# number (r9,r11,r10).
4531cb0ef41Sopenharmony_ci
4541cb0ef41Sopenharmony_ci	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
4551cb0ef41Sopenharmony_ci
4561cb0ef41Sopenharmony_ci						#sqr_add_c(a,1,c3,c1,c2);
4571cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
4581cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
4591cb0ef41Sopenharmony_ci	addc		r11,r7,r11
4601cb0ef41Sopenharmony_ci	adde		r9,r8,r9
4611cb0ef41Sopenharmony_ci	addze		r10,r0
4621cb0ef41Sopenharmony_ci						#sqr_add_c2(a,2,0,c3,c1,c2);
4631cb0ef41Sopenharmony_ci	$LD		r6,`2*$BNSZ`(r4)
4641cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
4651cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
4661cb0ef41Sopenharmony_ci
4671cb0ef41Sopenharmony_ci	addc		r11,r7,r11
4681cb0ef41Sopenharmony_ci	adde		r9,r8,r9
4691cb0ef41Sopenharmony_ci	addze		r10,r10
4701cb0ef41Sopenharmony_ci
4711cb0ef41Sopenharmony_ci	addc		r11,r7,r11
4721cb0ef41Sopenharmony_ci	adde		r9,r8,r9
4731cb0ef41Sopenharmony_ci	addze		r10,r10
4741cb0ef41Sopenharmony_ci
4751cb0ef41Sopenharmony_ci	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
4761cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,0,c1,c2,c3);
4771cb0ef41Sopenharmony_ci	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
4781cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
4791cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
4801cb0ef41Sopenharmony_ci
4811cb0ef41Sopenharmony_ci	addc		r9,r7,r9
4821cb0ef41Sopenharmony_ci	adde		r10,r8,r10
4831cb0ef41Sopenharmony_ci	addze		r11,r0
4841cb0ef41Sopenharmony_ci
4851cb0ef41Sopenharmony_ci	addc		r9,r7,r9
4861cb0ef41Sopenharmony_ci	adde		r10,r8,r10
4871cb0ef41Sopenharmony_ci	addze		r11,r11
4881cb0ef41Sopenharmony_ci						#sqr_add_c2(a,2,1,c1,c2,c3);
4891cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
4901cb0ef41Sopenharmony_ci	$LD		r6,`2*$BNSZ`(r4)
4911cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
4921cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
4931cb0ef41Sopenharmony_ci
4941cb0ef41Sopenharmony_ci	addc		r9,r7,r9
4951cb0ef41Sopenharmony_ci	adde		r10,r8,r10
4961cb0ef41Sopenharmony_ci	addze		r11,r11
4971cb0ef41Sopenharmony_ci
4981cb0ef41Sopenharmony_ci	addc		r9,r7,r9
4991cb0ef41Sopenharmony_ci	adde		r10,r8,r10
5001cb0ef41Sopenharmony_ci	addze		r11,r11
5011cb0ef41Sopenharmony_ci
5021cb0ef41Sopenharmony_ci	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
5031cb0ef41Sopenharmony_ci						#sqr_add_c(a,2,c2,c3,c1);
5041cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
5051cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
5061cb0ef41Sopenharmony_ci
5071cb0ef41Sopenharmony_ci	addc		r10,r7,r10
5081cb0ef41Sopenharmony_ci	adde		r11,r8,r11
5091cb0ef41Sopenharmony_ci	addze		r9,r0
5101cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,1,c2,c3,c1);
5111cb0ef41Sopenharmony_ci	$LD		r6,`3*$BNSZ`(r4)
5121cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5131cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5141cb0ef41Sopenharmony_ci
5151cb0ef41Sopenharmony_ci	addc		r10,r7,r10
5161cb0ef41Sopenharmony_ci	adde		r11,r8,r11
5171cb0ef41Sopenharmony_ci	addze		r9,r9
5181cb0ef41Sopenharmony_ci
5191cb0ef41Sopenharmony_ci	addc		r10,r7,r10
5201cb0ef41Sopenharmony_ci	adde		r11,r8,r11
5211cb0ef41Sopenharmony_ci	addze		r9,r9
5221cb0ef41Sopenharmony_ci						#sqr_add_c2(a,4,0,c2,c3,c1);
5231cb0ef41Sopenharmony_ci	$LD		r5,`0*$BNSZ`(r4)
5241cb0ef41Sopenharmony_ci	$LD		r6,`4*$BNSZ`(r4)
5251cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5261cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5271cb0ef41Sopenharmony_ci
5281cb0ef41Sopenharmony_ci	addc		r10,r7,r10
5291cb0ef41Sopenharmony_ci	adde		r11,r8,r11
5301cb0ef41Sopenharmony_ci	addze		r9,r9
5311cb0ef41Sopenharmony_ci
5321cb0ef41Sopenharmony_ci	addc		r10,r7,r10
5331cb0ef41Sopenharmony_ci	adde		r11,r8,r11
5341cb0ef41Sopenharmony_ci	addze		r9,r9
5351cb0ef41Sopenharmony_ci	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
5361cb0ef41Sopenharmony_ci						#sqr_add_c2(a,5,0,c3,c1,c2);
5371cb0ef41Sopenharmony_ci	$LD		r6,`5*$BNSZ`(r4)
5381cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5391cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5401cb0ef41Sopenharmony_ci
5411cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5421cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5431cb0ef41Sopenharmony_ci	addze		r10,r0
5441cb0ef41Sopenharmony_ci
5451cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5461cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5471cb0ef41Sopenharmony_ci	addze		r10,r10
5481cb0ef41Sopenharmony_ci						#sqr_add_c2(a,4,1,c3,c1,c2);
5491cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
5501cb0ef41Sopenharmony_ci	$LD		r6,`4*$BNSZ`(r4)
5511cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5521cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5531cb0ef41Sopenharmony_ci
5541cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5551cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5561cb0ef41Sopenharmony_ci	addze		r10,r10
5571cb0ef41Sopenharmony_ci
5581cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5591cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5601cb0ef41Sopenharmony_ci	addze		r10,r10
5611cb0ef41Sopenharmony_ci						#sqr_add_c2(a,3,2,c3,c1,c2);
5621cb0ef41Sopenharmony_ci	$LD		r5,`2*$BNSZ`(r4)
5631cb0ef41Sopenharmony_ci	$LD		r6,`3*$BNSZ`(r4)
5641cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5651cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5661cb0ef41Sopenharmony_ci
5671cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5681cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5691cb0ef41Sopenharmony_ci	addze		r10,r10
5701cb0ef41Sopenharmony_ci
5711cb0ef41Sopenharmony_ci	addc		r11,r7,r11
5721cb0ef41Sopenharmony_ci	adde		r9,r8,r9
5731cb0ef41Sopenharmony_ci	addze		r10,r10
5741cb0ef41Sopenharmony_ci	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
5751cb0ef41Sopenharmony_ci						#sqr_add_c(a,3,c1,c2,c3);
5761cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
5771cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
5781cb0ef41Sopenharmony_ci	addc		r9,r7,r9
5791cb0ef41Sopenharmony_ci	adde		r10,r8,r10
5801cb0ef41Sopenharmony_ci	addze		r11,r0
5811cb0ef41Sopenharmony_ci						#sqr_add_c2(a,4,2,c1,c2,c3);
5821cb0ef41Sopenharmony_ci	$LD		r6,`4*$BNSZ`(r4)
5831cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5841cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5851cb0ef41Sopenharmony_ci
5861cb0ef41Sopenharmony_ci	addc		r9,r7,r9
5871cb0ef41Sopenharmony_ci	adde		r10,r8,r10
5881cb0ef41Sopenharmony_ci	addze		r11,r11
5891cb0ef41Sopenharmony_ci
5901cb0ef41Sopenharmony_ci	addc		r9,r7,r9
5911cb0ef41Sopenharmony_ci	adde		r10,r8,r10
5921cb0ef41Sopenharmony_ci	addze		r11,r11
5931cb0ef41Sopenharmony_ci						#sqr_add_c2(a,5,1,c1,c2,c3);
5941cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
5951cb0ef41Sopenharmony_ci	$LD		r6,`5*$BNSZ`(r4)
5961cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
5971cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
5981cb0ef41Sopenharmony_ci
5991cb0ef41Sopenharmony_ci	addc		r9,r7,r9
6001cb0ef41Sopenharmony_ci	adde		r10,r8,r10
6011cb0ef41Sopenharmony_ci	addze		r11,r11
6021cb0ef41Sopenharmony_ci
6031cb0ef41Sopenharmony_ci	addc		r9,r7,r9
6041cb0ef41Sopenharmony_ci	adde		r10,r8,r10
6051cb0ef41Sopenharmony_ci	addze		r11,r11
6061cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,0,c1,c2,c3);
6071cb0ef41Sopenharmony_ci	$LD		r5,`0*$BNSZ`(r4)
6081cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
6091cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6101cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6111cb0ef41Sopenharmony_ci	addc		r9,r7,r9
6121cb0ef41Sopenharmony_ci	adde		r10,r8,r10
6131cb0ef41Sopenharmony_ci	addze		r11,r11
6141cb0ef41Sopenharmony_ci	addc		r9,r7,r9
6151cb0ef41Sopenharmony_ci	adde		r10,r8,r10
6161cb0ef41Sopenharmony_ci	addze		r11,r11
6171cb0ef41Sopenharmony_ci	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
6181cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,0,c2,c3,c1);
6191cb0ef41Sopenharmony_ci	$LD		r6,`7*$BNSZ`(r4)
6201cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6211cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6221cb0ef41Sopenharmony_ci
6231cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6241cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6251cb0ef41Sopenharmony_ci	addze		r9,r0
6261cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6271cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6281cb0ef41Sopenharmony_ci	addze		r9,r9
6291cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,1,c2,c3,c1);
6301cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
6311cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
6321cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6331cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6341cb0ef41Sopenharmony_ci
6351cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6361cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6371cb0ef41Sopenharmony_ci	addze		r9,r9
6381cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6391cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6401cb0ef41Sopenharmony_ci	addze		r9,r9
6411cb0ef41Sopenharmony_ci						#sqr_add_c2(a,5,2,c2,c3,c1);
6421cb0ef41Sopenharmony_ci	$LD		r5,`2*$BNSZ`(r4)
6431cb0ef41Sopenharmony_ci	$LD		r6,`5*$BNSZ`(r4)
6441cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6451cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6461cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6471cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6481cb0ef41Sopenharmony_ci	addze		r9,r9
6491cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6501cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6511cb0ef41Sopenharmony_ci	addze		r9,r9
6521cb0ef41Sopenharmony_ci						#sqr_add_c2(a,4,3,c2,c3,c1);
6531cb0ef41Sopenharmony_ci	$LD		r5,`3*$BNSZ`(r4)
6541cb0ef41Sopenharmony_ci	$LD		r6,`4*$BNSZ`(r4)
6551cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6561cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6571cb0ef41Sopenharmony_ci
6581cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6591cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6601cb0ef41Sopenharmony_ci	addze		r9,r9
6611cb0ef41Sopenharmony_ci	addc		r10,r7,r10
6621cb0ef41Sopenharmony_ci	adde		r11,r8,r11
6631cb0ef41Sopenharmony_ci	addze		r9,r9
6641cb0ef41Sopenharmony_ci	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
6651cb0ef41Sopenharmony_ci						#sqr_add_c(a,4,c3,c1,c2);
6661cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
6671cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
6681cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6691cb0ef41Sopenharmony_ci	adde		r9,r8,r9
6701cb0ef41Sopenharmony_ci	addze		r10,r0
6711cb0ef41Sopenharmony_ci						#sqr_add_c2(a,5,3,c3,c1,c2);
6721cb0ef41Sopenharmony_ci	$LD		r6,`5*$BNSZ`(r4)
6731cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6741cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6751cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6761cb0ef41Sopenharmony_ci	adde		r9,r8,r9
6771cb0ef41Sopenharmony_ci	addze		r10,r10
6781cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6791cb0ef41Sopenharmony_ci	adde		r9,r8,r9
6801cb0ef41Sopenharmony_ci	addze		r10,r10
6811cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,2,c3,c1,c2);
6821cb0ef41Sopenharmony_ci	$LD		r5,`2*$BNSZ`(r4)
6831cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
6841cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6851cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6861cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6871cb0ef41Sopenharmony_ci	adde		r9,r8,r9
6881cb0ef41Sopenharmony_ci	addze		r10,r10
6891cb0ef41Sopenharmony_ci
6901cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6911cb0ef41Sopenharmony_ci	adde		r9,r8,r9
6921cb0ef41Sopenharmony_ci	addze		r10,r10
6931cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,1,c3,c1,c2);
6941cb0ef41Sopenharmony_ci	$LD		r5,`1*$BNSZ`(r4)
6951cb0ef41Sopenharmony_ci	$LD		r6,`7*$BNSZ`(r4)
6961cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
6971cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
6981cb0ef41Sopenharmony_ci	addc		r11,r7,r11
6991cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7001cb0ef41Sopenharmony_ci	addze		r10,r10
7011cb0ef41Sopenharmony_ci	addc		r11,r7,r11
7021cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7031cb0ef41Sopenharmony_ci	addze		r10,r10
7041cb0ef41Sopenharmony_ci	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
7051cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,2,c1,c2,c3);
7061cb0ef41Sopenharmony_ci	$LD		r5,`2*$BNSZ`(r4)
7071cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7081cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7091cb0ef41Sopenharmony_ci
7101cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7111cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7121cb0ef41Sopenharmony_ci	addze		r11,r0
7131cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7141cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7151cb0ef41Sopenharmony_ci	addze		r11,r11
7161cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,3,c1,c2,c3);
7171cb0ef41Sopenharmony_ci	$LD		r5,`3*$BNSZ`(r4)
7181cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
7191cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7201cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7211cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7221cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7231cb0ef41Sopenharmony_ci	addze		r11,r11
7241cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7251cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7261cb0ef41Sopenharmony_ci	addze		r11,r11
7271cb0ef41Sopenharmony_ci						#sqr_add_c2(a,5,4,c1,c2,c3);
7281cb0ef41Sopenharmony_ci	$LD		r5,`4*$BNSZ`(r4)
7291cb0ef41Sopenharmony_ci	$LD		r6,`5*$BNSZ`(r4)
7301cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7311cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7321cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7331cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7341cb0ef41Sopenharmony_ci	addze		r11,r11
7351cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7361cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7371cb0ef41Sopenharmony_ci	addze		r11,r11
7381cb0ef41Sopenharmony_ci	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
7391cb0ef41Sopenharmony_ci						#sqr_add_c(a,5,c2,c3,c1);
7401cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
7411cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
7421cb0ef41Sopenharmony_ci	addc		r10,r7,r10
7431cb0ef41Sopenharmony_ci	adde		r11,r8,r11
7441cb0ef41Sopenharmony_ci	addze		r9,r0
7451cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,4,c2,c3,c1);
7461cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
7471cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7481cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7491cb0ef41Sopenharmony_ci	addc		r10,r7,r10
7501cb0ef41Sopenharmony_ci	adde		r11,r8,r11
7511cb0ef41Sopenharmony_ci	addze		r9,r9
7521cb0ef41Sopenharmony_ci	addc		r10,r7,r10
7531cb0ef41Sopenharmony_ci	adde		r11,r8,r11
7541cb0ef41Sopenharmony_ci	addze		r9,r9
7551cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,3,c2,c3,c1);
7561cb0ef41Sopenharmony_ci	$LD		r5,`3*$BNSZ`(r4)
7571cb0ef41Sopenharmony_ci	$LD		r6,`7*$BNSZ`(r4)
7581cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7591cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7601cb0ef41Sopenharmony_ci	addc		r10,r7,r10
7611cb0ef41Sopenharmony_ci	adde		r11,r8,r11
7621cb0ef41Sopenharmony_ci	addze		r9,r9
7631cb0ef41Sopenharmony_ci	addc		r10,r7,r10
7641cb0ef41Sopenharmony_ci	adde		r11,r8,r11
7651cb0ef41Sopenharmony_ci	addze		r9,r9
7661cb0ef41Sopenharmony_ci	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
7671cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,4,c3,c1,c2);
7681cb0ef41Sopenharmony_ci	$LD		r5,`4*$BNSZ`(r4)
7691cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7701cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7711cb0ef41Sopenharmony_ci	addc		r11,r7,r11
7721cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7731cb0ef41Sopenharmony_ci	addze		r10,r0
7741cb0ef41Sopenharmony_ci	addc		r11,r7,r11
7751cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7761cb0ef41Sopenharmony_ci	addze		r10,r10
7771cb0ef41Sopenharmony_ci						#sqr_add_c2(a,6,5,c3,c1,c2);
7781cb0ef41Sopenharmony_ci	$LD		r5,`5*$BNSZ`(r4)
7791cb0ef41Sopenharmony_ci	$LD		r6,`6*$BNSZ`(r4)
7801cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7811cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7821cb0ef41Sopenharmony_ci	addc		r11,r7,r11
7831cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7841cb0ef41Sopenharmony_ci	addze		r10,r10
7851cb0ef41Sopenharmony_ci	addc		r11,r7,r11
7861cb0ef41Sopenharmony_ci	adde		r9,r8,r9
7871cb0ef41Sopenharmony_ci	addze		r10,r10
7881cb0ef41Sopenharmony_ci	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
7891cb0ef41Sopenharmony_ci						#sqr_add_c(a,6,c1,c2,c3);
7901cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
7911cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
7921cb0ef41Sopenharmony_ci	addc		r9,r7,r9
7931cb0ef41Sopenharmony_ci	adde		r10,r8,r10
7941cb0ef41Sopenharmony_ci	addze		r11,r0
7951cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,5,c1,c2,c3)
7961cb0ef41Sopenharmony_ci	$LD		r6,`7*$BNSZ`(r4)
7971cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
7981cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
7991cb0ef41Sopenharmony_ci	addc		r9,r7,r9
8001cb0ef41Sopenharmony_ci	adde		r10,r8,r10
8011cb0ef41Sopenharmony_ci	addze		r11,r11
8021cb0ef41Sopenharmony_ci	addc		r9,r7,r9
8031cb0ef41Sopenharmony_ci	adde		r10,r8,r10
8041cb0ef41Sopenharmony_ci	addze		r11,r11
8051cb0ef41Sopenharmony_ci	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
8061cb0ef41Sopenharmony_ci
8071cb0ef41Sopenharmony_ci						#sqr_add_c2(a,7,6,c2,c3,c1)
8081cb0ef41Sopenharmony_ci	$LD		r5,`6*$BNSZ`(r4)
8091cb0ef41Sopenharmony_ci	$UMULL		r7,r5,r6
8101cb0ef41Sopenharmony_ci	$UMULH		r8,r5,r6
8111cb0ef41Sopenharmony_ci	addc		r10,r7,r10
8121cb0ef41Sopenharmony_ci	adde		r11,r8,r11
8131cb0ef41Sopenharmony_ci	addze		r9,r0
8141cb0ef41Sopenharmony_ci	addc		r10,r7,r10
8151cb0ef41Sopenharmony_ci	adde		r11,r8,r11
8161cb0ef41Sopenharmony_ci	addze		r9,r9
8171cb0ef41Sopenharmony_ci	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
8181cb0ef41Sopenharmony_ci						#sqr_add_c(a,7,c3,c1,c2);
8191cb0ef41Sopenharmony_ci	$UMULL		r7,r6,r6
8201cb0ef41Sopenharmony_ci	$UMULH		r8,r6,r6
8211cb0ef41Sopenharmony_ci	addc		r11,r7,r11
8221cb0ef41Sopenharmony_ci	adde		r9,r8,r9
8231cb0ef41Sopenharmony_ci	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
8241cb0ef41Sopenharmony_ci	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
8251cb0ef41Sopenharmony_ci
8261cb0ef41Sopenharmony_ci
8271cb0ef41Sopenharmony_ci	blr
8281cb0ef41Sopenharmony_ci	.long	0
8291cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,2,0
8301cb0ef41Sopenharmony_ci	.long	0
8311cb0ef41Sopenharmony_ci.size	.bn_sqr_comba8,.-.bn_sqr_comba8
8321cb0ef41Sopenharmony_ci
8331cb0ef41Sopenharmony_ci#
8341cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
8351cb0ef41Sopenharmony_ci#		"bn_mul_comba4" i.e. remove the first dot
8361cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
8371cb0ef41Sopenharmony_ci#		done in the build
8381cb0ef41Sopenharmony_ci#
8391cb0ef41Sopenharmony_ci
8401cb0ef41Sopenharmony_ci.align	4
8411cb0ef41Sopenharmony_ci.bn_mul_comba4:
8421cb0ef41Sopenharmony_ci#
8431cb0ef41Sopenharmony_ci# This is an optimized, fully unrolled version of the bn_mul_comba4 routine.
# It computes r[0..7] = a[0..3] * b[0..3] one result word at a time: for
# each output index k, every partial product a[i]*b[j] with i+j == k is
# accumulated before r[k] is stored.
8441cb0ef41Sopenharmony_ci#
8451cb0ef41Sopenharmony_ci# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
8461cb0ef41Sopenharmony_ci# r3 contains r
8471cb0ef41Sopenharmony_ci# r4 contains a
8481cb0ef41Sopenharmony_ci# r5 contains b
8491cb0ef41Sopenharmony_ci# r6, r7 are the 2 BN_ULONGs being multiplied.
8501cb0ef41Sopenharmony_ci# r8, r9 are the low and high words of each double-width product.
8511cb0ef41Sopenharmony_ci# r10, r11, r12 are the equivalents of c1, c2, and c3.
# The three accumulators rotate roles: once the lowest one has been stored
# to r[k] it is re-initialised (from the carry) as the new highest one.
#
# Every mul_add_c(a[i],b[j],lo,mid,hi) step below is the same triple:
#	addc	lo  += low product word		(sets carry)
#	adde	mid += high product word + carry
#	addze	hi  += carry			(or hi = carry when the source is r0)
# The carry chain must stay unbroken, so these three always stay adjacent.
8521cb0ef41Sopenharmony_ci#
8531cb0ef41Sopenharmony_ci	xor	r0,r0,r0		#r0=0. Used in addze below.
8541cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[0],c1,c2,c3);
8551cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)
8561cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
8571cb0ef41Sopenharmony_ci	$UMULL	r10,r6,r7		# c1 = low word of a[0]*b[0], nothing to add yet
8581cb0ef41Sopenharmony_ci	$UMULH	r11,r6,r7		# c2 = high word of a[0]*b[0]
8591cb0ef41Sopenharmony_ci	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
8601cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[1],c2,c3,c1);
8611cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)	# r6 still holds a[0]
8621cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
8631cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
8641cb0ef41Sopenharmony_ci	addc	r11,r8,r11
8651cb0ef41Sopenharmony_ci	adde	r12,r9,r0		# first write of r12 (c3): high word + 0 + carry
8661cb0ef41Sopenharmony_ci	addze	r10,r0			# c1 restarts as just the carry (r10 was stored as r[0])
8671cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[0],c2,c3,c1);
8681cb0ef41Sopenharmony_ci	$LD	r6, `1*$BNSZ`(r4)
8691cb0ef41Sopenharmony_ci	$LD	r7, `0*$BNSZ`(r5)
8701cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
8711cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
8721cb0ef41Sopenharmony_ci	addc	r11,r8,r11
8731cb0ef41Sopenharmony_ci	adde	r12,r9,r12
8741cb0ef41Sopenharmony_ci	addze	r10,r10
8751cb0ef41Sopenharmony_ci	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
8761cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[0],c3,c1,c2);
8771cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)	# r7 still holds b[0]
8781cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
8791cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
8801cb0ef41Sopenharmony_ci	addc	r12,r8,r12
8811cb0ef41Sopenharmony_ci	adde	r10,r9,r10
8821cb0ef41Sopenharmony_ci	addze	r11,r0			# c2 restarts as just the carry (r11 was stored as r[1])
8831cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[1],c3,c1,c2);
8841cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
8851cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
8861cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
8871cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
8881cb0ef41Sopenharmony_ci	addc	r12,r8,r12
8891cb0ef41Sopenharmony_ci	adde	r10,r9,r10
8901cb0ef41Sopenharmony_ci	addze	r11,r11
8911cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[2],c3,c1,c2);
8921cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)
8931cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
8941cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
8951cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
8961cb0ef41Sopenharmony_ci	addc	r12,r8,r12
8971cb0ef41Sopenharmony_ci	adde	r10,r9,r10
8981cb0ef41Sopenharmony_ci	addze	r11,r11
8991cb0ef41Sopenharmony_ci	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
9001cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[3],c1,c2,c3);
9011cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)	# r6 still holds a[0]
9021cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9031cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9041cb0ef41Sopenharmony_ci	addc	r10,r8,r10
9051cb0ef41Sopenharmony_ci	adde	r11,r9,r11
9061cb0ef41Sopenharmony_ci	addze	r12,r0			# c3 restarts as just the carry (r12 was stored as r[2])
9071cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[2],c1,c2,c3);
9081cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
9091cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
9101cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9111cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9121cb0ef41Sopenharmony_ci	addc	r10,r8,r10
9131cb0ef41Sopenharmony_ci	adde	r11,r9,r11
9141cb0ef41Sopenharmony_ci	addze	r12,r12
9151cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[1],c1,c2,c3);
9161cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
9171cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
9181cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9191cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9201cb0ef41Sopenharmony_ci	addc	r10,r8,r10
9211cb0ef41Sopenharmony_ci	adde	r11,r9,r11
9221cb0ef41Sopenharmony_ci	addze	r12,r12
9231cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[0],c1,c2,c3);
9241cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
9251cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
9261cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9271cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9281cb0ef41Sopenharmony_ci	addc	r10,r8,r10
9291cb0ef41Sopenharmony_ci	adde	r11,r9,r11
9301cb0ef41Sopenharmony_ci	addze	r12,r12
9311cb0ef41Sopenharmony_ci	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
9321cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[1],c2,c3,c1);
9331cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)	# r6 still holds a[3]
9341cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9351cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9361cb0ef41Sopenharmony_ci	addc	r11,r8,r11
9371cb0ef41Sopenharmony_ci	adde	r12,r9,r12
9381cb0ef41Sopenharmony_ci	addze	r10,r0			# c1 restarts as just the carry (r10 was stored as r[3])
9391cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[2],c2,c3,c1);
9401cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
9411cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
9421cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9431cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9441cb0ef41Sopenharmony_ci	addc	r11,r8,r11
9451cb0ef41Sopenharmony_ci	adde	r12,r9,r12
9461cb0ef41Sopenharmony_ci	addze	r10,r10
9471cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[3],c2,c3,c1);
9481cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
9491cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
9501cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9511cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9521cb0ef41Sopenharmony_ci	addc	r11,r8,r11
9531cb0ef41Sopenharmony_ci	adde	r12,r9,r12
9541cb0ef41Sopenharmony_ci	addze	r10,r10
9551cb0ef41Sopenharmony_ci	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
9561cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[3],c3,c1,c2);
9571cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)	# r7 still holds b[3]
9581cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9591cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9601cb0ef41Sopenharmony_ci	addc	r12,r8,r12
9611cb0ef41Sopenharmony_ci	adde	r10,r9,r10
9621cb0ef41Sopenharmony_ci	addze	r11,r0			# c2 restarts as just the carry (r11 was stored as r[4])
9631cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[2],c3,c1,c2);
9641cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
9651cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
9661cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9671cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9681cb0ef41Sopenharmony_ci	addc	r12,r8,r12
9691cb0ef41Sopenharmony_ci	adde	r10,r9,r10
9701cb0ef41Sopenharmony_ci	addze	r11,r11
9711cb0ef41Sopenharmony_ci	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
9721cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[3],c1,c2,c3);
9731cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)	# r6 still holds a[3]
9741cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
9751cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
9761cb0ef41Sopenharmony_ci	addc	r10,r8,r10
9771cb0ef41Sopenharmony_ci	adde	r11,r9,r11		# no addze here: a 4x4-word product fits in 8 words
9781cb0ef41Sopenharmony_ci
9791cb0ef41Sopenharmony_ci	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
9801cb0ef41Sopenharmony_ci	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
9811cb0ef41Sopenharmony_ci	blr
# NOTE(review): the three data words below look like a traceback-table
# tag; the 3 presumably matches the three pointer arguments - confirm.
9821cb0ef41Sopenharmony_ci	.long	0
9831cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
9841cb0ef41Sopenharmony_ci	.long	0
9851cb0ef41Sopenharmony_ci.size	.bn_mul_comba4,.-.bn_mul_comba4
9861cb0ef41Sopenharmony_ci
9871cb0ef41Sopenharmony_ci#
9881cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
9891cb0ef41Sopenharmony_ci#		"bn_mul_comba8" i.e. remove the first dot
9901cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
9911cb0ef41Sopenharmony_ci#		done in the build
9921cb0ef41Sopenharmony_ci#
9931cb0ef41Sopenharmony_ci
9941cb0ef41Sopenharmony_ci.align	4
9951cb0ef41Sopenharmony_ci.bn_mul_comba8:
9961cb0ef41Sopenharmony_ci#
9971cb0ef41Sopenharmony_ci# Optimized version of the bn_mul_comba8 routine.
9981cb0ef41Sopenharmony_ci#
9991cb0ef41Sopenharmony_ci# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
10001cb0ef41Sopenharmony_ci# r3 contains r
10011cb0ef41Sopenharmony_ci# r4 contains a
10021cb0ef41Sopenharmony_ci# r5 contains b
10031cb0ef41Sopenharmony_ci# r6, r7 are the 2 BN_ULONGs being multiplied.
10041cb0ef41Sopenharmony_ci# r8, r9 are the results of the 32x32 giving 64 multiply.
10051cb0ef41Sopenharmony_ci# r10, r11, r12 are the equivalents of c1, c2, and c3.
10061cb0ef41Sopenharmony_ci#
10071cb0ef41Sopenharmony_ci	xor	r0,r0,r0		#r0=0. Used in addze below.
10081cb0ef41Sopenharmony_ci
10091cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[0],c1,c2,c3);
10101cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)	#a[0]
10111cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)	#b[0]
10121cb0ef41Sopenharmony_ci	$UMULL	r10,r6,r7
10131cb0ef41Sopenharmony_ci	$UMULH	r11,r6,r7
10141cb0ef41Sopenharmony_ci	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
10151cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[1],c2,c3,c1);
10161cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
10171cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10181cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10191cb0ef41Sopenharmony_ci	addc	r11,r11,r8
10201cb0ef41Sopenharmony_ci	addze	r12,r9			# since we didn't set r12 to zero before.
10211cb0ef41Sopenharmony_ci	addze	r10,r0
10221cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[0],c2,c3,c1);
10231cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
10241cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
10251cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10261cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10271cb0ef41Sopenharmony_ci	addc	r11,r11,r8
10281cb0ef41Sopenharmony_ci	adde	r12,r12,r9
10291cb0ef41Sopenharmony_ci	addze	r10,r10
10301cb0ef41Sopenharmony_ci	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
10311cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[0],c3,c1,c2);
10321cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
10331cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10341cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10351cb0ef41Sopenharmony_ci	addc	r12,r12,r8
10361cb0ef41Sopenharmony_ci	adde	r10,r10,r9
10371cb0ef41Sopenharmony_ci	addze	r11,r0
10381cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[1],c3,c1,c2);
10391cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
10401cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
10411cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10421cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10431cb0ef41Sopenharmony_ci	addc	r12,r12,r8
10441cb0ef41Sopenharmony_ci	adde	r10,r10,r9
10451cb0ef41Sopenharmony_ci	addze	r11,r11
10461cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[2],c3,c1,c2);
10471cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)
10481cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
10491cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10501cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10511cb0ef41Sopenharmony_ci	addc	r12,r12,r8
10521cb0ef41Sopenharmony_ci	adde	r10,r10,r9
10531cb0ef41Sopenharmony_ci	addze	r11,r11
10541cb0ef41Sopenharmony_ci	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
10551cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[3],c1,c2,c3);
10561cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
10571cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10581cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10591cb0ef41Sopenharmony_ci	addc	r10,r10,r8
10601cb0ef41Sopenharmony_ci	adde	r11,r11,r9
10611cb0ef41Sopenharmony_ci	addze	r12,r0
10621cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[2],c1,c2,c3);
10631cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
10641cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
10651cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10661cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10671cb0ef41Sopenharmony_ci	addc	r10,r10,r8
10681cb0ef41Sopenharmony_ci	adde	r11,r11,r9
10691cb0ef41Sopenharmony_ci	addze	r12,r12
10701cb0ef41Sopenharmony_ci
10711cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[1],c1,c2,c3);
10721cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
10731cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
10741cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10751cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10761cb0ef41Sopenharmony_ci	addc	r10,r10,r8
10771cb0ef41Sopenharmony_ci	adde	r11,r11,r9
10781cb0ef41Sopenharmony_ci	addze	r12,r12
10791cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[0],c1,c2,c3);
10801cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
10811cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
10821cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10831cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10841cb0ef41Sopenharmony_ci	addc	r10,r10,r8
10851cb0ef41Sopenharmony_ci	adde	r11,r11,r9
10861cb0ef41Sopenharmony_ci	addze	r12,r12
10871cb0ef41Sopenharmony_ci	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
10881cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[0],c2,c3,c1);
10891cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
10901cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10911cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
10921cb0ef41Sopenharmony_ci	addc	r11,r11,r8
10931cb0ef41Sopenharmony_ci	adde	r12,r12,r9
10941cb0ef41Sopenharmony_ci	addze	r10,r0
10951cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[1],c2,c3,c1);
10961cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
10971cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
10981cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
10991cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11001cb0ef41Sopenharmony_ci	addc	r11,r11,r8
11011cb0ef41Sopenharmony_ci	adde	r12,r12,r9
11021cb0ef41Sopenharmony_ci	addze	r10,r10
11031cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[2],c2,c3,c1);
11041cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
11051cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
11061cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11071cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11081cb0ef41Sopenharmony_ci	addc	r11,r11,r8
11091cb0ef41Sopenharmony_ci	adde	r12,r12,r9
11101cb0ef41Sopenharmony_ci	addze	r10,r10
11111cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[3],c2,c3,c1);
11121cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
11131cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
11141cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11151cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11161cb0ef41Sopenharmony_ci	addc	r11,r11,r8
11171cb0ef41Sopenharmony_ci	adde	r12,r12,r9
11181cb0ef41Sopenharmony_ci	addze	r10,r10
11191cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[4],c2,c3,c1);
11201cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)
11211cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
11221cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11231cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11241cb0ef41Sopenharmony_ci	addc	r11,r11,r8
11251cb0ef41Sopenharmony_ci	adde	r12,r12,r9
11261cb0ef41Sopenharmony_ci	addze	r10,r10
11271cb0ef41Sopenharmony_ci	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
11281cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[5],c3,c1,c2);
11291cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
11301cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11311cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11321cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11331cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11341cb0ef41Sopenharmony_ci	addze	r11,r0
11351cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[4],c3,c1,c2);
11361cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
11371cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
11381cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11391cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11401cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11411cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11421cb0ef41Sopenharmony_ci	addze	r11,r11
11431cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[3],c3,c1,c2);
11441cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
11451cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
11461cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11471cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11481cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11491cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11501cb0ef41Sopenharmony_ci	addze	r11,r11
11511cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[2],c3,c1,c2);
11521cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
11531cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
11541cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11551cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11561cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11571cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11581cb0ef41Sopenharmony_ci	addze	r11,r11
11591cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[1],c3,c1,c2);
11601cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
11611cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
11621cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11631cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11641cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11651cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11661cb0ef41Sopenharmony_ci	addze	r11,r11
11671cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[0],c3,c1,c2);
11681cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
11691cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
11701cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11711cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11721cb0ef41Sopenharmony_ci	addc	r12,r12,r8
11731cb0ef41Sopenharmony_ci	adde	r10,r10,r9
11741cb0ef41Sopenharmony_ci	addze	r11,r11
11751cb0ef41Sopenharmony_ci	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
11761cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[0],c1,c2,c3);
11771cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
11781cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11791cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11801cb0ef41Sopenharmony_ci	addc	r10,r10,r8
11811cb0ef41Sopenharmony_ci	adde	r11,r11,r9
11821cb0ef41Sopenharmony_ci	addze	r12,r0
11831cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[1],c1,c2,c3);
11841cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
11851cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
11861cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11871cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11881cb0ef41Sopenharmony_ci	addc	r10,r10,r8
11891cb0ef41Sopenharmony_ci	adde	r11,r11,r9
11901cb0ef41Sopenharmony_ci	addze	r12,r12
11911cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[2],c1,c2,c3);
11921cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
11931cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
11941cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
11951cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
11961cb0ef41Sopenharmony_ci	addc	r10,r10,r8
11971cb0ef41Sopenharmony_ci	adde	r11,r11,r9
11981cb0ef41Sopenharmony_ci	addze	r12,r12
11991cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[3],c1,c2,c3);
12001cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
12011cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
12021cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12031cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12041cb0ef41Sopenharmony_ci	addc	r10,r10,r8
12051cb0ef41Sopenharmony_ci	adde	r11,r11,r9
12061cb0ef41Sopenharmony_ci	addze	r12,r12
12071cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[4],c1,c2,c3);
12081cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
12091cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
12101cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12111cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12121cb0ef41Sopenharmony_ci	addc	r10,r10,r8
12131cb0ef41Sopenharmony_ci	adde	r11,r11,r9
12141cb0ef41Sopenharmony_ci	addze	r12,r12
12151cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[5],c1,c2,c3);
12161cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
12171cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
12181cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12191cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12201cb0ef41Sopenharmony_ci	addc	r10,r10,r8
12211cb0ef41Sopenharmony_ci	adde	r11,r11,r9
12221cb0ef41Sopenharmony_ci	addze	r12,r12
12231cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[6],c1,c2,c3);
12241cb0ef41Sopenharmony_ci	$LD	r6,`0*$BNSZ`(r4)
12251cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
12261cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12271cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12281cb0ef41Sopenharmony_ci	addc	r10,r10,r8
12291cb0ef41Sopenharmony_ci	adde	r11,r11,r9
12301cb0ef41Sopenharmony_ci	addze	r12,r12
12311cb0ef41Sopenharmony_ci	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
12321cb0ef41Sopenharmony_ci					#mul_add_c(a[0],b[7],c2,c3,c1);
12331cb0ef41Sopenharmony_ci	$LD	r7,`7*$BNSZ`(r5)
12341cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12351cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12361cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12371cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12381cb0ef41Sopenharmony_ci	addze	r10,r0
12391cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[6],c2,c3,c1);
12401cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
12411cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
12421cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12431cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12441cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12451cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12461cb0ef41Sopenharmony_ci	addze	r10,r10
12471cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[5],c2,c3,c1);
12481cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
12491cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
12501cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12511cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12521cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12531cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12541cb0ef41Sopenharmony_ci	addze	r10,r10
12551cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[4],c2,c3,c1);
12561cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
12571cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
12581cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12591cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12601cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12611cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12621cb0ef41Sopenharmony_ci	addze	r10,r10
12631cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[3],c2,c3,c1);
12641cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
12651cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
12661cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12671cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12681cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12691cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12701cb0ef41Sopenharmony_ci	addze	r10,r10
12711cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[2],c2,c3,c1);
12721cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
12731cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
12741cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12751cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12761cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12771cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12781cb0ef41Sopenharmony_ci	addze	r10,r10
12791cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[1],c2,c3,c1);
12801cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
12811cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
12821cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12831cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12841cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12851cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12861cb0ef41Sopenharmony_ci	addze	r10,r10
12871cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[0],c2,c3,c1);
12881cb0ef41Sopenharmony_ci	$LD	r6,`7*$BNSZ`(r4)
12891cb0ef41Sopenharmony_ci	$LD	r7,`0*$BNSZ`(r5)
12901cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12911cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
12921cb0ef41Sopenharmony_ci	addc	r11,r11,r8
12931cb0ef41Sopenharmony_ci	adde	r12,r12,r9
12941cb0ef41Sopenharmony_ci	addze	r10,r10
12951cb0ef41Sopenharmony_ci	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
12961cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[1],c3,c1,c2);
12971cb0ef41Sopenharmony_ci	$LD	r7,`1*$BNSZ`(r5)
12981cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
12991cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13001cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13011cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13021cb0ef41Sopenharmony_ci	addze	r11,r0
13031cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[2],c3,c1,c2);
13041cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
13051cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
13061cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13071cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13081cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13091cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13101cb0ef41Sopenharmony_ci	addze	r11,r11
13111cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[3],c3,c1,c2);
13121cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
13131cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
13141cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13151cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13161cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13171cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13181cb0ef41Sopenharmony_ci	addze	r11,r11
13191cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[4],c3,c1,c2);
13201cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
13211cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
13221cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13231cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13241cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13251cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13261cb0ef41Sopenharmony_ci	addze	r11,r11
13271cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[5],c3,c1,c2);
13281cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
13291cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
13301cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13311cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13321cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13331cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13341cb0ef41Sopenharmony_ci	addze	r11,r11
13351cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[6],c3,c1,c2);
13361cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
13371cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
13381cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13391cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13401cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13411cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13421cb0ef41Sopenharmony_ci	addze	r11,r11
13431cb0ef41Sopenharmony_ci					#mul_add_c(a[1],b[7],c3,c1,c2);
13441cb0ef41Sopenharmony_ci	$LD	r6,`1*$BNSZ`(r4)
13451cb0ef41Sopenharmony_ci	$LD	r7,`7*$BNSZ`(r5)
13461cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13471cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13481cb0ef41Sopenharmony_ci	addc	r12,r12,r8
13491cb0ef41Sopenharmony_ci	adde	r10,r10,r9
13501cb0ef41Sopenharmony_ci	addze	r11,r11
13511cb0ef41Sopenharmony_ci	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
13521cb0ef41Sopenharmony_ci					#mul_add_c(a[2],b[7],c1,c2,c3);
13531cb0ef41Sopenharmony_ci	$LD	r6,`2*$BNSZ`(r4)
13541cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13551cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13561cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13571cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13581cb0ef41Sopenharmony_ci	addze	r12,r0
13591cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[6],c1,c2,c3);
13601cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
13611cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
13621cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13631cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13641cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13651cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13661cb0ef41Sopenharmony_ci	addze	r12,r12
13671cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[5],c1,c2,c3);
13681cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
13691cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
13701cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13711cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13721cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13731cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13741cb0ef41Sopenharmony_ci	addze	r12,r12
13751cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[4],c1,c2,c3);
13761cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
13771cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
13781cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13791cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13801cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13811cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13821cb0ef41Sopenharmony_ci	addze	r12,r12
13831cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[3],c1,c2,c3);
13841cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
13851cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
13861cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13871cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13881cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13891cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13901cb0ef41Sopenharmony_ci	addze	r12,r12
13911cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[2],c1,c2,c3);
13921cb0ef41Sopenharmony_ci	$LD	r6,`7*$BNSZ`(r4)
13931cb0ef41Sopenharmony_ci	$LD	r7,`2*$BNSZ`(r5)
13941cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
13951cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
13961cb0ef41Sopenharmony_ci	addc	r10,r10,r8
13971cb0ef41Sopenharmony_ci	adde	r11,r11,r9
13981cb0ef41Sopenharmony_ci	addze	r12,r12
13991cb0ef41Sopenharmony_ci	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
14001cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[3],c2,c3,c1);
14011cb0ef41Sopenharmony_ci	$LD	r7,`3*$BNSZ`(r5)
14021cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14031cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14041cb0ef41Sopenharmony_ci	addc	r11,r11,r8
14051cb0ef41Sopenharmony_ci	adde	r12,r12,r9
14061cb0ef41Sopenharmony_ci	addze	r10,r0
14071cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[4],c2,c3,c1);
14081cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
14091cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
14101cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14111cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14121cb0ef41Sopenharmony_ci	addc	r11,r11,r8
14131cb0ef41Sopenharmony_ci	adde	r12,r12,r9
14141cb0ef41Sopenharmony_ci	addze	r10,r10
14151cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[5],c2,c3,c1);
14161cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
14171cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
14181cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14191cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14201cb0ef41Sopenharmony_ci	addc	r11,r11,r8
14211cb0ef41Sopenharmony_ci	adde	r12,r12,r9
14221cb0ef41Sopenharmony_ci	addze	r10,r10
14231cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[6],c2,c3,c1);
14241cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
14251cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
14261cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14271cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14281cb0ef41Sopenharmony_ci	addc	r11,r11,r8
14291cb0ef41Sopenharmony_ci	adde	r12,r12,r9
14301cb0ef41Sopenharmony_ci	addze	r10,r10
14311cb0ef41Sopenharmony_ci					#mul_add_c(a[3],b[7],c2,c3,c1);
14321cb0ef41Sopenharmony_ci	$LD	r6,`3*$BNSZ`(r4)
14331cb0ef41Sopenharmony_ci	$LD	r7,`7*$BNSZ`(r5)
14341cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14351cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14361cb0ef41Sopenharmony_ci	addc	r11,r11,r8
14371cb0ef41Sopenharmony_ci	adde	r12,r12,r9
14381cb0ef41Sopenharmony_ci	addze	r10,r10
14391cb0ef41Sopenharmony_ci	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
14401cb0ef41Sopenharmony_ci					#mul_add_c(a[4],b[7],c3,c1,c2);
14411cb0ef41Sopenharmony_ci	$LD	r6,`4*$BNSZ`(r4)
14421cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14431cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14441cb0ef41Sopenharmony_ci	addc	r12,r12,r8
14451cb0ef41Sopenharmony_ci	adde	r10,r10,r9
14461cb0ef41Sopenharmony_ci	addze	r11,r0
14471cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[6],c3,c1,c2);
14481cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
14491cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
14501cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14511cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14521cb0ef41Sopenharmony_ci	addc	r12,r12,r8
14531cb0ef41Sopenharmony_ci	adde	r10,r10,r9
14541cb0ef41Sopenharmony_ci	addze	r11,r11
14551cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[5],c3,c1,c2);
14561cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
14571cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
14581cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14591cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14601cb0ef41Sopenharmony_ci	addc	r12,r12,r8
14611cb0ef41Sopenharmony_ci	adde	r10,r10,r9
14621cb0ef41Sopenharmony_ci	addze	r11,r11
14631cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[4],c3,c1,c2);
14641cb0ef41Sopenharmony_ci	$LD	r6,`7*$BNSZ`(r4)
14651cb0ef41Sopenharmony_ci	$LD	r7,`4*$BNSZ`(r5)
14661cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14671cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14681cb0ef41Sopenharmony_ci	addc	r12,r12,r8
14691cb0ef41Sopenharmony_ci	adde	r10,r10,r9
14701cb0ef41Sopenharmony_ci	addze	r11,r11
14711cb0ef41Sopenharmony_ci	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
14721cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[5],c1,c2,c3);
14731cb0ef41Sopenharmony_ci	$LD	r7,`5*$BNSZ`(r5)
14741cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14751cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14761cb0ef41Sopenharmony_ci	addc	r10,r10,r8
14771cb0ef41Sopenharmony_ci	adde	r11,r11,r9
14781cb0ef41Sopenharmony_ci	addze	r12,r0
14791cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[6],c1,c2,c3);
14801cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
14811cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
14821cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14831cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14841cb0ef41Sopenharmony_ci	addc	r10,r10,r8
14851cb0ef41Sopenharmony_ci	adde	r11,r11,r9
14861cb0ef41Sopenharmony_ci	addze	r12,r12
14871cb0ef41Sopenharmony_ci					#mul_add_c(a[5],b[7],c1,c2,c3);
14881cb0ef41Sopenharmony_ci	$LD	r6,`5*$BNSZ`(r4)
14891cb0ef41Sopenharmony_ci	$LD	r7,`7*$BNSZ`(r5)
14901cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14911cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
14921cb0ef41Sopenharmony_ci	addc	r10,r10,r8
14931cb0ef41Sopenharmony_ci	adde	r11,r11,r9
14941cb0ef41Sopenharmony_ci	addze	r12,r12
14951cb0ef41Sopenharmony_ci	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
14961cb0ef41Sopenharmony_ci					#mul_add_c(a[6],b[7],c2,c3,c1);
14971cb0ef41Sopenharmony_ci	$LD	r6,`6*$BNSZ`(r4)
14981cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
14991cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
15001cb0ef41Sopenharmony_ci	addc	r11,r11,r8
15011cb0ef41Sopenharmony_ci	adde	r12,r12,r9
15021cb0ef41Sopenharmony_ci	addze	r10,r0
15031cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[6],c2,c3,c1);
15041cb0ef41Sopenharmony_ci	$LD	r6,`7*$BNSZ`(r4)
15051cb0ef41Sopenharmony_ci	$LD	r7,`6*$BNSZ`(r5)
15061cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
15071cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
15081cb0ef41Sopenharmony_ci	addc	r11,r11,r8
15091cb0ef41Sopenharmony_ci	adde	r12,r12,r9
15101cb0ef41Sopenharmony_ci	addze	r10,r10
15111cb0ef41Sopenharmony_ci	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
15121cb0ef41Sopenharmony_ci					#mul_add_c(a[7],b[7],c3,c1,c2);
15131cb0ef41Sopenharmony_ci	$LD	r7,`7*$BNSZ`(r5)
15141cb0ef41Sopenharmony_ci	$UMULL	r8,r6,r7
15151cb0ef41Sopenharmony_ci	$UMULH	r9,r6,r7
15161cb0ef41Sopenharmony_ci	addc	r12,r12,r8
15171cb0ef41Sopenharmony_ci	adde	r10,r10,r9
15181cb0ef41Sopenharmony_ci	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
15191cb0ef41Sopenharmony_ci	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
15201cb0ef41Sopenharmony_ci	blr
15211cb0ef41Sopenharmony_ci	.long	0
15221cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
15231cb0ef41Sopenharmony_ci	.long	0
15241cb0ef41Sopenharmony_ci.size	.bn_mul_comba8,.-.bn_mul_comba8
15251cb0ef41Sopenharmony_ci
15261cb0ef41Sopenharmony_ci#
15271cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
15281cb0ef41Sopenharmony_ci#		"bn_sub_words" i.e. remove the first dot
15291cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
15301cb0ef41Sopenharmony_ci#		done in the build
15311cb0ef41Sopenharmony_ci#
15321cb0ef41Sopenharmony_ci#
15331cb0ef41Sopenharmony_ci.align	4
15341cb0ef41Sopenharmony_ci.bn_sub_words:
15351cb0ef41Sopenharmony_ci#
15361cb0ef41Sopenharmony_ci#	Handcoded version of bn_sub_words
15371cb0ef41Sopenharmony_ci#	Computes r[i] = a[i] - b[i] word by word with borrow propagation and returns the final borrow (1 or 0) in r3.
15381cb0ef41Sopenharmony_ci#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
15391cb0ef41Sopenharmony_ci#
15401cb0ef41Sopenharmony_ci#	r3 = r
15411cb0ef41Sopenharmony_ci#	r4 = a
15421cb0ef41Sopenharmony_ci#	r5 = b
15431cb0ef41Sopenharmony_ci#	r6 = n
15441cb0ef41Sopenharmony_ci#	Scratch: r0 = 0, r7 = a[i], r8 = b[i], r6 is reused for the difference once n is in CTR.
15451cb0ef41Sopenharmony_ci#       Note:	No loop unrolling done since this is not a performance
15461cb0ef41Sopenharmony_ci#               critical loop.
15471cb0ef41Sopenharmony_ci
15481cb0ef41Sopenharmony_ci	xor	r0,r0,r0	#set r0 = 0
15491cb0ef41Sopenharmony_ci#
15501cb0ef41Sopenharmony_ci#	check for r6 = 0 AND set carry bit.
15511cb0ef41Sopenharmony_ci#
15521cb0ef41Sopenharmony_ci	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
15531cb0ef41Sopenharmony_ci				# if r6 > 0 then result !=0
15541cb0ef41Sopenharmony_ci				# In either case carry bit is set.
15551cb0ef41Sopenharmony_ci	beq	Lppcasm_sub_adios	# n == 0: nothing to do, carry set so 0 is returned
15561cb0ef41Sopenharmony_ci	addi	r4,r4,-$BNSZ		# back a up by one word; the LDU/STU accesses below
15571cb0ef41Sopenharmony_ci	addi	r3,r3,-$BNSZ		# presumably step the pointer before each access
15581cb0ef41Sopenharmony_ci	addi	r5,r5,-$BNSZ		# (update form) -- see the macro definitions
15591cb0ef41Sopenharmony_ci	mtctr	r6			# CTR = n, the loop count
15601cb0ef41Sopenharmony_ciLppcasm_sub_mainloop:
15611cb0ef41Sopenharmony_ci	$LDU	r7,$BNSZ(r4)		# r7 = next word of a
15621cb0ef41Sopenharmony_ci	$LDU	r8,$BNSZ(r5)		# r8 = next word of b
15631cb0ef41Sopenharmony_ci	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
15641cb0ef41Sopenharmony_ci				# if carry = 1 this is r7-r8. Else it
15651cb0ef41Sopenharmony_ci				# is r7-r8 -1 as we need.
15661cb0ef41Sopenharmony_ci	$STU	r6,$BNSZ(r3)		# store the difference into the next word of r
15671cb0ef41Sopenharmony_ci	bdnz	Lppcasm_sub_mainloop	# decrement CTR, loop while it is non-zero
15681cb0ef41Sopenharmony_ciLppcasm_sub_adios:
15691cb0ef41Sopenharmony_ci	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
15701cb0ef41Sopenharmony_ci	andi.	r3,r3,1         # keep only last bit.
15711cb0ef41Sopenharmony_ci	blr				# return the borrow (0 or 1) in r3
15721cb0ef41Sopenharmony_ci	.long	0			# NOTE(review): these three directives look like an
15731cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,4,0	# AIX-style traceback table; the 4 appears to match
15741cb0ef41Sopenharmony_ci	.long	0			# the four arguments -- confirm
15751cb0ef41Sopenharmony_ci.size	.bn_sub_words,.-.bn_sub_words
15761cb0ef41Sopenharmony_ci
15771cb0ef41Sopenharmony_ci#
15781cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
15791cb0ef41Sopenharmony_ci#		"bn_add_words" i.e. remove the first dot
15801cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
15811cb0ef41Sopenharmony_ci#		done in the build
15821cb0ef41Sopenharmony_ci#
15831cb0ef41Sopenharmony_ci
15841cb0ef41Sopenharmony_ci.align	4
15851cb0ef41Sopenharmony_ci.bn_add_words:
15861cb0ef41Sopenharmony_ci#
15871cb0ef41Sopenharmony_ci#	Handcoded version of bn_add_words
15881cb0ef41Sopenharmony_ci#	Computes r[i] = a[i] + b[i] word by word with carry propagation and returns the final carry (0 or 1) in r3.
15891cb0ef41Sopenharmony_ci#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
15901cb0ef41Sopenharmony_ci#
15911cb0ef41Sopenharmony_ci#	r3 = r
15921cb0ef41Sopenharmony_ci#	r4 = a
15931cb0ef41Sopenharmony_ci#	r5 = b
15941cb0ef41Sopenharmony_ci#	r6 = n
15951cb0ef41Sopenharmony_ci#	Scratch: r0 = 0, r7 = a[i], r8 = b[i] and then the sum.
15961cb0ef41Sopenharmony_ci#       Note:	No loop unrolling done since this is not a performance
15971cb0ef41Sopenharmony_ci#               critical loop.
15981cb0ef41Sopenharmony_ci
15991cb0ef41Sopenharmony_ci	xor	r0,r0,r0		# r0 = 0, used by the final addze
16001cb0ef41Sopenharmony_ci#
16011cb0ef41Sopenharmony_ci#	check for r6 = 0. Needed: with CTR = 0 the bdnz below would wrap the counter instead of falling through after zero iterations.
16021cb0ef41Sopenharmony_ci#
16031cb0ef41Sopenharmony_ci	addic.	r6,r6,0		#test r6 and clear carry bit.
16041cb0ef41Sopenharmony_ci	beq	Lppcasm_add_adios	# n == 0: nothing to do, carry clear so 0 is returned
16051cb0ef41Sopenharmony_ci	addi	r4,r4,-$BNSZ		# back a up by one word; the LDU/STU accesses below
16061cb0ef41Sopenharmony_ci	addi	r3,r3,-$BNSZ		# presumably step the pointer before each access
16071cb0ef41Sopenharmony_ci	addi	r5,r5,-$BNSZ		# (update form) -- see the macro definitions
16081cb0ef41Sopenharmony_ci	mtctr	r6			# CTR = n, the loop count
16091cb0ef41Sopenharmony_ciLppcasm_add_mainloop:
16101cb0ef41Sopenharmony_ci	$LDU	r7,$BNSZ(r4)		# r7 = next word of a
16111cb0ef41Sopenharmony_ci	$LDU	r8,$BNSZ(r5)		# r8 = next word of b
16121cb0ef41Sopenharmony_ci	adde	r8,r7,r8		# r8 = r7 + r8 + carry bit; carry out feeds the next word
16131cb0ef41Sopenharmony_ci	$STU	r8,$BNSZ(r3)		# store the sum into the next word of r
16141cb0ef41Sopenharmony_ci	bdnz	Lppcasm_add_mainloop	# decrement CTR, loop while it is non-zero
16151cb0ef41Sopenharmony_ciLppcasm_add_adios:
16161cb0ef41Sopenharmony_ci	addze	r3,r0			#return carry bit.
16171cb0ef41Sopenharmony_ci	blr
16181cb0ef41Sopenharmony_ci	.long	0			# NOTE(review): these three directives look like an
16191cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,4,0	# AIX-style traceback table; the 4 appears to match
16201cb0ef41Sopenharmony_ci	.long	0			# the four arguments -- confirm
16211cb0ef41Sopenharmony_ci.size	.bn_add_words,.-.bn_add_words
16221cb0ef41Sopenharmony_ci
16231cb0ef41Sopenharmony_ci#
16241cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
16251cb0ef41Sopenharmony_ci#		"bn_div_words" i.e. remove the first dot
16261cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
16271cb0ef41Sopenharmony_ci#		done in the build
16281cb0ef41Sopenharmony_ci#
16291cb0ef41Sopenharmony_ci
16301cb0ef41Sopenharmony_ci.align	4
16311cb0ef41Sopenharmony_ci.bn_div_words:
16321cb0ef41Sopenharmony_ci#	Divides the double word (h:l) by d, building the quotient from two half-word digits; returns it in r3, or -1 when d == 0.
16331cb0ef41Sopenharmony_ci#	This is a cleaned up version of code generated by
16341cb0ef41Sopenharmony_ci#	the AIX compiler. The only optimization is to use
16351cb0ef41Sopenharmony_ci#	the PPC instruction to count leading zeros instead
16361cb0ef41Sopenharmony_ci#	of call to num_bits_word. Since this was compiled
16371cb0ef41Sopenharmony_ci#	only at level -O2 we can possibly squeeze it more?
16381cb0ef41Sopenharmony_ci#	Presumably BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) -- prototype not shown here, confirm.
16391cb0ef41Sopenharmony_ci#	r3 = h
16401cb0ef41Sopenharmony_ci#	r4 = l
16411cb0ef41Sopenharmony_ci#	r5 = d
16421cb0ef41Sopenharmony_ci
16431cb0ef41Sopenharmony_ci	$UCMPI	0,r5,0			# compare r5 and 0
16441cb0ef41Sopenharmony_ci	bne	Lppcasm_div1		# proceed if d!=0
16451cb0ef41Sopenharmony_ci	li	r3,-1			# d=0 return -1
16461cb0ef41Sopenharmony_ci	blr
16471cb0ef41Sopenharmony_ciLppcasm_div1:
16481cb0ef41Sopenharmony_ci	xor	r0,r0,r0		#r0=0
16491cb0ef41Sopenharmony_ci	li	r8,$BITS		#r8 = BITS; kept as-is when d has no leading zeros
16501cb0ef41Sopenharmony_ci	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
16511cb0ef41Sopenharmony_ci	beq	Lppcasm_div2		#proceed if no leading zeros
16521cb0ef41Sopenharmony_ci	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
16531cb0ef41Sopenharmony_ci	$SHR.	r9,r3,r8		#are there any bits above r8'th?
16541cb0ef41Sopenharmony_ci	$TR	16,r9,r0		#if there're, signal to dump core... NOTE(review): TO=16 is the signed less-than condition and r9 comes from a right shift of at least one bit, so if SHR is a logical shift this trap can never fire -- confirm intent
16551cb0ef41Sopenharmony_ciLppcasm_div2:
16561cb0ef41Sopenharmony_ci	$UCMP	0,r3,r5			#h>=d?
16571cb0ef41Sopenharmony_ci	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
16581cb0ef41Sopenharmony_ci	subf	r3,r5,r3		#h-=d ;
16591cb0ef41Sopenharmony_ciLppcasm_div3:				#r7 = leading zeros of d = BN_BITS2 - BN_num_bits_word(d), i.e. the normalization shift i
16601cb0ef41Sopenharmony_ci	cmpi	0,0,r7,0		# is (i == 0)?
16611cb0ef41Sopenharmony_ci	beq	Lppcasm_div4		# already normalized, skip the shifts
16621cb0ef41Sopenharmony_ci	$SHL	r3,r3,r7		# h = (h<< i)
16631cb0ef41Sopenharmony_ci	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
16641cb0ef41Sopenharmony_ci	$SHL	r5,r5,r7		# d<<=i
16651cb0ef41Sopenharmony_ci	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
16661cb0ef41Sopenharmony_ci	$SHL	r4,r4,r7		# l <<=i
16671cb0ef41Sopenharmony_ciLppcasm_div4:
16681cb0ef41Sopenharmony_ci	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
16691cb0ef41Sopenharmony_ci					# dl will be computed when needed
16701cb0ef41Sopenharmony_ci					# as it saves registers.
16711cb0ef41Sopenharmony_ci	li	r6,2			#r6=2
16721cb0ef41Sopenharmony_ci	mtctr	r6			#CTR = 2: one outer iteration per half-word quotient digit
16731cb0ef41Sopenharmony_ciLppcasm_divouterloop:
16741cb0ef41Sopenharmony_ci	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
16751cb0ef41Sopenharmony_ci	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
16761cb0ef41Sopenharmony_ci					# compute here for innerloop.
16771cb0ef41Sopenharmony_ci	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
16781cb0ef41Sopenharmony_ci	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
16791cb0ef41Sopenharmony_ci
16801cb0ef41Sopenharmony_ci	li	r8,-1			# r8 = all ones, trimmed to the low half below
16811cb0ef41Sopenharmony_ci	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
16821cb0ef41Sopenharmony_ci	b	Lppcasm_div6
16831cb0ef41Sopenharmony_ciLppcasm_div5:
16841cb0ef41Sopenharmony_ci	$UDIV	r8,r3,r9		#q = h/dh
16851cb0ef41Sopenharmony_ciLppcasm_div6:
16861cb0ef41Sopenharmony_ci	$UMULL	r12,r9,r8		#th = q*dh
16871cb0ef41Sopenharmony_ci	$CLRU	r10,r5,`$BITS/2`	#r10=dl
16881cb0ef41Sopenharmony_ci	$UMULL	r6,r8,r10		#tl = q*dl
16891cb0ef41Sopenharmony_ci
16901cb0ef41Sopenharmony_ciLppcasm_divinnerloop:			# correct the estimate q downwards until it fits
16911cb0ef41Sopenharmony_ci	subf	r10,r12,r3		#t = h -th
16921cb0ef41Sopenharmony_ci	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
16931cb0ef41Sopenharmony_ci	addic.	r7,r7,0			#test if r7 == 0. used below.
16941cb0ef41Sopenharmony_ci					# now want to compute
16951cb0ef41Sopenharmony_ci					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
16961cb0ef41Sopenharmony_ci					# the following 2 instructions do that
16971cb0ef41Sopenharmony_ci	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
16981cb0ef41Sopenharmony_ci	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
16991cb0ef41Sopenharmony_ci	$UCMP	cr1,r6,r7		# compare (tl <= r7)
17001cb0ef41Sopenharmony_ci	bne	Lppcasm_divinnerexit	# exit if t has bits above the low half (CR0 set by addic. above)
17011cb0ef41Sopenharmony_ci	ble	cr1,Lppcasm_divinnerexit	# exit if tl <= r7
17021cb0ef41Sopenharmony_ci	addi	r8,r8,-1		#q--
17031cb0ef41Sopenharmony_ci	subf	r12,r9,r12		#th -=dh
17041cb0ef41Sopenharmony_ci	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
17051cb0ef41Sopenharmony_ci	subf	r6,r10,r6		#tl -=dl
17061cb0ef41Sopenharmony_ci	b	Lppcasm_divinnerloop
17071cb0ef41Sopenharmony_ciLppcasm_divinnerexit:
17081cb0ef41Sopenharmony_ci	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
17091cb0ef41Sopenharmony_ci	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
17101cb0ef41Sopenharmony_ci	$UCMP	cr1,r4,r11		# compare l and tl
17111cb0ef41Sopenharmony_ci	add	r12,r12,r10		# th+=t
17121cb0ef41Sopenharmony_ci	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
17131cb0ef41Sopenharmony_ci	addi	r12,r12,1		# th++
17141cb0ef41Sopenharmony_ciLppcasm_div7:
17151cb0ef41Sopenharmony_ci	subf	r11,r11,r4		#r11=l-tl
17161cb0ef41Sopenharmony_ci	$UCMP	cr1,r3,r12		#compare h and th
17171cb0ef41Sopenharmony_ci	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
17181cb0ef41Sopenharmony_ci	addi	r8,r8,-1		# q--
17191cb0ef41Sopenharmony_ci	add	r3,r5,r3		# h+=d
17201cb0ef41Sopenharmony_ciLppcasm_div8:
17211cb0ef41Sopenharmony_ci	subf	r12,r12,r3		#r12 = h-th
17221cb0ef41Sopenharmony_ci	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
17231cb0ef41Sopenharmony_ci					# want to compute
17241cb0ef41Sopenharmony_ci					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
17251cb0ef41Sopenharmony_ci					# the following 2 instructions will do this.
17261cb0ef41Sopenharmony_ci	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
17271cb0ef41Sopenharmony_ci	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
17281cb0ef41Sopenharmony_ci	bdz	Lppcasm_div9		#if (count==0) break ;
17291cb0ef41Sopenharmony_ci	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
17301cb0ef41Sopenharmony_ci	b	Lppcasm_divouterloop
17311cb0ef41Sopenharmony_ciLppcasm_div9:
17321cb0ef41Sopenharmony_ci	or	r3,r8,r0		# return (first q << BN_BITS4) | second q
17331cb0ef41Sopenharmony_ci	blr
17341cb0ef41Sopenharmony_ci	.long	0			# NOTE(review): these three directives look like an
17351cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0	# AIX-style traceback table; the 3 appears to match
17361cb0ef41Sopenharmony_ci	.long	0			# the three arguments -- confirm
17371cb0ef41Sopenharmony_ci.size	.bn_div_words,.-.bn_div_words
17381cb0ef41Sopenharmony_ci
17391cb0ef41Sopenharmony_ci#
17401cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
17411cb0ef41Sopenharmony_ci#		"bn_sqr_words" i.e. remove the first dot
17421cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
17431cb0ef41Sopenharmony_ci#		done in the build
17441cb0ef41Sopenharmony_ci#
17451cb0ef41Sopenharmony_ci.align	4
17461cb0ef41Sopenharmony_ci.bn_sqr_words:
17471cb0ef41Sopenharmony_ci#
17481cb0ef41Sopenharmony_ci#	Optimized version of bn_sqr_words
17491cb0ef41Sopenharmony_ci#	For each of the n input words a[i], stores the two words of a[i]*a[i] at r[2*i] and r[2*i+1]; nothing is returned.
17501cb0ef41Sopenharmony_ci#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
17511cb0ef41Sopenharmony_ci#
17521cb0ef41Sopenharmony_ci#	r3 = r
17531cb0ef41Sopenharmony_ci#	r4 = a
17541cb0ef41Sopenharmony_ci#	r5 = n
17551cb0ef41Sopenharmony_ci#
17561cb0ef41Sopenharmony_ci#	r6 = a[i].
17571cb0ef41Sopenharmony_ci#	r7,r8 = product (r7 from UMULL is stored first, r8 from UMULH second; presumably the low and high halves).
17581cb0ef41Sopenharmony_ci#
17591cb0ef41Sopenharmony_ci#	No unrolling done here. Not performance critical.
17601cb0ef41Sopenharmony_ci
17611cb0ef41Sopenharmony_ci	addic.	r5,r5,0			#test r5.
17621cb0ef41Sopenharmony_ci	beq	Lppcasm_sqr_adios	# n == 0: nothing to do
17631cb0ef41Sopenharmony_ci	addi	r4,r4,-$BNSZ		# back a and r up by one word; the LDU/STU accesses
17641cb0ef41Sopenharmony_ci	addi	r3,r3,-$BNSZ		# below presumably step the pointer before each access
17651cb0ef41Sopenharmony_ci	mtctr	r5			# CTR = n, the loop count
17661cb0ef41Sopenharmony_ciLppcasm_sqr_mainloop:
17671cb0ef41Sopenharmony_ci					#sqr(r[0],r[1],a[0]);
17681cb0ef41Sopenharmony_ci	$LDU	r6,$BNSZ(r4)		# r6 = next word of a
17691cb0ef41Sopenharmony_ci	$UMULL	r7,r6,r6		# r7 = first product word
17701cb0ef41Sopenharmony_ci	$UMULH  r8,r6,r6		# r8 = second product word
17711cb0ef41Sopenharmony_ci	$STU	r7,$BNSZ(r3)		# two stores per input word, so r advances
17721cb0ef41Sopenharmony_ci	$STU	r8,$BNSZ(r3)		# twice as fast as a
17731cb0ef41Sopenharmony_ci	bdnz	Lppcasm_sqr_mainloop	# decrement CTR, loop while it is non-zero
17741cb0ef41Sopenharmony_ciLppcasm_sqr_adios:
17751cb0ef41Sopenharmony_ci	blr
17761cb0ef41Sopenharmony_ci	.long	0			# NOTE(review): these three directives look like an
17771cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0	# AIX-style traceback table; the 3 appears to match
17781cb0ef41Sopenharmony_ci	.long	0			# the three arguments -- confirm
17791cb0ef41Sopenharmony_ci.size	.bn_sqr_words,.-.bn_sqr_words
17801cb0ef41Sopenharmony_ci
17811cb0ef41Sopenharmony_ci#
17821cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
17831cb0ef41Sopenharmony_ci#		"bn_mul_words" i.e. remove the first dot
17841cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
17851cb0ef41Sopenharmony_ci#		done in the build
17861cb0ef41Sopenharmony_ci#
17871cb0ef41Sopenharmony_ci
17881cb0ef41Sopenharmony_ci.align	4
17891cb0ef41Sopenharmony_ci.bn_mul_words:
17901cb0ef41Sopenharmony_ci#	Multiplies the num words at ap by w, stores one result word per input word at rp, and returns the final carry word in r3.
17911cb0ef41Sopenharmony_ci# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
17921cb0ef41Sopenharmony_ci#	The main loop handles four words per iteration; the remaining num & 3 words are handled one at a time afterwards.
17931cb0ef41Sopenharmony_ci# r3 = rp
17941cb0ef41Sopenharmony_ci# r4 = ap
17951cb0ef41Sopenharmony_ci# r5 = num
17961cb0ef41Sopenharmony_ci# r6 = w
17971cb0ef41Sopenharmony_ci	xor	r0,r0,r0		# r0 = 0 (not read again in this routine)
17981cb0ef41Sopenharmony_ci	xor	r12,r12,r12		# used for carry
17991cb0ef41Sopenharmony_ci	rlwinm.	r7,r5,30,2,31		# num >> 2
18001cb0ef41Sopenharmony_ci	beq	Lppcasm_mw_REM		# fewer than four words: only the leftover path runs
18011cb0ef41Sopenharmony_ci	mtctr	r7			# CTR = number of four-word iterations
18021cb0ef41Sopenharmony_ciLppcasm_mw_LOOP:
18031cb0ef41Sopenharmony_ci					#mul(rp[0],ap[0],w,c1);
18041cb0ef41Sopenharmony_ci	$LD	r8,`0*$BNSZ`(r4)
18051cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
18061cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
18071cb0ef41Sopenharmony_ci	addc	r9,r9,r12		# add the carry word from the previous spin; sets the carry bit
18081cb0ef41Sopenharmony_ci	#addze	r10,r10			#carry is NOT ignored.
18091cb0ef41Sopenharmony_ci					#will be taken care of
18101cb0ef41Sopenharmony_ci					#in second spin below
18111cb0ef41Sopenharmony_ci					#using adde.
18121cb0ef41Sopenharmony_ci	$ST	r9,`0*$BNSZ`(r3)
18131cb0ef41Sopenharmony_ci					#mul(rp[1],ap[1],w,c1);
18141cb0ef41Sopenharmony_ci	$LD	r8,`1*$BNSZ`(r4)
18151cb0ef41Sopenharmony_ci	$UMULL	r11,r6,r8
18161cb0ef41Sopenharmony_ci	$UMULH  r12,r6,r8
18171cb0ef41Sopenharmony_ci	adde	r11,r11,r10		# add r10 from the previous step plus the pending carry bit
18181cb0ef41Sopenharmony_ci	#addze	r12,r12
18191cb0ef41Sopenharmony_ci	$ST	r11,`1*$BNSZ`(r3)
18201cb0ef41Sopenharmony_ci					#mul(rp[2],ap[2],w,c1);
18211cb0ef41Sopenharmony_ci	$LD	r8,`2*$BNSZ`(r4)
18221cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
18231cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
18241cb0ef41Sopenharmony_ci	adde	r9,r9,r12		# add r12 from the previous step plus the pending carry bit
18251cb0ef41Sopenharmony_ci	#addze	r10,r10
18261cb0ef41Sopenharmony_ci	$ST	r9,`2*$BNSZ`(r3)
18271cb0ef41Sopenharmony_ci					#mul(rp[3],ap[3],w,c1);
18281cb0ef41Sopenharmony_ci	$LD	r8,`3*$BNSZ`(r4)
18291cb0ef41Sopenharmony_ci	$UMULL	r11,r6,r8
18301cb0ef41Sopenharmony_ci	$UMULH  r12,r6,r8
18311cb0ef41Sopenharmony_ci	adde	r11,r11,r10		# add r10 from the previous step plus the pending carry bit
18321cb0ef41Sopenharmony_ci	addze	r12,r12			#this spin we collect carry into
18331cb0ef41Sopenharmony_ci					#r12
18341cb0ef41Sopenharmony_ci	$ST	r11,`3*$BNSZ`(r3)
18351cb0ef41Sopenharmony_ci
18361cb0ef41Sopenharmony_ci	addi	r3,r3,`4*$BNSZ`		# advance rp by four words
18371cb0ef41Sopenharmony_ci	addi	r4,r4,`4*$BNSZ`		# advance ap by four words
18381cb0ef41Sopenharmony_ci	bdnz	Lppcasm_mw_LOOP		# decrement CTR, loop while it is non-zero
18391cb0ef41Sopenharmony_ci
18401cb0ef41Sopenharmony_ciLppcasm_mw_REM:
18411cb0ef41Sopenharmony_ci	andi.	r5,r5,0x3		# r5 = num & 3, the leftover word count
18421cb0ef41Sopenharmony_ci	beq	Lppcasm_mw_OVER		# no leftover words
18431cb0ef41Sopenharmony_ci					#mul(rp[0],ap[0],w,c1);
18441cb0ef41Sopenharmony_ci	$LD	r8,`0*$BNSZ`(r4)
18451cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
18461cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
18471cb0ef41Sopenharmony_ci	addc	r9,r9,r12		# add the incoming carry word
18481cb0ef41Sopenharmony_ci	addze	r10,r10			# fold the carry bit into r10 right away
18491cb0ef41Sopenharmony_ci	$ST	r9,`0*$BNSZ`(r3)
18501cb0ef41Sopenharmony_ci	addi	r12,r10,0		# r12 = r10, the new carry word
18511cb0ef41Sopenharmony_ci
18521cb0ef41Sopenharmony_ci	addi	r5,r5,-1		# one leftover word done
18531cb0ef41Sopenharmony_ci	cmpli	0,0,r5,0
18541cb0ef41Sopenharmony_ci	beq	Lppcasm_mw_OVER
18551cb0ef41Sopenharmony_ci
18561cb0ef41Sopenharmony_ci
18571cb0ef41Sopenharmony_ci					#mul(rp[1],ap[1],w,c1);
18581cb0ef41Sopenharmony_ci	$LD	r8,`1*$BNSZ`(r4)
18591cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
18601cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
18611cb0ef41Sopenharmony_ci	addc	r9,r9,r12		# add the incoming carry word
18621cb0ef41Sopenharmony_ci	addze	r10,r10			# fold the carry bit into r10 right away
18631cb0ef41Sopenharmony_ci	$ST	r9,`1*$BNSZ`(r3)
18641cb0ef41Sopenharmony_ci	addi	r12,r10,0		# r12 = r10, the new carry word
18651cb0ef41Sopenharmony_ci
18661cb0ef41Sopenharmony_ci	addi	r5,r5,-1		# one leftover word done
18671cb0ef41Sopenharmony_ci	cmpli	0,0,r5,0
18681cb0ef41Sopenharmony_ci	beq	Lppcasm_mw_OVER
18691cb0ef41Sopenharmony_ci
18701cb0ef41Sopenharmony_ci					#mul(rp[2],ap[2],w,c1);
18711cb0ef41Sopenharmony_ci	$LD	r8,`2*$BNSZ`(r4)
18721cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
18731cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
18741cb0ef41Sopenharmony_ci	addc	r9,r9,r12		# add the incoming carry word
18751cb0ef41Sopenharmony_ci	addze	r10,r10			# fold the carry bit into r10 right away
18761cb0ef41Sopenharmony_ci	$ST	r9,`2*$BNSZ`(r3)
18771cb0ef41Sopenharmony_ci	addi	r12,r10,0		# r12 = r10, the new carry word
18781cb0ef41Sopenharmony_ci
18791cb0ef41Sopenharmony_ciLppcasm_mw_OVER:
18801cb0ef41Sopenharmony_ci	addi	r3,r12,0		# return the final carry word in r3
18811cb0ef41Sopenharmony_ci	blr
18821cb0ef41Sopenharmony_ci	.long	0			# NOTE(review): these three directives look like an
18831cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,4,0	# AIX-style traceback table; the 4 appears to match
18841cb0ef41Sopenharmony_ci	.long	0			# the four arguments -- confirm
18851cb0ef41Sopenharmony_ci.size	.bn_mul_words,.-.bn_mul_words
18861cb0ef41Sopenharmony_ci
18871cb0ef41Sopenharmony_ci#
18881cb0ef41Sopenharmony_ci#	NOTE:	The following label name should be changed to
18891cb0ef41Sopenharmony_ci#		"bn_mul_add_words" i.e. remove the first dot
18901cb0ef41Sopenharmony_ci#		for the gcc compiler. This should be automatically
18911cb0ef41Sopenharmony_ci#		done in the build
18921cb0ef41Sopenharmony_ci#
18931cb0ef41Sopenharmony_ci
18941cb0ef41Sopenharmony_ci.align	4
18951cb0ef41Sopenharmony_ci.bn_mul_add_words:
18961cb0ef41Sopenharmony_ci#
18971cb0ef41Sopenharmony_ci# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
18981cb0ef41Sopenharmony_ci#
18991cb0ef41Sopenharmony_ci# r3 = rp
19001cb0ef41Sopenharmony_ci# r4 = ap
19011cb0ef41Sopenharmony_ci# r5 = num
19021cb0ef41Sopenharmony_ci# r6 = w
19031cb0ef41Sopenharmony_ci#
19041cb0ef41Sopenharmony_ci# empirical evidence suggests that unrolled version performs best!!
19051cb0ef41Sopenharmony_ci#
19061cb0ef41Sopenharmony_ci	xor	r0,r0,r0		#r0 = 0
19071cb0ef41Sopenharmony_ci	xor	r12,r12,r12  		#r12 = 0 . used for carry
19081cb0ef41Sopenharmony_ci	rlwinm.	r7,r5,30,2,31		# num >> 2
19091cb0ef41Sopenharmony_ci	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
19101cb0ef41Sopenharmony_ci	mtctr	r7
19111cb0ef41Sopenharmony_ciLppcasm_maw_mainloop:
19121cb0ef41Sopenharmony_ci					#mul_add(rp[0],ap[0],w,c1);
19131cb0ef41Sopenharmony_ci	$LD	r8,`0*$BNSZ`(r4)
19141cb0ef41Sopenharmony_ci	$LD	r11,`0*$BNSZ`(r3)
19151cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
19161cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
19171cb0ef41Sopenharmony_ci	addc	r9,r9,r12		#r12 is carry.
19181cb0ef41Sopenharmony_ci	addze	r10,r10
19191cb0ef41Sopenharmony_ci	addc	r9,r9,r11
19201cb0ef41Sopenharmony_ci	#addze	r10,r10
19211cb0ef41Sopenharmony_ci					#the above instruction addze
19221cb0ef41Sopenharmony_ci					#is NOT needed. Carry will NOT
19231cb0ef41Sopenharmony_ci					#be ignored. It's not affected
19241cb0ef41Sopenharmony_ci					#by multiply and will be collected
19251cb0ef41Sopenharmony_ci					#in the next spin
19261cb0ef41Sopenharmony_ci	$ST	r9,`0*$BNSZ`(r3)
19271cb0ef41Sopenharmony_ci
19281cb0ef41Sopenharmony_ci					#mul_add(rp[1],ap[1],w,c1);
19291cb0ef41Sopenharmony_ci	$LD	r8,`1*$BNSZ`(r4)
19301cb0ef41Sopenharmony_ci	$LD	r9,`1*$BNSZ`(r3)
19311cb0ef41Sopenharmony_ci	$UMULL	r11,r6,r8
19321cb0ef41Sopenharmony_ci	$UMULH  r12,r6,r8
19331cb0ef41Sopenharmony_ci	adde	r11,r11,r10		#r10 is carry.
19341cb0ef41Sopenharmony_ci	addze	r12,r12
19351cb0ef41Sopenharmony_ci	addc	r11,r11,r9
19361cb0ef41Sopenharmony_ci	#addze	r12,r12
19371cb0ef41Sopenharmony_ci	$ST	r11,`1*$BNSZ`(r3)
19381cb0ef41Sopenharmony_ci
19391cb0ef41Sopenharmony_ci					#mul_add(rp[2],ap[2],w,c1);
19401cb0ef41Sopenharmony_ci	$LD	r8,`2*$BNSZ`(r4)
19411cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
19421cb0ef41Sopenharmony_ci	$LD	r11,`2*$BNSZ`(r3)
19431cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
19441cb0ef41Sopenharmony_ci	adde	r9,r9,r12
19451cb0ef41Sopenharmony_ci	addze	r10,r10
19461cb0ef41Sopenharmony_ci	addc	r9,r9,r11
19471cb0ef41Sopenharmony_ci	#addze	r10,r10
19481cb0ef41Sopenharmony_ci	$ST	r9,`2*$BNSZ`(r3)
19491cb0ef41Sopenharmony_ci
19501cb0ef41Sopenharmony_ci					#mul_add(rp[3],ap[3],w,c1);
19511cb0ef41Sopenharmony_ci	$LD	r8,`3*$BNSZ`(r4)
19521cb0ef41Sopenharmony_ci	$UMULL	r11,r6,r8
19531cb0ef41Sopenharmony_ci	$LD	r9,`3*$BNSZ`(r3)
19541cb0ef41Sopenharmony_ci	$UMULH  r12,r6,r8
19551cb0ef41Sopenharmony_ci	adde	r11,r11,r10
19561cb0ef41Sopenharmony_ci	addze	r12,r12
19571cb0ef41Sopenharmony_ci	addc	r11,r11,r9
19581cb0ef41Sopenharmony_ci	addze	r12,r12
19591cb0ef41Sopenharmony_ci	$ST	r11,`3*$BNSZ`(r3)
19601cb0ef41Sopenharmony_ci	addi	r3,r3,`4*$BNSZ`
19611cb0ef41Sopenharmony_ci	addi	r4,r4,`4*$BNSZ`
19621cb0ef41Sopenharmony_ci	bdnz	Lppcasm_maw_mainloop
19631cb0ef41Sopenharmony_ci
19641cb0ef41Sopenharmony_ciLppcasm_maw_leftover:
19651cb0ef41Sopenharmony_ci	andi.	r5,r5,0x3
19661cb0ef41Sopenharmony_ci	beq	Lppcasm_maw_adios
19671cb0ef41Sopenharmony_ci	addi	r3,r3,-$BNSZ
19681cb0ef41Sopenharmony_ci	addi	r4,r4,-$BNSZ
19691cb0ef41Sopenharmony_ci					#mul_add(rp[0],ap[0],w,c1);
19701cb0ef41Sopenharmony_ci	mtctr	r5
19711cb0ef41Sopenharmony_ci	$LDU	r8,$BNSZ(r4)
19721cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
19731cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
19741cb0ef41Sopenharmony_ci	$LDU	r11,$BNSZ(r3)
19751cb0ef41Sopenharmony_ci	addc	r9,r9,r11
19761cb0ef41Sopenharmony_ci	addze	r10,r10
19771cb0ef41Sopenharmony_ci	addc	r9,r9,r12
19781cb0ef41Sopenharmony_ci	addze	r12,r10
19791cb0ef41Sopenharmony_ci	$ST	r9,0(r3)
19801cb0ef41Sopenharmony_ci
19811cb0ef41Sopenharmony_ci	bdz	Lppcasm_maw_adios
19821cb0ef41Sopenharmony_ci					#mul_add(rp[1],ap[1],w,c1);
19831cb0ef41Sopenharmony_ci	$LDU	r8,$BNSZ(r4)
19841cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
19851cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
19861cb0ef41Sopenharmony_ci	$LDU	r11,$BNSZ(r3)
19871cb0ef41Sopenharmony_ci	addc	r9,r9,r11
19881cb0ef41Sopenharmony_ci	addze	r10,r10
19891cb0ef41Sopenharmony_ci	addc	r9,r9,r12
19901cb0ef41Sopenharmony_ci	addze	r12,r10
19911cb0ef41Sopenharmony_ci	$ST	r9,0(r3)
19921cb0ef41Sopenharmony_ci
19931cb0ef41Sopenharmony_ci	bdz	Lppcasm_maw_adios
19941cb0ef41Sopenharmony_ci					#mul_add(rp[2],ap[2],w,c1);
19951cb0ef41Sopenharmony_ci	$LDU	r8,$BNSZ(r4)
19961cb0ef41Sopenharmony_ci	$UMULL	r9,r6,r8
19971cb0ef41Sopenharmony_ci	$UMULH  r10,r6,r8
19981cb0ef41Sopenharmony_ci	$LDU	r11,$BNSZ(r3)
19991cb0ef41Sopenharmony_ci	addc	r9,r9,r11
20001cb0ef41Sopenharmony_ci	addze	r10,r10
20011cb0ef41Sopenharmony_ci	addc	r9,r9,r12
20021cb0ef41Sopenharmony_ci	addze	r12,r10
20031cb0ef41Sopenharmony_ci	$ST	r9,0(r3)
20041cb0ef41Sopenharmony_ci
20051cb0ef41Sopenharmony_ciLppcasm_maw_adios:
20061cb0ef41Sopenharmony_ci	addi	r3,r12,0
20071cb0ef41Sopenharmony_ci	blr
20081cb0ef41Sopenharmony_ci	.long	0
20091cb0ef41Sopenharmony_ci	.byte	0,12,0x14,0,0,0,4,0
20101cb0ef41Sopenharmony_ci	.long	0
20111cb0ef41Sopenharmony_ci.size	.bn_mul_add_words,.-.bn_mul_add_words
20121cb0ef41Sopenharmony_ci	.align	4
20131cb0ef41Sopenharmony_ciEOF
# Expand the placeholders written between backticks in $data: the text
# enclosed by each pair of backticks is handed to string eval, and the whole
# backtick-delimited span is replaced by whatever that evaluation returns.
$data =~ s{`([^`]*)`}{eval($1)}ge;

# Emit the expanded text on the currently selected output handle.
print $data;

# Close STDOUT explicitly and abort if that fails, since buffered write
# errors only surface at close time.
die "error closing STDOUT: $!" unless close(STDOUT);
2017