#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128-byte shared table]. Performance
# results are for the streamed GHASH subroutine on an UltraSPARC pre-Tx
# CPU and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for the Z vector (see C code) even in the 32-bit build... Oh well, it
# only means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled with respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
# saturates at ~15.5x the single-process result on an 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

# Output destination: when a last command-line argument is present it
# names the file that receives the generated assembly; otherwise the text
# goes to the inherited STDOUT.
# The three-argument form of open keeps characters in the file name from
# being interpreted as part of the open mode, and a failure is reported
# immediately instead of surfacing only at the final "close STDOUT".
$output = pop @ARGV;
if ($output) {
    open STDOUT, ">", $output or die "can't open $output: $!";
}

# Symbolic names that are pasted verbatim into the generated text.
# $frame is used in every "save %sp,-$frame,%sp"; it is presumably
# resolved by crypto/sparc_arch.h when the output is preprocessed.
# NOTE(review): $bias is not referenced by any template visible in this
# file; it is kept so that the set of globals is unchanged.
$frame="STACK_FRAME";
$bias="STACK_BIAS";

# Register allocation for the table-driven ("4-bit") routines.

# 64-bit values: %o0-%o5
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# small values and pointers: %l0-%l7
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# input argument block: %i0-%i3
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));

# Template for the shared rem_4bit table and for gcm_ghash_4bit.
#
# rem_4bit holds 16 eight-byte entries (each written as a
# `.long value<<16,0` pair).  The code indexes it with (Z.lo & 0xf) << 3
# and XORs the loaded value into Z.hi after every 4-bit shift of Z.
#
# gcm_ghash_4bit receives Xi in %i0, Htbl in %i1, inp in %i2 and len in
# %i3.  len is first turned into an end pointer (inp+len).  Each pass of
# .Louter consumes 16 input bytes, combining inp[i] with Xi[i] for i from
# 15 down to 0, and stores the updated Z.hi/Z.lo to [Xi]/[Xi+8].  The loop
# terminates only when inp becomes exactly equal to the end pointer, so
# len is expected to be a non-zero multiple of 16.
#
# "1: call .+8" followed by "add %o7,rem_4bit-1b,..." materialises the
# address of rem_4bit relative to the call instruction, i.e. without an
# absolute reference to the table.
#
# NOTE: everything between the heredoc markers is emitted verbatim (after
# variable interpolation and the backtick evaluation done at the end of
# this script), so the `#` lines in there are C preprocessor directives,
# not Perl comments.  The instruction order is modulo-scheduled on
# purpose (see the notes at the top of the file); do not reorder it.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

# gcm_gmult_4bit takes only Xi (%i0) and Htbl (%i1).  $inp and $len are
# undefined here, presumably so that a stray reference to either in the
# template below does not silently pick up a register name.
undef $inp;
undef $len;

# Template for gcm_gmult_4bit: a single multiplication of Xi by H, done
# in place.  The bytes of Xi are read from offset 15 down to 0, the same
# Htbl/rem_4bit lookups as in gcm_ghash_4bit are applied, and the
# resulting Z.hi/Z.lo are stored to [Xi]/[Xi+8].
#
# As in the other templates, the text between the heredoc markers is
# emitted verbatim after interpolation, and the instruction order is
# scheduled on purpose; do not reorder it.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# VIS3 (lookup-table-free) routines: gcm_init_vis3, gcm_gmult_vis3 and
# gcm_ghash_vis3.
#
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.
#
# gcm_init_vis3 reads H from [%i1] and writes to [%i0]: the "twisted" H
# at offsets 0 and 8, and the constant 0xA040608020C0E000 at offset 16.
# gcm_gmult_vis3 and gcm_ghash_vis3 load all three back from $Htable.
#
# gcm_ghash_vis3 rounds inp down to an 8-byte boundary and, when inp was
# misaligned, rebuilds each 16-byte block from three aligned 8-byte loads
# using the $shl/$shr shift counts.  Its loop ends when len, decremented
# by 16 per block, reaches exactly zero, so len is expected to be a
# non-zero multiple of 16.
#
# The xmulx, xmulxhi and addxc instructions in this template are turned
# into .word directives by unvis3() when the text is emitted.

# Arguments of gcm_gmult_vis3/gcm_ghash_vis3: %i0-%i3.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

# 64-bit working registers: %o0-%o5 and %o7 (%o6 is left out), then
# %g1-%g5.
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Shift counts for unaligned input: %l0 and %l1.  The template also uses
# %l0 (in gcm_init_vis3) and %l7 directly as scratch registers.
($shl,$shr)=map("%l$_",(0..1));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Identification string placed after the code, followed by realignment
# to a 4-byte boundary.  The backslash keeps Perl from interpolating
# "@openssl" as an array inside the heredoc.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# unvis3(mnemonic, rs1, rs2, rd) -- hand-assemble one VIS3 instruction.
#
# The purpose of encoding VIS3 instructions explicitly is that the module
# can be compiled without having to specify VIS extensions on the
# compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.  The idea is to
# keep the option of producing a "universal" binary and let the
# programmer detect at run-time whether the current CPU is VIS capable.
#
# Arguments:
#   $mnemonic  - instruction name; addxc, addxccc, xmulx and xmulxhi are
#                the ones this routine can encode
#   $rs1,$rs2  - source registers, written as %g0-%g7, %o0-%o7, %l0-%l7
#                or %i0-%i7
#   $rd        - destination register, same notation
#
# Returns ".word\t0x<encoding> !<mnemonic>\t<rs1>,<rs2>,<rd>" when the
# mnemonic is known and all three operands are valid register names.
# Otherwise the plain "<mnemonic>\t<rs1>,<rs2>,<rd>" text is returned, so
# that the assembler itself accepts or rejects the instruction.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Number of the first register in each register group.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    # opf field values of the supported instructions.
    my %visopf = (	"addxc"		=> 0x011,
			"addxccc"	=> 0x013,
			"xmulx"		=> 0x115,
			"xmulxhi"	=> 0x116	);

    # Textual form; built before the operands are overwritten below.
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless defined $opf;

    # Replace each register name by its 5-bit number.  The pattern is
    # anchored and limited to 0-7 so that an operand such as %g9 or %o10
    # is passed through untouched instead of being encoded as some other
    # register.
    foreach ($rs1,$rs2,$rd) {
	return $ref unless /\A%([goli])([0-7])\z/;
	$_ = $bias{$1}+$2;
    }

    # 0x81b00000 supplies op=2 (bits 31:30) and op3=0x36 (bits 24:19);
    # rd goes into bits 29:25, rs1 into 18:14, opf into 13:5, rs2 into 4:0.
    return sprintf ".word\t0x%08x !%s",
		   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
		   $ref;
}

# Post-process the accumulated template line by line and print it.
foreach my $line (split("\n",$code)) {
	# Replace every `expr` by the value of the Perl expression expr
	# (used for the constants in the rem_4bit table).  A failing
	# expression aborts the run; otherwise it would silently turn into
	# an empty string in the generated assembly.
	$line =~ s{\`([^\`]*)\`}{
		my $expr  = $1;
		my $value = eval $expr;
		die "cannot evaluate `$expr`: $@" if $@;
		$value
	}ge;

	# Replace VIS3 instructions with register operands by the
	# .word encoding produced by unvis3().
	$line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	 /ge;

	print $line,"\n";
}

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";