#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: the loop is scheduled for
# 12 and the result should be close to 12. In the absence of
# instruction-level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# an 8KB buffer ~7 times faster than the software implementation. It's
# not as impressive for smaller buffer sizes, and for the smallest
# 16-byte buffer it's actually almost 2 times slower. Which is the
# reason why KIMD-GHASH is not used in gcm_gmult_4bit.

# $output is the last argument if it looks like a file (it has an extension).
# $flavour is the first argument if it doesn't look like a file (no dot).
# Either one is left undefined when no matching argument is present.
# The output name is removed from @ARGV first, so a lone "foo.S" argument is
# taken as $output and never as $flavour.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Pick the ABI-dependent parameters from $flavour.  A flavour matching
# /3[12]/ selects 4-byte save-area slots and the plain stm/lm mnemonics;
# anything else, including a missing flavour, selects 8-byte slots and the
# "g"-suffixed stmg/lmg forms.
if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;	# multiplier for the register save-area offset
	$g="";		# mnemonic suffix: stm/lm
} else {
	$SIZE_T=8;
	$g="g";		# mnemonic suffix: stmg/lmg
}

# Send the generated assembly to $output when one was given; otherwise the
# inherited STDOUT is used.  The three-argument form keeps characters in the
# file name from being interpreted as part of the open mode, and a failed
# open is fatal instead of being silently ignored.
if ($output) {
	open STDOUT, ">", $output or die "can't open $output: $!";
}

# When non-zero, the KIMD-GHASH hardware path is left out of gcm_ghash_4bit
# and only the table-driven software code is emitted.
$softonly=0;

# The 128-bit value being accumulated, kept as two 64-bit halves.
$Zhi="%r0";
$Zlo="%r1";

$Xi="%r2";	# argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";

$rem0="%r6";	# variables
$rem1="%r7";
$nlo="%r8";	# table offset derived from the low nibble of the current byte
$nhi="%r9";	# table offset derived from the high nibble of the current byte
$xi="%r10";	# current byte of Xi
$cnt="%r11";	# inner loop counter, also used as byte index into Xi
$tmp="%r12";
$x78="%r13";	# constant mask 0xf<<3, i.e. nibble scaled to 8-byte entries
$rem_4bit="%r14";	# address of the rem_4bit table

$sp="%r15";

# --------------------------------------------------------------------
# gcm_gmult_4bit: single multiplication of Xi (argument in $Xi) using the
# table in $Htbl.
#
# The template is assembled from three pieces:
#   1. file preamble and the function label;
#   2. a KIMD-based hardware variant, permanently disabled by "&& 0";
#   3. the software entry: save %r6-%r14, bias $Xi by -1, set the block
#      count $len to 1, load the low half of Xi and branch to
#      .Lgmult_shortcut, which lives in the gcm_ghash_4bit body.  With
#      $len preset to 1 the shared code makes exactly one pass.
# The last piece also emits the label that opens gcm_ghash_4bit, so the
# statements that follow continue that function.
# Text between `...` is evaluated as Perl when the template is finalised.
# --------------------------------------------------------------------
$code.=<<___;
#include "s390x_arch.h"

.text

.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
___
# Kept for reference only: the constant "&& 0" means this text is never
# appended, so .Lsoft_gmult is not emitted either.
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
	larl	%r1,OPENSSL_s390xcap_P
	lghi	%r0,0
	lg	%r1,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					#  vector
	tmhh	%r1,0x4000	# check for function 65
	jz	.Lsoft_gmult
	stg	%r0,16($sp)	# arrange 16 bytes of zero input
	stg	%r0,24($sp)
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	la	$inp,16($sp)
	lghi	$len,16
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_gmult:
___
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	j	.Lgmult_shortcut
.type	gcm_gmult_4bit,\@function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
___
# --------------------------------------------------------------------
# gcm_ghash_4bit body (its label is emitted by the preceding template),
# followed by the shared rem_4bit table.
#
# Pieces, in emission order:
#   1. Hardware path, unless $softonly: test the KIMD capability bit for
#      function 65 and, if present, run KIMD with function code
#      S390X_GHASH and the parameter block at $Xi, re-issuing it while the
#      operation reports partial completion.  Otherwise fall through to
#      .Lsoft_ghash.
#   2. For a /3[12]/ flavour only: zero-extend $len to 64 bits.
#   3. Software path: save %r6-%r14, bias $Xi by -1, turn $len into a
#      count of 16-byte blocks ($len >> 4), then per block (.Louter) XOR
#      the input into Xi, store it back and run the nibble-wise table
#      loop.  .Lgmult_shortcut is the entry point used by gcm_gmult_4bit.
#      The inner loop runs with $cnt counting down from 14 and fetches one
#      byte of Xi per iteration through 0($cnt,$Xi), which is why $Xi was
#      biased by -1.
#   4. rem_4bit: sixteen 8-byte entries, indexed by a nibble scaled by 8
#      (mask $x78 = 0xf<<3).
# Text between `...` is evaluated as Perl when the template is finalised.
# --------------------------------------------------------------------
$code.=<<___ if(!$softonly);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					#  vector
	tmhh	%r0,0x4000	# check for function 65
	jz	.Lsoft_ghash
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_ghash:
___
$code.=<<___ if (defined($flavour) && $flavour =~ /3[12]/);
	llgfr	$len,$len
___
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	lg	$Zhi,0+1($Xi)
	lghi	$tmp,0
.Louter:
	xg	$Zhi,0($inp)		# Xi ^= inp
	xg	$Zlo,8($inp)
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)

.Lgmult_shortcut:
	lghi	$tmp,0xf0
	sllg	$nlo,$Zlo,4
	srlg	$xi,$Zlo,8		# extract second byte
	ngr	$nlo,$tmp
	lgr	$nhi,$Zlo
	lghi	$cnt,14
	ngr	$nhi,$tmp

	lg	$Zlo,8($nlo,$Htbl)
	lg	$Zhi,0($nlo,$Htbl)

	sllg	$nlo,$xi,4
	sllg	$rem0,$Zlo,3
	ngr	$nlo,$tmp
	ngr	$rem0,$x78
	ngr	$xi,$tmp

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	j	.Lghash_inner
.align	16
.Lghash_inner:
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	llgc	$xi,0($cnt,$Xi)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$nlo,$xi,4
	xg	$Zhi,0($rem0,$rem_4bit)
	nill	$nlo,0xf0
	sllg	$rem0,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem0,$x78
	nill	$xi,0xf0

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	xg	$Zhi,0($rem1,$rem_4bit)
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	brct	$cnt,.Lghash_inner

	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$xi,$Zlo,3
	xg	$Zhi,0($rem0,$rem_4bit)
	xgr	$Zlo,$tmp
	ngr	$xi,$x78

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	xgr	$Zlo,$tmp
	xg	$Zhi,0($rem1,$rem_4bit)

	lg	$tmp,0($xi,$rem_4bit)
	la	$inp,16($inp)
	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
	brctg	$len,.Louter

	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align	64
rem_4bit:
	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type	rem_4bit,\@object
.size	rem_4bit,(.-rem_4bit)
.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Finalise the template: every `...` span (for example `0xf<<3` or the
# `0x1C20<<12` table entries) is evaluated as a Perl expression and replaced
# by its value.  The result is then written to STDOUT, which may have been
# redirected to the output file.  close is checked because buffered write
# errors only surface there.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
265