#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is worse result than expected: loop is scheduled for 12
# and the result should be close to 12. In the lack of instruction-
# level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# 8KB buffer ~7x faster than software implementation. It's not as
# impressive for smaller buffer sizes and for smallest 16-bytes buffer
# it's actually almost 2 times slower. Which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.

use strict;
use warnings;

# Command line: [flavour] [output-file]
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0]      !~ m|\.|     ? shift : undef;

# A flavour containing "31" or "32" selects the 4-byte size_t build; a
# missing flavour falls through to the 64-bit settings.
my $is_31bit = defined $flavour && $flavour =~ /3[12]/;

my $SIZE_T;	# size of a saved register slot in the stack frame
my $g;		# suffix turning stm/lm into their 64-bit forms stmg/lmg
if ($is_31bit) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Redirect the generated assembly to the requested file.  Fail right away,
# naming the file, instead of discovering the problem only when STDOUT is
# closed at the very end.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# When true, the KIMD-GHASH hardware paths are not emitted at all.
my $softonly=0;

# 128-bit accumulator Z, kept as two 64-bit halves.
my $Zhi="%r0";
my $Zlo="%r1";

my $Xi="%r2";	# argument block
my $Htbl="%r3";
my $inp="%r4";
my $len="%r5";

my $rem0="%r6";	# variables
my $rem1="%r7";
my $nlo="%r8";
my $nhi="%r9";
my $xi="%r10";
my $cnt="%r11";
my $tmp="%r12";
my $x78="%r13";
my $rem_4bit="%r14";

my $sp="%r15";

my $code="";

# ---------------------------------------------------------------------
# gcm_gmult_4bit(Xi, Htbl): entry point.
# ---------------------------------------------------------------------
$code.=<<___;
#include "s390x_arch.h"

.text

.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
___
# Hardware path for the single-block multiply.  It is deliberately compiled
# out by the constant "&& 0": see the March 2011 note above.
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
	larl	%r1,OPENSSL_s390xcap_P
	lghi	%r0,0
	lg	%r1,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
				#  vector
	tmhh	%r1,0x4000	# check for function 65
	jz	.Lsoft_gmult
	stg	%r0,16($sp)	# arrange 16 bytes of zero input
	stg	%r0,24($sp)
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	la	$inp,16($sp)
	lghi	$len,16
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_gmult:
___
# Software gmult prologue.  It saves %r6-%r14, biases Xi by -1 (the shared
# loop addresses Xi bytes as 0(cnt,Xi) with cnt counting down from 14 to 1),
# sets len to 1 so the outer brctg falls through after a single pass, and
# jumps into the loop body shared with gcm_ghash_4bit.
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	j	.Lgmult_shortcut
.type	gcm_gmult_4bit,\@function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
___
# ---------------------------------------------------------------------
# gcm_ghash_4bit(Xi, Htbl, inp, len): use KIMD-GHASH (function 65) when the
# capability vector advertises it, otherwise fall through to .Lsoft_ghash.
# ---------------------------------------------------------------------
$code.=<<___ if(!$softonly);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					#  vector
	tmhh	%r0,0x4000	# check for function 65
	jz	.Lsoft_ghash
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_ghash:
___
# In the 31/32-bit flavour the length arrives as a 32-bit value; zero-extend
# it before the 64-bit shift and loop-count instructions below use it.
$code.=<<___ if ($is_31bit);
	llgfr	$len,$len
___
# Software GHASH.
#
#  * len is converted from bytes to 16-byte blocks (srlg by 4) and drives
#    the outer loop through brctg.
#  * .Louter xors the next input block into Xi and stores it back, so the
#    inner loop can fetch Xi bytes from memory with llgc.
#  * x78 holds the mask 0xf<<3: the low nibble of Z scaled by 8 indexes the
#    8-byte entries of rem_4bit.
#  * The last rem_4bit value of a pass is left in tmp (shifted left by 4)
#    and folded into Zhi either at the top of the next .Louter iteration or
#    after the loop ends.
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	lg	$Zhi,0+1($Xi)
	lghi	$tmp,0
.Louter:
	xg	$Zhi,0($inp)		# Xi ^= inp
	xg	$Zlo,8($inp)
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)

.Lgmult_shortcut:
	lghi	$tmp,0xf0
	sllg	$nlo,$Zlo,4
	srlg	$xi,$Zlo,8		# extract second byte
	ngr	$nlo,$tmp
	lgr	$nhi,$Zlo
	lghi	$cnt,14
	ngr	$nhi,$tmp

	lg	$Zlo,8($nlo,$Htbl)
	lg	$Zhi,0($nlo,$Htbl)

	sllg	$nlo,$xi,4
	sllg	$rem0,$Zlo,3
	ngr	$nlo,$tmp
	ngr	$rem0,$x78
	ngr	$xi,$tmp

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	j	.Lghash_inner
.align	16
.Lghash_inner:
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	llgc	$xi,0($cnt,$Xi)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$nlo,$xi,4
	xg	$Zhi,0($rem0,$rem_4bit)
	nill	$nlo,0xf0
	sllg	$rem0,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem0,$x78
	nill	$xi,0xf0

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	xg	$Zhi,0($rem1,$rem_4bit)
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	brct	$cnt,.Lghash_inner

	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$xi,$Zlo,3
	xg	$Zhi,0($rem0,$rem_4bit)
	xgr	$Zlo,$tmp
	ngr	$xi,$x78

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	xgr	$Zlo,$tmp
	xg	$Zhi,0($rem1,$rem_4bit)

	lg	$tmp,0($xi,$rem_4bit)
	la	$inp,16($inp)
	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
	brctg	$len,.Louter

	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align	64
rem_4bit:
	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type	rem_4bit,\@object
.size	rem_4bit,(.-rem_4bit)
.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Fold every `expr` in the generated text into its numeric value so the
# assembler only ever sees plain constants.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";