#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# --------------------------------------------------------------------
# Command line: <flavour> [<output>]
#
# <flavour> selects the target: it must contain "64" or "32", and a
#           trailing "le" selects little-endian output when the le?/be?
#           prefixes of the assembly template are resolved.
# <output>  is passed on to ppc-xlate.pl unchanged.
# --------------------------------------------------------------------
$flavour=shift;
$output =shift;

# Word-size dependent constants and load/store mnemonics.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Locate ppc-xlate.pl: first next to this script, then in ../../perlasm/
# relative to it.  $dir keeps the trailing path separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed to STDOUT is piped into the translator.
# The low-precedence "or" is essential: with "||" the die would attach to
# the command string (always true) and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# --------------------------------------------------------------------
# Symbolic register names that are interpolated into the assembly
# template below.
# --------------------------------------------------------------------
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

# Vector registers v0..v3 and v4..v12.
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
# General-purpose register that keeps the value read from SPR 256 (mfspr)
# until it is written back (mtspr) at the end of each routine.
my $vrsave="r12";
# --------------------------------------------------------------------
# Assembly template.  The heredoc interpolates the register names
# declared above; its text (including the "#" comments inside it) is
# emitted as-is, apart from the le?/be? prefixes resolved further down.
#
# .gcm_init_p8   loads H from the address in r4 and stores four 16-byte
#                vectors at r3+0x00/0x10/0x20/0x30: the 0xc2... constant,
#                then Hl, H and Hh (the "twisted" H and its split halves).
# .gcm_gmult_p8  loads Xi from $Xip and the table from $Htbl, performs one
#                multiplication followed by the two reduction phases and
#                writes Xi back.
# .gcm_ghash_p8  same, but folds input from $inp into Xi 16 bytes per
#                loop iteration; $len is the byte count.  The first block
#                is loaded unconditionally, so $len is presumably expected
#                to be a non-zero multiple of 16 -- verify against callers.
# --------------------------------------------------------------------
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___
# --------------------------------------------------------------------
# Resolve the endian-conditional "le?" / "be?" instruction prefixes and
# print the template line by line to STDOUT.
#
# For the selected endianness the prefix is removed, so the instruction
# is emitted; the prefix of the other endianness is replaced by a
# "#le#" / "#be#" marker (presumably so that the line is treated as a
# comment downstream).  At most one substitution is applied per line,
# and only the first occurrence on a line is touched.
# --------------------------------------------------------------------
my $little_endian = ($flavour =~ /le$/);	# same for every line

foreach my $line (split("\n",$code)) {
	if ($little_endian) {
	    $line =~ s/le\?//		or
	    $line =~ s/be\?/#be#/;
	} else {
	    $line =~ s/le\?/#le#/	or
	    $line =~ s/be\?//;
	}
	print $line,"\n";
}
# --------------------------------------------------------------------
# Closing the pipe flushes the buffered output and collects the exit
# status of the process STDOUT was piped into.  Check the result so
# that a write error or a failing child does not pass silently and
# leave a possibly truncated output behind.
# --------------------------------------------------------------------
close STDOUT or die "error closing STDOUT: $!";