1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci######################################################################
11e1051a39Sopenharmony_ci## Constant-time SSSE3 AES core implementation.
12e1051a39Sopenharmony_ci## version 0.1
13e1051a39Sopenharmony_ci##
14e1051a39Sopenharmony_ci## By Mike Hamburg (Stanford University), 2009
15e1051a39Sopenharmony_ci## Public domain.
16e1051a39Sopenharmony_ci##
17e1051a39Sopenharmony_ci## For details see http://shiftleft.org/papers/vector_aes/ and
18e1051a39Sopenharmony_ci## http://crypto.stanford.edu/vpaes/.
19e1051a39Sopenharmony_ci
20e1051a39Sopenharmony_ci# CBC encrypt/decrypt performance in cycles per byte processed with
21e1051a39Sopenharmony_ci# 128-bit key.
22e1051a39Sopenharmony_ci#
23e1051a39Sopenharmony_ci#		aes-ppc.pl		this
24e1051a39Sopenharmony_ci# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
25e1051a39Sopenharmony_ci# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
26e1051a39Sopenharmony_ci# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
27e1051a39Sopenharmony_ci# POWER7	32.3/42.9/(18.4)	18.5/23.3
28e1051a39Sopenharmony_ci#
29e1051a39Sopenharmony_ci# (*)	This is ~10% worse than reported in paper. The reason is
30e1051a39Sopenharmony_ci#	twofold. This module doesn't make any assumption about
31e1051a39Sopenharmony_ci#	key schedule (or data for that matter) alignment and handles
32e1051a39Sopenharmony_ci#	it in-line. Secondly it, being transliterated from
33e1051a39Sopenharmony_ci#	vpaes-x86_64.pl, relies on "nested inversion" better suited
34e1051a39Sopenharmony_ci#	for Intel CPUs.
35e1051a39Sopenharmony_ci# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
36e1051a39Sopenharmony_ci#	latency, 9 cycles per simple logical operation.
37e1051a39Sopenharmony_ci
# Command line: [flavour] ... [output-file]
#
# $output  - taken from the END of @ARGV, but only when that argument ends
#            in a file extension (a dot followed by word characters).
# $flavour - taken from the FRONT of what remains, but only when that
#            argument contains no dot at all.
# Either variable is left undefined when its argument is absent; an argument
# that is consumed is removed from @ARGV.
$output = undef;
if (@ARGV > 0 && $ARGV[-1] =~ /\.\w+$/) {
	$output = pop @ARGV;
}

$flavour = undef;
if (@ARGV > 0 && $ARGV[0] !~ /\./) {
	$flavour = shift @ARGV;
}
42e1051a39Sopenharmony_ci
# Per-ABI generator parameters.  The first row whose pattern occurs anywhere
# in $flavour wins; 64 is tried before 32, and anything else is fatal.
#
#   $SIZE_T - size in bytes of a pointer-sized stack slot
#   $LRSAVE - offset above the caller's stack pointer at which the public
#             entry points store the link register
#   $STU    - store-with-update mnemonic used to allocate the stack frame
#   $POP    - pointer-sized load mnemonic
#   $PUSH   - pointer-sized store mnemonic
#   $UCMP   - unsigned pointer-sized compare mnemonic
my @abi_rows = (
	#  pattern  SIZE_T  LRSAVE  STU      POP    PUSH   UCMP
	[ qr/64/,   8,      16,     "stdu",  "ld",  "std", "cmpld" ],
	[ qr/32/,   4,      4,      "stwu",  "lwz", "stw", "cmplw" ],
);
my ($abi_row) = grep { $flavour =~ $_->[0] } @abi_rows;
die "nonsense $flavour" unless $abi_row;
(undef, $SIZE_T, $LRSAVE, $STU, $POP, $PUSH, $UCMP) = @$abi_row;

# Stack pointer register name used in the templates below.
$sp = "r1";

# Frame size: six pointer-sized slots followed by the AltiVec save area.
# v20-v31 (twelve registers) are stored starting at offset 15+6*$SIZE_T and
# vrsave is stored at $FRAME-4.
# NOTE(review): the 13th 16-byte slot presumably absorbs the alignment
# rounding done by stvx plus the vrsave word -- confirm.
$FRAME = 6*$SIZE_T + 13*16;
61e1051a39Sopenharmony_ci
# Locate ppc-xlate.pl, the translator that the generated code is piped
# through.  $dir is the directory part of this script's own path, including
# the trailing "/" or "\"; it stays undefined (and interpolates as an empty
# string) when $0 has no directory component.  The translator is looked up
# next to this script first, then under ../../perlasm/ relative to it.
($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};

$xlate = "${dir}ppc-xlate.pl";
if (!-f $xlate) {
	$xlate = "${dir}../../perlasm/ppc-xlate.pl";
	-f $xlate or die "can't locate ppc-xlate.pl";
}
66e1051a39Sopenharmony_ci
# Redirect STDOUT into the translator so that everything this script prints
# is post-processed for the selected flavour and written to $output.
#
# The failure check must use low-precedence "or": with "||" the die() is
# attached to the command string (which is always true), so a failed open
# would go unnoticed and the script would carry on printing to a handle
# that was never opened.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
69e1051a39Sopenharmony_ci
# Emit the constant pool (_vpaes_consts) followed by Lconsts, the helper
# that hands the pool's address back to its caller in r12.
#
# Pool layout, as byte offsets from _vpaes_consts (every row is 16 bytes):
#   0x000 Lk_mc_forward   0x040 Lk_mc_backward   0x080 Lk_sr
#   0x0c0 Lk_inv    0x0e0 Lk_ipt    0x100 Lk_sbo    0x120 Lk_sb1
#   0x140 Lk_sb2    0x160 Lk_dipt   0x180 Lk_dsbo   0x1a0 Lk_dsb9
#   0x1c0 Lk_dsbd   0x1e0 Lk_dsbb   0x200 Lk_dsbe   0x220 Lk_dksd
#   0x240 Lk_dksb   0x260 Lk_dkse   0x280 Lk_dks9   0x2a0 Lk_rcon
#   0x2b0 Lk_s63    0x2c0 Lk_opt    0x2e0 Lk_deskew
# That is 48 rows = 0x300 bytes, so Lconsts (".align 5") starts at 0x300.
#
# Lconsts: "bcl 20,31,$+4" leaves the address of the instruction after it
# (pool + 0x308) in LR; the addi subtracts exactly that distance, giving
# r12 = _vpaes_consts.  The caller's LR is parked in r0 and restored before
# returning.
#
# Other code in this file addresses the tables with literal offsets (for
# example "li r11, 0xc0  # Lk_inv"), so adding, removing or reordering rows
# means updating both the -0x308 below and those offsets.
#
# NOTE(review): the trailing ?inv / ?rev / ?asis markers are not assembler
# syntax; presumably ppc-xlate.pl consumes them to adapt each row to the
# target byte order -- confirm in ppc-xlate.pl.
70e1051a39Sopenharmony_ci$code.=<<___;
71e1051a39Sopenharmony_ci.machine	"any"
72e1051a39Sopenharmony_ci
73e1051a39Sopenharmony_ci.text
74e1051a39Sopenharmony_ci
75e1051a39Sopenharmony_ci.align	7	# totally strategic alignment
76e1051a39Sopenharmony_ci_vpaes_consts:
77e1051a39Sopenharmony_ciLk_mc_forward:	# mc_forward
78e1051a39Sopenharmony_ci	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
79e1051a39Sopenharmony_ci	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
80e1051a39Sopenharmony_ci	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
81e1051a39Sopenharmony_ci	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
82e1051a39Sopenharmony_ciLk_mc_backward:	# mc_backward
83e1051a39Sopenharmony_ci	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
84e1051a39Sopenharmony_ci	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
85e1051a39Sopenharmony_ci	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
86e1051a39Sopenharmony_ci	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
87e1051a39Sopenharmony_ciLk_sr:		# sr
88e1051a39Sopenharmony_ci	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
89e1051a39Sopenharmony_ci	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
90e1051a39Sopenharmony_ci	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
91e1051a39Sopenharmony_ci	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
92e1051a39Sopenharmony_ci
93e1051a39Sopenharmony_ci##
94e1051a39Sopenharmony_ci## "Hot" constants
95e1051a39Sopenharmony_ci##
96e1051a39Sopenharmony_ciLk_inv:		# inv, inva
97e1051a39Sopenharmony_ci	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
98e1051a39Sopenharmony_ci	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
99e1051a39Sopenharmony_ciLk_ipt:		# input transform (lo, hi)
100e1051a39Sopenharmony_ci	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
101e1051a39Sopenharmony_ci	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
102e1051a39Sopenharmony_ciLk_sbo:		# sbou, sbot
103e1051a39Sopenharmony_ci	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
104e1051a39Sopenharmony_ci	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
105e1051a39Sopenharmony_ciLk_sb1:		# sb1u, sb1t
106e1051a39Sopenharmony_ci	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
107e1051a39Sopenharmony_ci	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
108e1051a39Sopenharmony_ciLk_sb2:		# sb2u, sb2t
109e1051a39Sopenharmony_ci	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
110e1051a39Sopenharmony_ci	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
111e1051a39Sopenharmony_ci
112e1051a39Sopenharmony_ci##
113e1051a39Sopenharmony_ci##  Decryption stuff
114e1051a39Sopenharmony_ci##
115e1051a39Sopenharmony_ciLk_dipt:	# decryption input transform
116e1051a39Sopenharmony_ci	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
117e1051a39Sopenharmony_ci	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
118e1051a39Sopenharmony_ciLk_dsbo:	# decryption sbox final output
119e1051a39Sopenharmony_ci	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
120e1051a39Sopenharmony_ci	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
121e1051a39Sopenharmony_ciLk_dsb9:	# decryption sbox output *9*u, *9*t
122e1051a39Sopenharmony_ci	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
123e1051a39Sopenharmony_ci	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
124e1051a39Sopenharmony_ciLk_dsbd:	# decryption sbox output *D*u, *D*t
125e1051a39Sopenharmony_ci	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
126e1051a39Sopenharmony_ci	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
127e1051a39Sopenharmony_ciLk_dsbb:	# decryption sbox output *B*u, *B*t
128e1051a39Sopenharmony_ci	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
129e1051a39Sopenharmony_ci	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
130e1051a39Sopenharmony_ciLk_dsbe:	# decryption sbox output *E*u, *E*t
131e1051a39Sopenharmony_ci	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
132e1051a39Sopenharmony_ci	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev
133e1051a39Sopenharmony_ci
134e1051a39Sopenharmony_ci##
135e1051a39Sopenharmony_ci##  Key schedule constants
136e1051a39Sopenharmony_ci##
137e1051a39Sopenharmony_ciLk_dksd:	# decryption key schedule: invskew x*D
138e1051a39Sopenharmony_ci	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
139e1051a39Sopenharmony_ci	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
140e1051a39Sopenharmony_ciLk_dksb:	# decryption key schedule: invskew x*B
141e1051a39Sopenharmony_ci	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
142e1051a39Sopenharmony_ci	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
143e1051a39Sopenharmony_ciLk_dkse:	# decryption key schedule: invskew x*E + 0x63
144e1051a39Sopenharmony_ci	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
145e1051a39Sopenharmony_ci	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
146e1051a39Sopenharmony_ciLk_dks9:	# decryption key schedule: invskew x*9
147e1051a39Sopenharmony_ci	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
148e1051a39Sopenharmony_ci	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
149e1051a39Sopenharmony_ci
150e1051a39Sopenharmony_ciLk_rcon:	# rcon
151e1051a39Sopenharmony_ci	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
152e1051a39Sopenharmony_ciLk_s63:
153e1051a39Sopenharmony_ci	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
154e1051a39Sopenharmony_ci
155e1051a39Sopenharmony_ciLk_opt:		# output transform
156e1051a39Sopenharmony_ci	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
157e1051a39Sopenharmony_ci	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
158e1051a39Sopenharmony_ciLk_deskew:	# deskew tables: inverts the sbox's "skew"
159e1051a39Sopenharmony_ci	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
160e1051a39Sopenharmony_ci	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
161e1051a39Sopenharmony_ci.align	5
162e1051a39Sopenharmony_ciLconsts:
163e1051a39Sopenharmony_ci	mflr	r0
164e1051a39Sopenharmony_ci	bcl	20,31,\$+4
165e1051a39Sopenharmony_ci	mflr	r12	#vvvvv "distance between . and _vpaes_consts
166e1051a39Sopenharmony_ci	addi	r12,r12,-0x308
167e1051a39Sopenharmony_ci	mtlr	r0
168e1051a39Sopenharmony_ci	blr
169e1051a39Sopenharmony_ci	.long	0
170e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
171e1051a39Sopenharmony_ci.asciz  "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
172e1051a39Sopenharmony_ci.align	6
173e1051a39Sopenharmony_ci___
174e1051a39Sopenharmony_ci
# Vector registers v26-v31 hold the tail/head vectors, permute controls and
# mask used for unaligned input, output and key access.  They are declared
# ahead of the brace that follows so they outlive the scope it opens.
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) =
	qw(v26 v27 v28 v29 v30 v31);
176e1051a39Sopenharmony_ci{
# Argument registers of the public entry points: input block, output block,
# key schedule.
my ($inp,$out,$key) = qw(r3 r4 r5);

# Tables kept resident in vector registers by the preheat routines.
# v10-v15 are shared by encryption and decryption.
my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = qw(v10 v11 v12 v13 v14 v15);

# Encryption uses v16-v19 for sb1/sb2 ...
my ($sb1u,$sb1t,$sb2u,$sb2t) = qw(v16 v17 v18 v19);

# ... and decryption reuses the same registers, extending to v23, for the
# sb9/sbd/sbb/sbe tables.  The overlap is intentional: each preheat routine
# loads only its own set.
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet) =
	qw(v16 v17 v18 v19 v20 v21 v22 v23);
182e1051a39Sopenharmony_ci
183e1051a39Sopenharmony_ci$code.=<<___;
184e1051a39Sopenharmony_ci##
185e1051a39Sopenharmony_ci##  _aes_preheat
186e1051a39Sopenharmony_ci##
187e1051a39Sopenharmony_ci##  Fills register %r10 -> .aes_consts (so you can -fPIC)
188e1051a39Sopenharmony_ci##  and %xmm9-%xmm15 as specified below.
189e1051a39Sopenharmony_ci##
190e1051a39Sopenharmony_ci.align	4
191e1051a39Sopenharmony_ci_vpaes_encrypt_preheat:
192e1051a39Sopenharmony_ci	mflr	r8
193e1051a39Sopenharmony_ci	bl	Lconsts
194e1051a39Sopenharmony_ci	mtlr	r8
195e1051a39Sopenharmony_ci	li	r11, 0xc0		# Lk_inv
196e1051a39Sopenharmony_ci	li	r10, 0xd0
197e1051a39Sopenharmony_ci	li	r9,  0xe0		# Lk_ipt
198e1051a39Sopenharmony_ci	li	r8,  0xf0
199e1051a39Sopenharmony_ci	vxor	v7, v7, v7		# 0x00..00
200e1051a39Sopenharmony_ci	vspltisb	v8,4		# 0x04..04
201e1051a39Sopenharmony_ci	vspltisb	v9,0x0f		# 0x0f..0f
202e1051a39Sopenharmony_ci	lvx	$invlo, r12, r11
203e1051a39Sopenharmony_ci	li	r11, 0x100
204e1051a39Sopenharmony_ci	lvx	$invhi, r12, r10
205e1051a39Sopenharmony_ci	li	r10, 0x110
206e1051a39Sopenharmony_ci	lvx	$iptlo, r12, r9
207e1051a39Sopenharmony_ci	li	r9,  0x120
208e1051a39Sopenharmony_ci	lvx	$ipthi, r12, r8
209e1051a39Sopenharmony_ci	li	r8,  0x130
210e1051a39Sopenharmony_ci	lvx	$sbou, r12, r11
211e1051a39Sopenharmony_ci	li	r11, 0x140
212e1051a39Sopenharmony_ci	lvx	$sbot, r12, r10
213e1051a39Sopenharmony_ci	li	r10, 0x150
214e1051a39Sopenharmony_ci	lvx	$sb1u, r12, r9
215e1051a39Sopenharmony_ci	lvx	$sb1t, r12, r8
216e1051a39Sopenharmony_ci	lvx	$sb2u, r12, r11
217e1051a39Sopenharmony_ci	lvx	$sb2t, r12, r10
218e1051a39Sopenharmony_ci	blr
219e1051a39Sopenharmony_ci	.long	0
220e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
221e1051a39Sopenharmony_ci
222e1051a39Sopenharmony_ci##
223e1051a39Sopenharmony_ci##  _aes_encrypt_core
224e1051a39Sopenharmony_ci##
225e1051a39Sopenharmony_ci##  AES-encrypt %xmm0.
226e1051a39Sopenharmony_ci##
227e1051a39Sopenharmony_ci##  Inputs:
228e1051a39Sopenharmony_ci##     %xmm0 = input
229e1051a39Sopenharmony_ci##     %xmm9-%xmm15 as in _vpaes_preheat
230e1051a39Sopenharmony_ci##    (%rdx) = scheduled keys
231e1051a39Sopenharmony_ci##
232e1051a39Sopenharmony_ci##  Output in %xmm0
233e1051a39Sopenharmony_ci##  Clobbers  %xmm1-%xmm6, %r9, %r10, %r11, %rax
234e1051a39Sopenharmony_ci##
235e1051a39Sopenharmony_ci##
236e1051a39Sopenharmony_ci.align 5
237e1051a39Sopenharmony_ci_vpaes_encrypt_core:
238e1051a39Sopenharmony_ci	lwz	r8, 240($key)		# pull rounds
239e1051a39Sopenharmony_ci	li	r9, 16
240e1051a39Sopenharmony_ci	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm5		# round0 key
241e1051a39Sopenharmony_ci	li	r11, 0x10
242e1051a39Sopenharmony_ci	lvx	v6, r9, $key
243e1051a39Sopenharmony_ci	addi	r9, r9, 16
244e1051a39Sopenharmony_ci	?vperm	v5, v5, v6, $keyperm	# align round key
245e1051a39Sopenharmony_ci	addi	r10, r11, 0x40
246e1051a39Sopenharmony_ci	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
247e1051a39Sopenharmony_ci	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
248e1051a39Sopenharmony_ci	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm3,	%xmm2
249e1051a39Sopenharmony_ci	vxor	v0, v0, v5		# vpxor	%xmm5,	%xmm1,	%xmm0
250e1051a39Sopenharmony_ci	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
251e1051a39Sopenharmony_ci	mtctr	r8
252e1051a39Sopenharmony_ci	b	Lenc_entry
253e1051a39Sopenharmony_ci
254e1051a39Sopenharmony_ci.align 4
255e1051a39Sopenharmony_ciLenc_loop:
256e1051a39Sopenharmony_ci	# middle of middle round
257e1051a39Sopenharmony_ci	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
258e1051a39Sopenharmony_ci	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
259e1051a39Sopenharmony_ci	addi	r11, r11, 16
260e1051a39Sopenharmony_ci	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
261e1051a39Sopenharmony_ci	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
262e1051a39Sopenharmony_ci	andi.	r11, r11, 0x30		# and		\$0x30, %r11	# ... mod 4
263e1051a39Sopenharmony_ci	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
264e1051a39Sopenharmony_ci	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
265e1051a39Sopenharmony_ci	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
266e1051a39Sopenharmony_ci	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
267e1051a39Sopenharmony_ci	addi	r10, r11, 0x40
268e1051a39Sopenharmony_ci	vperm	v3, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
269e1051a39Sopenharmony_ci	vxor	v2, v2, v5		# vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
270e1051a39Sopenharmony_ci	vperm	v0, v0, v7, v4		# vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
271e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
272e1051a39Sopenharmony_ci	vperm	v4, v3, v7, v1		# vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
273e1051a39Sopenharmony_ci	vxor	v0, v0, v3		# vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
274e1051a39Sopenharmony_ci	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
275e1051a39Sopenharmony_ci
276e1051a39Sopenharmony_ciLenc_entry:
277e1051a39Sopenharmony_ci	# top of round
278e1051a39Sopenharmony_ci	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
279e1051a39Sopenharmony_ci	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
280e1051a39Sopenharmony_ci	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
281e1051a39Sopenharmony_ci	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
282e1051a39Sopenharmony_ci	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
283e1051a39Sopenharmony_ci	vand	v0, v0, v9
284e1051a39Sopenharmony_ci	vxor	v3, v3, v5		# vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
285e1051a39Sopenharmony_ci	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
286e1051a39Sopenharmony_ci	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
287e1051a39Sopenharmony_ci	vmr	v5, v6
288e1051a39Sopenharmony_ci	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm5
289e1051a39Sopenharmony_ci	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
290e1051a39Sopenharmony_ci	addi	r9, r9, 16
291e1051a39Sopenharmony_ci	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
292e1051a39Sopenharmony_ci	?vperm	v5, v5, v6, $keyperm	# align round key
293e1051a39Sopenharmony_ci	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
294e1051a39Sopenharmony_ci	bdnz	Lenc_loop
295e1051a39Sopenharmony_ci
296e1051a39Sopenharmony_ci	# middle of last round
297e1051a39Sopenharmony_ci	addi	r10, r11, 0x80
298e1051a39Sopenharmony_ci					# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
299e1051a39Sopenharmony_ci					# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
300e1051a39Sopenharmony_ci	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
301e1051a39Sopenharmony_ci	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
302e1051a39Sopenharmony_ci	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
303e1051a39Sopenharmony_ci	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
304e1051a39Sopenharmony_ci	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
305e1051a39Sopenharmony_ci	vperm	v0, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0
306e1051a39Sopenharmony_ci	blr
307e1051a39Sopenharmony_ci	.long	0
308e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
309e1051a39Sopenharmony_ci
310e1051a39Sopenharmony_ci.globl	.vpaes_encrypt
311e1051a39Sopenharmony_ci.align	5
312e1051a39Sopenharmony_ci.vpaes_encrypt:
313e1051a39Sopenharmony_ci	$STU	$sp,-$FRAME($sp)
314e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
315e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
316e1051a39Sopenharmony_ci	mflr	r6
317e1051a39Sopenharmony_ci	mfspr	r7, 256			# save vrsave
318e1051a39Sopenharmony_ci	stvx	v20,r10,$sp
319e1051a39Sopenharmony_ci	addi	r10,r10,32
320e1051a39Sopenharmony_ci	stvx	v21,r11,$sp
321e1051a39Sopenharmony_ci	addi	r11,r11,32
322e1051a39Sopenharmony_ci	stvx	v22,r10,$sp
323e1051a39Sopenharmony_ci	addi	r10,r10,32
324e1051a39Sopenharmony_ci	stvx	v23,r11,$sp
325e1051a39Sopenharmony_ci	addi	r11,r11,32
326e1051a39Sopenharmony_ci	stvx	v24,r10,$sp
327e1051a39Sopenharmony_ci	addi	r10,r10,32
328e1051a39Sopenharmony_ci	stvx	v25,r11,$sp
329e1051a39Sopenharmony_ci	addi	r11,r11,32
330e1051a39Sopenharmony_ci	stvx	v26,r10,$sp
331e1051a39Sopenharmony_ci	addi	r10,r10,32
332e1051a39Sopenharmony_ci	stvx	v27,r11,$sp
333e1051a39Sopenharmony_ci	addi	r11,r11,32
334e1051a39Sopenharmony_ci	stvx	v28,r10,$sp
335e1051a39Sopenharmony_ci	addi	r10,r10,32
336e1051a39Sopenharmony_ci	stvx	v29,r11,$sp
337e1051a39Sopenharmony_ci	addi	r11,r11,32
338e1051a39Sopenharmony_ci	stvx	v30,r10,$sp
339e1051a39Sopenharmony_ci	stvx	v31,r11,$sp
340e1051a39Sopenharmony_ci	stw	r7,`$FRAME-4`($sp)	# save vrsave
341e1051a39Sopenharmony_ci	li	r0, -1
342e1051a39Sopenharmony_ci	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
343e1051a39Sopenharmony_ci	mtspr	256, r0			# preserve all AltiVec registers
344e1051a39Sopenharmony_ci
345e1051a39Sopenharmony_ci	bl	_vpaes_encrypt_preheat
346e1051a39Sopenharmony_ci
347e1051a39Sopenharmony_ci	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
348e1051a39Sopenharmony_ci	lvx	v0, 0, $inp
349e1051a39Sopenharmony_ci	addi	$inp, $inp, 15		# 15 is not a typo
350e1051a39Sopenharmony_ci	 ?lvsr	$outperm, 0, $out
351e1051a39Sopenharmony_ci	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
352e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp	# redundant in aligned case
353e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
354e1051a39Sopenharmony_ci
355e1051a39Sopenharmony_ci	bl	_vpaes_encrypt_core
356e1051a39Sopenharmony_ci
357e1051a39Sopenharmony_ci	andi.	r8, $out, 15
358e1051a39Sopenharmony_ci	li	r9, 16
359e1051a39Sopenharmony_ci	beq	Lenc_out_aligned
360e1051a39Sopenharmony_ci
361e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
362e1051a39Sopenharmony_ci	mtctr	r9
363e1051a39Sopenharmony_ciLenc_out_unaligned:
364e1051a39Sopenharmony_ci	stvebx	v0, 0, $out
365e1051a39Sopenharmony_ci	addi	$out, $out, 1
366e1051a39Sopenharmony_ci	bdnz	Lenc_out_unaligned
367e1051a39Sopenharmony_ci	b	Lenc_done
368e1051a39Sopenharmony_ci
369e1051a39Sopenharmony_ci.align	4
370e1051a39Sopenharmony_ciLenc_out_aligned:
371e1051a39Sopenharmony_ci	stvx	v0, 0, $out
372e1051a39Sopenharmony_ciLenc_done:
373e1051a39Sopenharmony_ci
374e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
375e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
376e1051a39Sopenharmony_ci	mtlr	r6
377e1051a39Sopenharmony_ci	mtspr	256, r7			# restore vrsave
378e1051a39Sopenharmony_ci	lvx	v20,r10,$sp
379e1051a39Sopenharmony_ci	addi	r10,r10,32
380e1051a39Sopenharmony_ci	lvx	v21,r11,$sp
381e1051a39Sopenharmony_ci	addi	r11,r11,32
382e1051a39Sopenharmony_ci	lvx	v22,r10,$sp
383e1051a39Sopenharmony_ci	addi	r10,r10,32
384e1051a39Sopenharmony_ci	lvx	v23,r11,$sp
385e1051a39Sopenharmony_ci	addi	r11,r11,32
386e1051a39Sopenharmony_ci	lvx	v24,r10,$sp
387e1051a39Sopenharmony_ci	addi	r10,r10,32
388e1051a39Sopenharmony_ci	lvx	v25,r11,$sp
389e1051a39Sopenharmony_ci	addi	r11,r11,32
390e1051a39Sopenharmony_ci	lvx	v26,r10,$sp
391e1051a39Sopenharmony_ci	addi	r10,r10,32
392e1051a39Sopenharmony_ci	lvx	v27,r11,$sp
393e1051a39Sopenharmony_ci	addi	r11,r11,32
394e1051a39Sopenharmony_ci	lvx	v28,r10,$sp
395e1051a39Sopenharmony_ci	addi	r10,r10,32
396e1051a39Sopenharmony_ci	lvx	v29,r11,$sp
397e1051a39Sopenharmony_ci	addi	r11,r11,32
398e1051a39Sopenharmony_ci	lvx	v30,r10,$sp
399e1051a39Sopenharmony_ci	lvx	v31,r11,$sp
400e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
401e1051a39Sopenharmony_ci	blr
402e1051a39Sopenharmony_ci	.long	0
403e1051a39Sopenharmony_ci	.byte	0,12,0x04,1,0x80,0,3,0
404e1051a39Sopenharmony_ci	.long	0
405e1051a39Sopenharmony_ci.size	.vpaes_encrypt,.-.vpaes_encrypt
406e1051a39Sopenharmony_ci
407e1051a39Sopenharmony_ci.align	4
408e1051a39Sopenharmony_ci_vpaes_decrypt_preheat:
409e1051a39Sopenharmony_ci	mflr	r8
410e1051a39Sopenharmony_ci	bl	Lconsts
411e1051a39Sopenharmony_ci	mtlr	r8
412e1051a39Sopenharmony_ci	li	r11, 0xc0		# Lk_inv
413e1051a39Sopenharmony_ci	li	r10, 0xd0
414e1051a39Sopenharmony_ci	li	r9,  0x160		# Ldipt
415e1051a39Sopenharmony_ci	li	r8,  0x170
416e1051a39Sopenharmony_ci	vxor	v7, v7, v7		# 0x00..00
417e1051a39Sopenharmony_ci	vspltisb	v8,4		# 0x04..04
418e1051a39Sopenharmony_ci	vspltisb	v9,0x0f		# 0x0f..0f
419e1051a39Sopenharmony_ci	lvx	$invlo, r12, r11
420e1051a39Sopenharmony_ci	li	r11, 0x180
421e1051a39Sopenharmony_ci	lvx	$invhi, r12, r10
422e1051a39Sopenharmony_ci	li	r10, 0x190
423e1051a39Sopenharmony_ci	lvx	$iptlo, r12, r9
424e1051a39Sopenharmony_ci	li	r9,  0x1a0
425e1051a39Sopenharmony_ci	lvx	$ipthi, r12, r8
426e1051a39Sopenharmony_ci	li	r8,  0x1b0
427e1051a39Sopenharmony_ci	lvx	$sbou, r12, r11
428e1051a39Sopenharmony_ci	li	r11, 0x1c0
429e1051a39Sopenharmony_ci	lvx	$sbot, r12, r10
430e1051a39Sopenharmony_ci	li	r10, 0x1d0
431e1051a39Sopenharmony_ci	lvx	$sb9u, r12, r9
432e1051a39Sopenharmony_ci	li	r9,  0x1e0
433e1051a39Sopenharmony_ci	lvx	$sb9t, r12, r8
434e1051a39Sopenharmony_ci	li	r8,  0x1f0
435e1051a39Sopenharmony_ci	lvx	$sbdu, r12, r11
436e1051a39Sopenharmony_ci	li	r11, 0x200
437e1051a39Sopenharmony_ci	lvx	$sbdt, r12, r10
438e1051a39Sopenharmony_ci	li	r10, 0x210
439e1051a39Sopenharmony_ci	lvx	$sbbu, r12, r9
440e1051a39Sopenharmony_ci	lvx	$sbbt, r12, r8
441e1051a39Sopenharmony_ci	lvx	$sbeu, r12, r11
442e1051a39Sopenharmony_ci	lvx	$sbet, r12, r10
443e1051a39Sopenharmony_ci	blr
444e1051a39Sopenharmony_ci	.long	0
445e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
446e1051a39Sopenharmony_ci
447e1051a39Sopenharmony_ci##
448e1051a39Sopenharmony_ci##  Decryption core
449e1051a39Sopenharmony_ci##
450e1051a39Sopenharmony_ci##  Same API as encryption core.
451e1051a39Sopenharmony_ci##
452e1051a39Sopenharmony_ci.align	4
453e1051a39Sopenharmony_ci_vpaes_decrypt_core:
454e1051a39Sopenharmony_ci	lwz	r8, 240($key)		# pull rounds
455e1051a39Sopenharmony_ci	li	r9, 16
456e1051a39Sopenharmony_ci	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm4		# round0 key
457e1051a39Sopenharmony_ci	li	r11, 0x30
458e1051a39Sopenharmony_ci	lvx	v6, r9, $key
459e1051a39Sopenharmony_ci	addi	r9, r9, 16
460e1051a39Sopenharmony_ci	?vperm	v5, v5, v6, $keyperm	# align round key
461e1051a39Sopenharmony_ci	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
462e1051a39Sopenharmony_ci	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
463e1051a39Sopenharmony_ci	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
464e1051a39Sopenharmony_ci	vxor	v0, v0, v5		# vpxor	%xmm4,	%xmm2,	%xmm2
465e1051a39Sopenharmony_ci	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
466e1051a39Sopenharmony_ci	mtctr	r8
467e1051a39Sopenharmony_ci	b	Ldec_entry
468e1051a39Sopenharmony_ci
469e1051a39Sopenharmony_ci.align 4
470e1051a39Sopenharmony_ciLdec_loop:
471e1051a39Sopenharmony_ci#
472e1051a39Sopenharmony_ci#  Inverse mix columns
473e1051a39Sopenharmony_ci#
474e1051a39Sopenharmony_ci	lvx	v0, r12, r11		# v5 and v0 are flipped
475e1051a39Sopenharmony_ci					# vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
476e1051a39Sopenharmony_ci					# vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
477e1051a39Sopenharmony_ci	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
478e1051a39Sopenharmony_ci	subi	r11, r11, 16
479e1051a39Sopenharmony_ci	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
480e1051a39Sopenharmony_ci	andi.	r11, r11, 0x30
481e1051a39Sopenharmony_ci	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0
482e1051a39Sopenharmony_ci					# vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
483e1051a39Sopenharmony_ci	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
484e1051a39Sopenharmony_ci					# vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
485e1051a39Sopenharmony_ci
486e1051a39Sopenharmony_ci	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
487e1051a39Sopenharmony_ci	vperm 	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
488e1051a39Sopenharmony_ci	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
489e1051a39Sopenharmony_ci	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
490e1051a39Sopenharmony_ci					# vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
491e1051a39Sopenharmony_ci	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
492e1051a39Sopenharmony_ci					# vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
493e1051a39Sopenharmony_ci
494e1051a39Sopenharmony_ci	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
495e1051a39Sopenharmony_ci	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
496e1051a39Sopenharmony_ci	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
497e1051a39Sopenharmony_ci	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
498e1051a39Sopenharmony_ci					# vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
499e1051a39Sopenharmony_ci	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
500e1051a39Sopenharmony_ci					# vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
501e1051a39Sopenharmony_ci
502e1051a39Sopenharmony_ci	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
503e1051a39Sopenharmony_ci	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
504e1051a39Sopenharmony_ci	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
505e1051a39Sopenharmony_ci	vxor	v0, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
506e1051a39Sopenharmony_ci	vxor	v0, v0, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
507e1051a39Sopenharmony_ci
508e1051a39Sopenharmony_ciLdec_entry:
509e1051a39Sopenharmony_ci	# top of round
510e1051a39Sopenharmony_ci	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
511e1051a39Sopenharmony_ci	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
512e1051a39Sopenharmony_ci	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
513e1051a39Sopenharmony_ci	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
514e1051a39Sopenharmony_ci	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
515e1051a39Sopenharmony_ci	vand	v0, v0, v9
516e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
517e1051a39Sopenharmony_ci	vxor	v4, v4, v2		# vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
518e1051a39Sopenharmony_ci	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
519e1051a39Sopenharmony_ci	vmr	v5, v6
520e1051a39Sopenharmony_ci	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm0
521e1051a39Sopenharmony_ci	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
522e1051a39Sopenharmony_ci	addi	r9, r9, 16
523e1051a39Sopenharmony_ci	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
524e1051a39Sopenharmony_ci	?vperm	v5, v5, v6, $keyperm	# align round key
525e1051a39Sopenharmony_ci	vxor	v3, v3, v1		# vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
526e1051a39Sopenharmony_ci	bdnz	Ldec_loop
527e1051a39Sopenharmony_ci
528e1051a39Sopenharmony_ci	# middle of last round
529e1051a39Sopenharmony_ci	addi	r10, r11, 0x80
530e1051a39Sopenharmony_ci					# vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
531e1051a39Sopenharmony_ci	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
532e1051a39Sopenharmony_ci					# vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
533e1051a39Sopenharmony_ci	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
534e1051a39Sopenharmony_ci	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
535e1051a39Sopenharmony_ci	vxor	v4, v4, v5		# vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
536e1051a39Sopenharmony_ci	vxor	v0, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
537e1051a39Sopenharmony_ci	vperm	v0, v0, v7, v2		# vpshufb	%xmm2,	%xmm0,	%xmm0
538e1051a39Sopenharmony_ci	blr
539e1051a39Sopenharmony_ci	.long	0
540e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
541e1051a39Sopenharmony_ci
##
##  .vpaes_decrypt
##
##  Decrypts a single 16-byte block.  Sets up a stack frame, saves
##  v20-v31, vrsave and LR, loads the block from the input pointer
##  (no alignment assumed), calls _vpaes_decrypt_core and stores the
##  result through the output pointer (no alignment assumed).
##
542e1051a39Sopenharmony_ci.globl	.vpaes_decrypt
543e1051a39Sopenharmony_ci.align	5
544e1051a39Sopenharmony_ci.vpaes_decrypt:
545e1051a39Sopenharmony_ci	$STU	$sp,-$FRAME($sp)
	# r10/r11 walk the vector save area alternately, each in 32-byte
	# steps, so v20..v31 land in consecutive 16-byte slots (stvx ignores
	# the low four bits of the effective address).
546e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
547e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
548e1051a39Sopenharmony_ci	mflr	r6
549e1051a39Sopenharmony_ci	mfspr	r7, 256			# save vrsave
550e1051a39Sopenharmony_ci	stvx	v20,r10,$sp
551e1051a39Sopenharmony_ci	addi	r10,r10,32
552e1051a39Sopenharmony_ci	stvx	v21,r11,$sp
553e1051a39Sopenharmony_ci	addi	r11,r11,32
554e1051a39Sopenharmony_ci	stvx	v22,r10,$sp
555e1051a39Sopenharmony_ci	addi	r10,r10,32
556e1051a39Sopenharmony_ci	stvx	v23,r11,$sp
557e1051a39Sopenharmony_ci	addi	r11,r11,32
558e1051a39Sopenharmony_ci	stvx	v24,r10,$sp
559e1051a39Sopenharmony_ci	addi	r10,r10,32
560e1051a39Sopenharmony_ci	stvx	v25,r11,$sp
561e1051a39Sopenharmony_ci	addi	r11,r11,32
562e1051a39Sopenharmony_ci	stvx	v26,r10,$sp
563e1051a39Sopenharmony_ci	addi	r10,r10,32
564e1051a39Sopenharmony_ci	stvx	v27,r11,$sp
565e1051a39Sopenharmony_ci	addi	r11,r11,32
566e1051a39Sopenharmony_ci	stvx	v28,r10,$sp
567e1051a39Sopenharmony_ci	addi	r10,r10,32
568e1051a39Sopenharmony_ci	stvx	v29,r11,$sp
569e1051a39Sopenharmony_ci	addi	r11,r11,32
570e1051a39Sopenharmony_ci	stvx	v30,r10,$sp
571e1051a39Sopenharmony_ci	stvx	v31,r11,$sp
572e1051a39Sopenharmony_ci	stw	r7,`$FRAME-4`($sp)	# save vrsave
573e1051a39Sopenharmony_ci	li	r0, -1
574e1051a39Sopenharmony_ci	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
575e1051a39Sopenharmony_ci	mtspr	256, r0			# preserve all AltiVec registers
576e1051a39Sopenharmony_ci
577e1051a39Sopenharmony_ci	bl	_vpaes_decrypt_preheat
578e1051a39Sopenharmony_ci
	# Unaligned load of the input block: fetch the quadword holding the
	# first byte and the one holding the last byte (+15), then merge the
	# two with the permute vector derived from the input address.
579e1051a39Sopenharmony_ci	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
580e1051a39Sopenharmony_ci	lvx	v0, 0, $inp
581e1051a39Sopenharmony_ci	addi	$inp, $inp, 15		# 15 is not a typo
582e1051a39Sopenharmony_ci	 ?lvsr	$outperm, 0, $out
583e1051a39Sopenharmony_ci	?lvsl	$keyperm, 0, $key
584e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp	# redundant in aligned case
585e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
586e1051a39Sopenharmony_ci
587e1051a39Sopenharmony_ci	bl	_vpaes_decrypt_core
588e1051a39Sopenharmony_ci
	# Store the result in v0: one stvx when the output address is
	# 16-byte aligned, otherwise sixteen single-byte stores.
589e1051a39Sopenharmony_ci	andi.	r8, $out, 15
590e1051a39Sopenharmony_ci	li	r9, 16
591e1051a39Sopenharmony_ci	beq	Ldec_out_aligned
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
594e1051a39Sopenharmony_ci	mtctr	r9
595e1051a39Sopenharmony_ciLdec_out_unaligned:
	# stvebx stores the single byte of v0 selected by the low address
	# bits; ctr (16) counts the bytes still to be written.
596e1051a39Sopenharmony_ci	stvebx	v0, 0, $out
597e1051a39Sopenharmony_ci	addi	$out, $out, 1
598e1051a39Sopenharmony_ci	bdnz	Ldec_out_unaligned
599e1051a39Sopenharmony_ci	b	Ldec_done
600e1051a39Sopenharmony_ci
601e1051a39Sopenharmony_ci.align	4
602e1051a39Sopenharmony_ciLdec_out_aligned:
603e1051a39Sopenharmony_ci	stvx	v0, 0, $out
604e1051a39Sopenharmony_ciLdec_done:
605e1051a39Sopenharmony_ci
	# Epilogue: LR and vrsave are restored from r6/r7 (not reloaded from
	# the frame), then v20-v31 are reloaded in the order they were saved.
606e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
607e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
608e1051a39Sopenharmony_ci	mtlr	r6
609e1051a39Sopenharmony_ci	mtspr	256, r7			# restore vrsave
610e1051a39Sopenharmony_ci	lvx	v20,r10,$sp
611e1051a39Sopenharmony_ci	addi	r10,r10,32
612e1051a39Sopenharmony_ci	lvx	v21,r11,$sp
613e1051a39Sopenharmony_ci	addi	r11,r11,32
614e1051a39Sopenharmony_ci	lvx	v22,r10,$sp
615e1051a39Sopenharmony_ci	addi	r10,r10,32
616e1051a39Sopenharmony_ci	lvx	v23,r11,$sp
617e1051a39Sopenharmony_ci	addi	r11,r11,32
618e1051a39Sopenharmony_ci	lvx	v24,r10,$sp
619e1051a39Sopenharmony_ci	addi	r10,r10,32
620e1051a39Sopenharmony_ci	lvx	v25,r11,$sp
621e1051a39Sopenharmony_ci	addi	r11,r11,32
622e1051a39Sopenharmony_ci	lvx	v26,r10,$sp
623e1051a39Sopenharmony_ci	addi	r10,r10,32
624e1051a39Sopenharmony_ci	lvx	v27,r11,$sp
625e1051a39Sopenharmony_ci	addi	r11,r11,32
626e1051a39Sopenharmony_ci	lvx	v28,r10,$sp
627e1051a39Sopenharmony_ci	addi	r10,r10,32
628e1051a39Sopenharmony_ci	lvx	v29,r11,$sp
629e1051a39Sopenharmony_ci	addi	r11,r11,32
630e1051a39Sopenharmony_ci	lvx	v30,r10,$sp
631e1051a39Sopenharmony_ci	lvx	v31,r11,$sp
632e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
633e1051a39Sopenharmony_ci	blr
634e1051a39Sopenharmony_ci	.long	0
635e1051a39Sopenharmony_ci	.byte	0,12,0x04,1,0x80,0,3,0
636e1051a39Sopenharmony_ci	.long	0
637e1051a39Sopenharmony_ci.size	.vpaes_decrypt,.-.vpaes_decrypt
638e1051a39Sopenharmony_ci
##
##  .vpaes_cbc_encrypt
##
##  CBC-mode bulk entry point.  Register usage on entry, as read by the
##  code below: r5 = length in bytes, r6 = key pointer, r7 = iv pointer,
##  r8 = direction (zero selects the decrypt path).  Input and output
##  addresses are used through the inp/out register aliases.
##
##  Returns at once when the length is below 16.  Otherwise the length
##  is rounded down to a multiple of 16 and that many bytes are
##  processed; the final chaining value is written back to the iv
##  buffer.  Neither input, output nor iv needs 16-byte alignment.
##
639e1051a39Sopenharmony_ci.globl	.vpaes_cbc_encrypt
640e1051a39Sopenharmony_ci.align	5
641e1051a39Sopenharmony_ci.vpaes_cbc_encrypt:
642e1051a39Sopenharmony_ci	${UCMP}i r5,16
643e1051a39Sopenharmony_ci	bltlr-
644e1051a39Sopenharmony_ci
	# Prologue: frame is FRAME plus two extra slots for r30/r31.  v20-v31
	# go to consecutive 16-byte slots via the alternating r10/r11 offsets
	# (stvx ignores the low four bits of the effective address).
645e1051a39Sopenharmony_ci	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
646e1051a39Sopenharmony_ci	mflr	r0
647e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
648e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
649e1051a39Sopenharmony_ci	mfspr	r12, 256
650e1051a39Sopenharmony_ci	stvx	v20,r10,$sp
651e1051a39Sopenharmony_ci	addi	r10,r10,32
652e1051a39Sopenharmony_ci	stvx	v21,r11,$sp
653e1051a39Sopenharmony_ci	addi	r11,r11,32
654e1051a39Sopenharmony_ci	stvx	v22,r10,$sp
655e1051a39Sopenharmony_ci	addi	r10,r10,32
656e1051a39Sopenharmony_ci	stvx	v23,r11,$sp
657e1051a39Sopenharmony_ci	addi	r11,r11,32
658e1051a39Sopenharmony_ci	stvx	v24,r10,$sp
659e1051a39Sopenharmony_ci	addi	r10,r10,32
660e1051a39Sopenharmony_ci	stvx	v25,r11,$sp
661e1051a39Sopenharmony_ci	addi	r11,r11,32
662e1051a39Sopenharmony_ci	stvx	v26,r10,$sp
663e1051a39Sopenharmony_ci	addi	r10,r10,32
664e1051a39Sopenharmony_ci	stvx	v27,r11,$sp
665e1051a39Sopenharmony_ci	addi	r11,r11,32
666e1051a39Sopenharmony_ci	stvx	v28,r10,$sp
667e1051a39Sopenharmony_ci	addi	r10,r10,32
668e1051a39Sopenharmony_ci	stvx	v29,r11,$sp
669e1051a39Sopenharmony_ci	addi	r11,r11,32
670e1051a39Sopenharmony_ci	stvx	v30,r10,$sp
671e1051a39Sopenharmony_ci	stvx	v31,r11,$sp
672e1051a39Sopenharmony_ci	stw	r12,`$FRAME-4`($sp)	# save vrsave
673e1051a39Sopenharmony_ci	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
674e1051a39Sopenharmony_ci	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
675e1051a39Sopenharmony_ci	li	r9, -16
676e1051a39Sopenharmony_ci	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
677e1051a39Sopenharmony_ci
	# Shuffle arguments into the registers the core routines use:
	# r30 = byte count (multiple of 16), r5 = key, r31 = iv pointer,
	# r7 = saved vrsave, cr1 = "output is 16-byte aligned" flag.
678e1051a39Sopenharmony_ci	and	r30, r5, r9		# copy length&-16
679e1051a39Sopenharmony_ci	andi.	r9, $out, 15		# is $out aligned?
680e1051a39Sopenharmony_ci	mr	r5, r6			# copy pointer to key
681e1051a39Sopenharmony_ci	mr	r31, r7			# copy pointer to iv
682e1051a39Sopenharmony_ci	li	r6, -1
683e1051a39Sopenharmony_ci	mcrf	cr1, cr0		# put aside $out alignment flag
684e1051a39Sopenharmony_ci	mr	r7, r12			# copy vrsave
685e1051a39Sopenharmony_ci	mtspr	256, r6			# preserve all AltiVec registers
686e1051a39Sopenharmony_ci
	# v24 carries the chaining value (iv) for the whole call.
687e1051a39Sopenharmony_ci	lvx	v24, 0, r31		# load [potentially unaligned] iv
688e1051a39Sopenharmony_ci	li	r9, 15
689e1051a39Sopenharmony_ci	?lvsl	$inpperm, 0, r31
690e1051a39Sopenharmony_ci	lvx	v25, r9, r31
691e1051a39Sopenharmony_ci	?vperm	v24, v24, v25, $inpperm
692e1051a39Sopenharmony_ci
	# The cmpwi result in cr0 is consumed by the beq to Lcbc_decrypt
	# below; nothing in between alters cr0.
693e1051a39Sopenharmony_ci	cmpwi	r8, 0			# test direction
694e1051a39Sopenharmony_ci	neg	r8, $inp		# prepare for unaligned access
695e1051a39Sopenharmony_ci	 vxor	v7, v7, v7
696e1051a39Sopenharmony_ci	?lvsl	$keyperm, 0, $key
697e1051a39Sopenharmony_ci	 ?lvsr	$outperm, 0, $out
698e1051a39Sopenharmony_ci	?lvsr	$inpperm, 0, r8		# -$inp
699e1051a39Sopenharmony_ci	 vnor	$outmask, v7, v7	# 0xff..ff
700e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp
701e1051a39Sopenharmony_ci	 ?vperm	$outmask, v7, $outmask, $outperm
702e1051a39Sopenharmony_ci	addi	$inp, $inp, 15		# 15 is not a typo
703e1051a39Sopenharmony_ci
704e1051a39Sopenharmony_ci	beq	Lcbc_decrypt
705e1051a39Sopenharmony_ci
	# ---- encrypt path ----
706e1051a39Sopenharmony_ci	bl	_vpaes_encrypt_preheat
707e1051a39Sopenharmony_ci	li	r0, 16
708e1051a39Sopenharmony_ci
709e1051a39Sopenharmony_ci	beq	cr1, Lcbc_enc_loop	# $out is aligned
710e1051a39Sopenharmony_ci
	# Unaligned output: the first block is handled separately so that
	# its leading bytes can be written one at a time.
711e1051a39Sopenharmony_ci	vmr	v0, $inptail
712e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp
713e1051a39Sopenharmony_ci	addi	$inp, $inp, 16
714e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
715e1051a39Sopenharmony_ci	vxor	v0, v0, v24		# ^= iv
716e1051a39Sopenharmony_ci
717e1051a39Sopenharmony_ci	bl	_vpaes_encrypt_core
718e1051a39Sopenharmony_ci
	# r8 = offset of the output address inside its quadword,
	# r9 = that quadword's aligned base address.
719e1051a39Sopenharmony_ci	andi.	r8, $out, 15
720e1051a39Sopenharmony_ci	vmr	v24, v0			# put aside iv
721e1051a39Sopenharmony_ci	sub	r9, $out, r8
722e1051a39Sopenharmony_ci	vperm	$outhead, v0, v0, $outperm	# rotate right/left
723e1051a39Sopenharmony_ci
724e1051a39Sopenharmony_ciLcbc_enc_head:
	# Byte stores at offsets r8..15 of the first quadword; the rest of
	# the rotated block stays in outhead for the next aligned store.
725e1051a39Sopenharmony_ci	stvebx	$outhead, r8, r9
726e1051a39Sopenharmony_ci	cmpwi	r8, 15
727e1051a39Sopenharmony_ci	addi	r8, r8, 1
728e1051a39Sopenharmony_ci	bne	Lcbc_enc_head
729e1051a39Sopenharmony_ci
730e1051a39Sopenharmony_ci	sub.	r30, r30, r0		# len -= 16
731e1051a39Sopenharmony_ci	addi	$out, $out, 16
732e1051a39Sopenharmony_ci	beq	Lcbc_unaligned_done
733e1051a39Sopenharmony_ci
734e1051a39Sopenharmony_ciLcbc_enc_loop:
	# Per block: merge the next 16 input bytes, xor with the chaining
	# value, encrypt, keep the ciphertext as the new chaining value, and
	# store an aligned quadword built from the previous block's leftover
	# (outhead) and the current rotated block, selected by outmask.
735e1051a39Sopenharmony_ci	vmr	v0, $inptail
736e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp
737e1051a39Sopenharmony_ci	addi	$inp, $inp, 16
738e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
739e1051a39Sopenharmony_ci	vxor	v0, v0, v24		# ^= iv
740e1051a39Sopenharmony_ci
741e1051a39Sopenharmony_ci	bl	_vpaes_encrypt_core
742e1051a39Sopenharmony_ci
743e1051a39Sopenharmony_ci	vmr	v24, v0			# put aside iv
744e1051a39Sopenharmony_ci	sub.	r30, r30, r0		# len -= 16
745e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
746e1051a39Sopenharmony_ci	vsel	v1, $outhead, v0, $outmask
747e1051a39Sopenharmony_ci	vmr	$outhead, v0
748e1051a39Sopenharmony_ci	stvx	v1, 0, $out
749e1051a39Sopenharmony_ci	addi	$out, $out, 16
750e1051a39Sopenharmony_ci	bne	Lcbc_enc_loop
751e1051a39Sopenharmony_ci
752e1051a39Sopenharmony_ci	b	Lcbc_done
753e1051a39Sopenharmony_ci
	# ---- decrypt path ----
	# Same structure as the encrypt path, except that the ciphertext
	# block is saved in v25 before decryption and becomes the chaining
	# value (v24) after the xor.
754e1051a39Sopenharmony_ci.align	5
755e1051a39Sopenharmony_ciLcbc_decrypt:
756e1051a39Sopenharmony_ci	bl	_vpaes_decrypt_preheat
757e1051a39Sopenharmony_ci	li	r0, 16
758e1051a39Sopenharmony_ci
759e1051a39Sopenharmony_ci	beq	cr1, Lcbc_dec_loop	# $out is aligned
760e1051a39Sopenharmony_ci
761e1051a39Sopenharmony_ci	vmr	v0, $inptail
762e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp
763e1051a39Sopenharmony_ci	addi	$inp, $inp, 16
764e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
765e1051a39Sopenharmony_ci	vmr	v25, v0			# put aside input
766e1051a39Sopenharmony_ci
767e1051a39Sopenharmony_ci	bl	_vpaes_decrypt_core
768e1051a39Sopenharmony_ci
769e1051a39Sopenharmony_ci	andi.	r8, $out, 15
770e1051a39Sopenharmony_ci	vxor	v0, v0, v24		# ^= iv
771e1051a39Sopenharmony_ci	vmr	v24, v25
772e1051a39Sopenharmony_ci	sub	r9, $out, r8
773e1051a39Sopenharmony_ci	vperm	$outhead, v0, v0, $outperm	# rotate right/left
774e1051a39Sopenharmony_ci
775e1051a39Sopenharmony_ciLcbc_dec_head:
	# Byte stores at offsets r8..15 of the first output quadword.
776e1051a39Sopenharmony_ci	stvebx	$outhead, r8, r9
777e1051a39Sopenharmony_ci	cmpwi	r8, 15
778e1051a39Sopenharmony_ci	addi	r8, r8, 1
779e1051a39Sopenharmony_ci	bne	Lcbc_dec_head
780e1051a39Sopenharmony_ci
781e1051a39Sopenharmony_ci	sub.	r30, r30, r0		# len -= 16
782e1051a39Sopenharmony_ci	addi	$out, $out, 16
783e1051a39Sopenharmony_ci	beq	Lcbc_unaligned_done
784e1051a39Sopenharmony_ci
785e1051a39Sopenharmony_ciLcbc_dec_loop:
786e1051a39Sopenharmony_ci	vmr	v0, $inptail
787e1051a39Sopenharmony_ci	lvx	$inptail, 0, $inp
788e1051a39Sopenharmony_ci	addi	$inp, $inp, 16
789e1051a39Sopenharmony_ci	?vperm	v0, v0, $inptail, $inpperm
790e1051a39Sopenharmony_ci	vmr	v25, v0			# put aside input
791e1051a39Sopenharmony_ci
792e1051a39Sopenharmony_ci	bl	_vpaes_decrypt_core
793e1051a39Sopenharmony_ci
794e1051a39Sopenharmony_ci	vxor	v0, v0, v24		# ^= iv
795e1051a39Sopenharmony_ci	vmr	v24, v25
796e1051a39Sopenharmony_ci	sub.	r30, r30, r0		# len -= 16
797e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
798e1051a39Sopenharmony_ci	vsel	v1, $outhead, v0, $outmask
799e1051a39Sopenharmony_ci	vmr	$outhead, v0
800e1051a39Sopenharmony_ci	stvx	v1, 0, $out
801e1051a39Sopenharmony_ci	addi	$out, $out, 16
802e1051a39Sopenharmony_ci	bne	Lcbc_dec_loop
803e1051a39Sopenharmony_ci
804e1051a39Sopenharmony_ciLcbc_done:
	# With aligned output every block was stored whole; nothing is
	# left over in outhead.
805e1051a39Sopenharmony_ci	beq	cr1, Lcbc_write_iv	# $out is aligned
806e1051a39Sopenharmony_ci
807e1051a39Sopenharmony_ciLcbc_unaligned_done:
	# Flush the leftover of the last block: byte stores at offsets
	# 0..(out&15)-1 of the final output quadword.
808e1051a39Sopenharmony_ci	andi.	r8, $out, 15
809e1051a39Sopenharmony_ci	sub	$out, $out, r8
810e1051a39Sopenharmony_ci	li	r9, 0
811e1051a39Sopenharmony_ciLcbc_tail:
812e1051a39Sopenharmony_ci	stvebx	$outhead, r9, $out
813e1051a39Sopenharmony_ci	addi	r9, r9, 1
814e1051a39Sopenharmony_ci	cmpw	r9, r8
815e1051a39Sopenharmony_ci	bne	Lcbc_tail
816e1051a39Sopenharmony_ci
817e1051a39Sopenharmony_ciLcbc_write_iv:
	# Rotate v24 to match the iv address and write it back as four
	# 32-bit word stores at offsets 0, 4, 8 and 12.
818e1051a39Sopenharmony_ci	neg	r8, r31			# write [potentially unaligned] iv
819e1051a39Sopenharmony_ci	li	r10, 4
820e1051a39Sopenharmony_ci	?lvsl	$outperm, 0, r8
821e1051a39Sopenharmony_ci	li	r11, 8
822e1051a39Sopenharmony_ci	li	r12, 12
823e1051a39Sopenharmony_ci	vperm	v24, v24, v24, $outperm	# rotate right/left
824e1051a39Sopenharmony_ci	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
825e1051a39Sopenharmony_ci	stvewx	v24, r10, r31
826e1051a39Sopenharmony_ci	stvewx	v24, r11, r31
827e1051a39Sopenharmony_ci	stvewx	v24, r12, r31
828e1051a39Sopenharmony_ci
	# Epilogue: vrsave comes back from r7; v20-v31, LR, r30 and r31 are
	# reloaded from the frame.
829e1051a39Sopenharmony_ci	mtspr	256, r7			# restore vrsave
830e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
831e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
832e1051a39Sopenharmony_ci	lvx	v20,r10,$sp
833e1051a39Sopenharmony_ci	addi	r10,r10,32
834e1051a39Sopenharmony_ci	lvx	v21,r11,$sp
835e1051a39Sopenharmony_ci	addi	r11,r11,32
836e1051a39Sopenharmony_ci	lvx	v22,r10,$sp
837e1051a39Sopenharmony_ci	addi	r10,r10,32
838e1051a39Sopenharmony_ci	lvx	v23,r11,$sp
839e1051a39Sopenharmony_ci	addi	r11,r11,32
840e1051a39Sopenharmony_ci	lvx	v24,r10,$sp
841e1051a39Sopenharmony_ci	addi	r10,r10,32
842e1051a39Sopenharmony_ci	lvx	v25,r11,$sp
843e1051a39Sopenharmony_ci	addi	r11,r11,32
844e1051a39Sopenharmony_ci	lvx	v26,r10,$sp
845e1051a39Sopenharmony_ci	addi	r10,r10,32
846e1051a39Sopenharmony_ci	lvx	v27,r11,$sp
847e1051a39Sopenharmony_ci	addi	r11,r11,32
848e1051a39Sopenharmony_ci	lvx	v28,r10,$sp
849e1051a39Sopenharmony_ci	addi	r10,r10,32
850e1051a39Sopenharmony_ci	lvx	v29,r11,$sp
851e1051a39Sopenharmony_ci	addi	r11,r11,32
852e1051a39Sopenharmony_ci	lvx	v30,r10,$sp
853e1051a39Sopenharmony_ci	lvx	v31,r11,$sp
	# NOTE(review): no branch to Lcbc_abort is visible in this section;
	# the short-length exit above uses bltlr before the frame exists.
854e1051a39Sopenharmony_ciLcbc_abort:
855e1051a39Sopenharmony_ci	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
856e1051a39Sopenharmony_ci	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
857e1051a39Sopenharmony_ci	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
858e1051a39Sopenharmony_ci	mtlr	r0
859e1051a39Sopenharmony_ci	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
860e1051a39Sopenharmony_ci	blr
861e1051a39Sopenharmony_ci	.long	0
862e1051a39Sopenharmony_ci	.byte	0,12,0x04,1,0x80,2,6,0
863e1051a39Sopenharmony_ci	.long	0
864e1051a39Sopenharmony_ci.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
865e1051a39Sopenharmony_ci___
866e1051a39Sopenharmony_ci}
867e1051a39Sopenharmony_ci{
# Register aliases for the key-schedule code generated below:
# inp, bits and out map to r3, r4 and r5 respectively.
868e1051a39Sopenharmony_cimy ($inp,$bits,$out)=map("r$_",(3..5));
# Condition-register field the schedule code tests (bne) to select its
# decrypting branches.
869e1051a39Sopenharmony_cimy $dir="cr1";
# Vector aliases: invlo/invhi = v10/v11, iptlo/ipthi = v12/v13, rcon = v24.
870e1051a39Sopenharmony_cimy ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
871e1051a39Sopenharmony_ci
872e1051a39Sopenharmony_ci$code.=<<___;
873e1051a39Sopenharmony_ci########################################################
874e1051a39Sopenharmony_ci##                                                    ##
875e1051a39Sopenharmony_ci##                  AES key schedule                  ##
876e1051a39Sopenharmony_ci##                                                    ##
877e1051a39Sopenharmony_ci########################################################
##
##  _vpaes_key_preheat
##
##  Loads the constant tables needed by the key schedule into v10-v26
##  and sets v8 = 0x04..04, v9 = 0.  Every table is addressed as
##  r12 + offset right after the call to Lconsts, so r12 is presumably
##  the table base that call leaves behind (Lconsts is defined outside
##  this section).  LR is preserved across that call in r8.
##  Clobbers r8-r11.
##
878e1051a39Sopenharmony_ci.align	4
879e1051a39Sopenharmony_ci_vpaes_key_preheat:
880e1051a39Sopenharmony_ci	mflr	r8
881e1051a39Sopenharmony_ci	bl	Lconsts
882e1051a39Sopenharmony_ci	mtlr	r8
	# r8-r11 are reused as rotating offset registers: each is reloaded
	# with the next table offset right after the load that consumed it.
883e1051a39Sopenharmony_ci	li	r11, 0xc0		# Lk_inv
884e1051a39Sopenharmony_ci	li	r10, 0xd0
885e1051a39Sopenharmony_ci	li	r9,  0xe0		# L_ipt
886e1051a39Sopenharmony_ci	li	r8,  0xf0
887e1051a39Sopenharmony_ci
888e1051a39Sopenharmony_ci	vspltisb	v8,4		# 0x04..04
889e1051a39Sopenharmony_ci	vxor	v9,v9,v9		# 0x00..00
890e1051a39Sopenharmony_ci	lvx	$invlo, r12, r11	# Lk_inv
891e1051a39Sopenharmony_ci	li	r11, 0x120
892e1051a39Sopenharmony_ci	lvx	$invhi, r12, r10
893e1051a39Sopenharmony_ci	li	r10, 0x130
894e1051a39Sopenharmony_ci	lvx	$iptlo, r12, r9		# Lk_ipt
895e1051a39Sopenharmony_ci	li	r9, 0x220
896e1051a39Sopenharmony_ci	lvx	$ipthi, r12, r8
897e1051a39Sopenharmony_ci	li	r8, 0x230
898e1051a39Sopenharmony_ci
899e1051a39Sopenharmony_ci	lvx	v14, r12, r11		# Lk_sb1
900e1051a39Sopenharmony_ci	li	r11, 0x240
901e1051a39Sopenharmony_ci	lvx	v15, r12, r10
902e1051a39Sopenharmony_ci	li	r10, 0x250
903e1051a39Sopenharmony_ci
904e1051a39Sopenharmony_ci	lvx	v16, r12, r9		# Lk_dksd
905e1051a39Sopenharmony_ci	li	r9, 0x260
906e1051a39Sopenharmony_ci	lvx	v17, r12, r8
907e1051a39Sopenharmony_ci	li	r8, 0x270
908e1051a39Sopenharmony_ci	lvx	v18, r12, r11		# Lk_dksb
909e1051a39Sopenharmony_ci	li	r11, 0x280
910e1051a39Sopenharmony_ci	lvx	v19, r12, r10
911e1051a39Sopenharmony_ci	li	r10, 0x290
912e1051a39Sopenharmony_ci	lvx	v20, r12, r9		# Lk_dkse
913e1051a39Sopenharmony_ci	li	r9, 0x2a0
914e1051a39Sopenharmony_ci	lvx	v21, r12, r8
915e1051a39Sopenharmony_ci	li	r8, 0x2b0
916e1051a39Sopenharmony_ci	lvx	v22, r12, r11		# Lk_dks9
917e1051a39Sopenharmony_ci	lvx	v23, r12, r10
918e1051a39Sopenharmony_ci
919e1051a39Sopenharmony_ci	lvx	v24, r12, r9		# Lk_rcon
920e1051a39Sopenharmony_ci	lvx	v25, 0, r12		# Lk_mc_forward[0]
921e1051a39Sopenharmony_ci	lvx	v26, r12, r8		# Lks63
922e1051a39Sopenharmony_ci	blr
923e1051a39Sopenharmony_ci	.long	0
924e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
925e1051a39Sopenharmony_ci
##
##  _vpaes_schedule_core
##
##  Common entry of the key schedule.  Reads the user key through the
##  inp register, writes round keys through the out register, and
##  dispatches on the bits register: above 192 takes the 256-bit path,
##  exactly 192 the 192-bit path, anything else falls into the 128-bit
##  path.  The dir condition field selects the layout: "not equal"
##  means decrypting.  LR is parked in r7 and restored at
##  Lschedule_mangle_done.
##
926e1051a39Sopenharmony_ci.align	4
927e1051a39Sopenharmony_ci_vpaes_schedule_core:
928e1051a39Sopenharmony_ci	mflr	r7
929e1051a39Sopenharmony_ci
930e1051a39Sopenharmony_ci	bl	_vpaes_key_preheat	# load the tables
931e1051a39Sopenharmony_ci
	# Unaligned load of the first 16 key bytes into v0.  v6 keeps the
	# second quadword for the 192/256-bit paths.  The two addi leave the
	# input register at key+23.
932e1051a39Sopenharmony_ci	#lvx	v0, 0, $inp		# vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
933e1051a39Sopenharmony_ci	neg	r8, $inp		# prepare for unaligned access
934e1051a39Sopenharmony_ci	lvx	v0, 0, $inp
935e1051a39Sopenharmony_ci	addi	$inp, $inp, 15		# 15 is not typo
936e1051a39Sopenharmony_ci	?lvsr	$inpperm, 0, r8		# -$inp
937e1051a39Sopenharmony_ci	lvx	v6, 0, $inp		# v6 serves as inptail
938e1051a39Sopenharmony_ci	addi	$inp, $inp, 8
939e1051a39Sopenharmony_ci	?vperm	v0, v0, v6, $inpperm
940e1051a39Sopenharmony_ci
	# v3 keeps the untransformed key, v7 the transformed one.
941e1051a39Sopenharmony_ci	# input transform
942e1051a39Sopenharmony_ci	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
943e1051a39Sopenharmony_ci	bl	_vpaes_schedule_transform
944e1051a39Sopenharmony_ci	vmr	v7, v0			# vmovdqa	%xmm0,	%xmm7
945e1051a39Sopenharmony_ci
946e1051a39Sopenharmony_ci	bne	$dir, Lschedule_am_decrypting
947e1051a39Sopenharmony_ci
948e1051a39Sopenharmony_ci	# encrypting, output zeroth round key after transform
949e1051a39Sopenharmony_ci	li	r8, 0x30		# mov	\$0x30,%r8d
950e1051a39Sopenharmony_ci	li	r9, 4
951e1051a39Sopenharmony_ci	li	r10, 8
952e1051a39Sopenharmony_ci	li	r11, 12
953e1051a39Sopenharmony_ci
954e1051a39Sopenharmony_ci	?lvsr	$outperm, 0, $out	# prepare for unaligned access
955e1051a39Sopenharmony_ci	vnor	$outmask, v9, v9	# 0xff..ff
956e1051a39Sopenharmony_ci	?vperm	$outmask, v9, $outmask, $outperm
957e1051a39Sopenharmony_ci
	# The rotated key is written with four word stores at offsets
	# 0/4/8/12; outhead is kept for merging into later stores.
958e1051a39Sopenharmony_ci	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
959e1051a39Sopenharmony_ci	vperm	$outhead, v0, v0, $outperm	# rotate right/left
960e1051a39Sopenharmony_ci	stvewx	$outhead, 0, $out	# some are superfluous
961e1051a39Sopenharmony_ci	stvewx	$outhead, r9, $out
962e1051a39Sopenharmony_ci	stvewx	$outhead, r10, $out
963e1051a39Sopenharmony_ci	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
964e1051a39Sopenharmony_ci	stvewx	$outhead, r11, $out
965e1051a39Sopenharmony_ci	b	Lschedule_go
966e1051a39Sopenharmony_ci
967e1051a39Sopenharmony_ciLschedule_am_decrypting:
	# andi. below rewrites cr0 only; the direction flag lives in the
	# dir field (cr1) and stays intact.
968e1051a39Sopenharmony_ci	srwi	r8, $bits, 1		# shr	\$1,%r8d
969e1051a39Sopenharmony_ci	andi.	r8, r8, 32		# and	\$32,%r8d
970e1051a39Sopenharmony_ci	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
971e1051a39Sopenharmony_ci	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
972e1051a39Sopenharmony_ci	# decrypting, output zeroth round key after shiftrows
973e1051a39Sopenharmony_ci	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
974e1051a39Sopenharmony_ci	li	r9, 4
975e1051a39Sopenharmony_ci	li	r10, 8
976e1051a39Sopenharmony_ci	li	r11, 12
977e1051a39Sopenharmony_ci	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
978e1051a39Sopenharmony_ci
	# Mirror image of the encrypt setup: lvsl on the negated address and
	# swapped vperm operands for the mask.
979e1051a39Sopenharmony_ci	neg	r0, $out		# prepare for unaligned access
980e1051a39Sopenharmony_ci	?lvsl	$outperm, 0, r0
981e1051a39Sopenharmony_ci	vnor	$outmask, v9, v9	# 0xff..ff
982e1051a39Sopenharmony_ci	?vperm	$outmask, $outmask, v9, $outperm
983e1051a39Sopenharmony_ci
984e1051a39Sopenharmony_ci	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
985e1051a39Sopenharmony_ci	vperm	$outhead, v4, v4, $outperm	# rotate right/left
986e1051a39Sopenharmony_ci	stvewx	$outhead, 0, $out	# some are superfluous
987e1051a39Sopenharmony_ci	stvewx	$outhead, r9, $out
988e1051a39Sopenharmony_ci	stvewx	$outhead, r10, $out
989e1051a39Sopenharmony_ci	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
990e1051a39Sopenharmony_ci	stvewx	$outhead, r11, $out
	# From here on the decrypt path carries the output pointer biased
	# by +15; Lschedule_mangle_last_dec undoes that with -15.
991e1051a39Sopenharmony_ci	addi	$out, $out, 15		# 15 is not typo
992e1051a39Sopenharmony_ci	xori	r8, r8, 0x30		# xor	\$0x30, %r8
993e1051a39Sopenharmony_ci
994e1051a39Sopenharmony_ciLschedule_go:
995e1051a39Sopenharmony_ci	cmplwi	$bits, 192		# cmp	\$192,	%esi
996e1051a39Sopenharmony_ci	bgt	Lschedule_256
997e1051a39Sopenharmony_ci	beq	Lschedule_192
998e1051a39Sopenharmony_ci	# 128: fall through
999e1051a39Sopenharmony_ci
1000e1051a39Sopenharmony_ci##
1001e1051a39Sopenharmony_ci##  .schedule_128
1002e1051a39Sopenharmony_ci##
1003e1051a39Sopenharmony_ci##  128-bit specific part of key schedule.
1004e1051a39Sopenharmony_ci##
1005e1051a39Sopenharmony_ci##  This schedule is really simple, because all its parts
1006e1051a39Sopenharmony_ci##  are accomplished by the subroutines.
1007e1051a39Sopenharmony_ci##
##  The count register holds the number of rounds (10).  bdz
##  decrements it and leaves the loop for Lschedule_mangle_last on
##  the tenth round, so nine keys go out through
##  _vpaes_schedule_mangle and the last one through the mangle_last
##  path.
##
1008e1051a39Sopenharmony_ciLschedule_128:
1009e1051a39Sopenharmony_ci	li	r0, 10			# mov	\$10, %esi
1010e1051a39Sopenharmony_ci	mtctr	r0
1011e1051a39Sopenharmony_ci
1012e1051a39Sopenharmony_ciLoop_schedule_128:
1013e1051a39Sopenharmony_ci	bl 	_vpaes_schedule_round
1014e1051a39Sopenharmony_ci	bdz 	Lschedule_mangle_last	# dec	%esi
1015e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle	# write output
1016e1051a39Sopenharmony_ci	b 	Loop_schedule_128
1017e1051a39Sopenharmony_ci
1018e1051a39Sopenharmony_ci##
1019e1051a39Sopenharmony_ci##  .aes_schedule_192
1020e1051a39Sopenharmony_ci##
1021e1051a39Sopenharmony_ci##  192-bit specific part of key schedule.
1022e1051a39Sopenharmony_ci##
1023e1051a39Sopenharmony_ci##  The main body of this schedule is the same as the 128-bit
1024e1051a39Sopenharmony_ci##  schedule, but with more smearing.  The long, high side is
1025e1051a39Sopenharmony_ci##  stored in %xmm7 as before, and the short, low side is in
1026e1051a39Sopenharmony_ci##  the high bits of %xmm6.
1027e1051a39Sopenharmony_ci##
1028e1051a39Sopenharmony_ci##  This schedule is somewhat nastier, however, because each
1029e1051a39Sopenharmony_ci##  round produces 192 bits of key material, or 1.5 round keys.
1030e1051a39Sopenharmony_ci##  Therefore, on each cycle we do 2 rounds and produce 3 round
1031e1051a39Sopenharmony_ci##  keys.
1032e1051a39Sopenharmony_ci##
##  In this port %xmm7 and %xmm6 correspond to v7 and v6, and v9 is
##  the all-zero vector.  The count register starts at 4; the loop
##  exits through bdz after the second round of the fourth cycle.
##
1033e1051a39Sopenharmony_ci.align	4
1034e1051a39Sopenharmony_ciLschedule_192:
1035e1051a39Sopenharmony_ci	li	r0, 4			# mov	\$4,	%esi
	# The input register is at key+23 here, so this lvx fetches the
	# quadword holding the last key byte.  It is merged with v6 (the
	# previously loaded tail) to form key bytes 16..31, then combined
	# with v3 (the first 16 key bytes) to extract bytes 8..23.
1036e1051a39Sopenharmony_ci	lvx	v0, 0, $inp
1037e1051a39Sopenharmony_ci	?vperm	v0, v6, v0, $inpperm
1038e1051a39Sopenharmony_ci	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
1039e1051a39Sopenharmony_ci	bl	_vpaes_schedule_transform	# input transform
1040e1051a39Sopenharmony_ci	?vsldoi	v6, v0, v9, 8
1041e1051a39Sopenharmony_ci	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
1042e1051a39Sopenharmony_ci	mtctr	r0
1043e1051a39Sopenharmony_ci
1044e1051a39Sopenharmony_ciLoop_schedule_192:
1045e1051a39Sopenharmony_ci	bl	_vpaes_schedule_round
1046e1051a39Sopenharmony_ci	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
1047e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle	# save key n
1048e1051a39Sopenharmony_ci	bl	_vpaes_schedule_192_smear
1049e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle	# save key n+1
1050e1051a39Sopenharmony_ci	bl	_vpaes_schedule_round
1051e1051a39Sopenharmony_ci	bdz 	Lschedule_mangle_last	# dec	%esi
1052e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle	# save key n+2
1053e1051a39Sopenharmony_ci	bl	_vpaes_schedule_192_smear
1054e1051a39Sopenharmony_ci	b	Loop_schedule_192
1055e1051a39Sopenharmony_ci
1056e1051a39Sopenharmony_ci##
1057e1051a39Sopenharmony_ci##  .aes_schedule_256
1058e1051a39Sopenharmony_ci##
1059e1051a39Sopenharmony_ci##  256-bit specific part of key schedule.
1060e1051a39Sopenharmony_ci##
1061e1051a39Sopenharmony_ci##  The structure here is very similar to the 128-bit
1062e1051a39Sopenharmony_ci##  schedule, but with an additional "low side" in
1063e1051a39Sopenharmony_ci##  %xmm6.  The low side's rounds are the same as the
1064e1051a39Sopenharmony_ci##  high side's, except no rcon and no rotation.
1065e1051a39Sopenharmony_ci##
##  In this port %xmm6, %xmm7 and %xmm5 correspond to v6, v7 and v5.
##  The count register starts at 7; the loop exits through bdz after
##  the seventh high round.
##
1066e1051a39Sopenharmony_ci.align	4
1067e1051a39Sopenharmony_ciLschedule_256:
1068e1051a39Sopenharmony_ci	li	r0, 7			# mov	\$7, %esi
	# The input register moves from key+23 to key+31, i.e. the last
	# byte of the second key half; the loaded quadword is merged with
	# v6 (the previously loaded tail) to form key bytes 16..31.
1069e1051a39Sopenharmony_ci	addi	$inp, $inp, 8
1070e1051a39Sopenharmony_ci	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
1071e1051a39Sopenharmony_ci	?vperm	v0, v6, v0, $inpperm
1072e1051a39Sopenharmony_ci	bl	_vpaes_schedule_transform	# input transform
1073e1051a39Sopenharmony_ci	mtctr	r0
1074e1051a39Sopenharmony_ci
1075e1051a39Sopenharmony_ciLoop_schedule_256:
1076e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle	# output low result
1077e1051a39Sopenharmony_ci	vmr	v6, v0			# vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
1078e1051a39Sopenharmony_ci
1079e1051a39Sopenharmony_ci	# high round
1080e1051a39Sopenharmony_ci	bl	_vpaes_schedule_round
1081e1051a39Sopenharmony_ci	bdz 	Lschedule_mangle_last	# dec	%esi
1082e1051a39Sopenharmony_ci	bl	_vpaes_schedule_mangle
1083e1051a39Sopenharmony_ci
	# v5 parks the high side while v7 temporarily carries the low side
	# through _vpaes_schedule_low_round.
1084e1051a39Sopenharmony_ci	# low round. swap xmm7 and xmm6
1085e1051a39Sopenharmony_ci	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
1086e1051a39Sopenharmony_ci	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
1087e1051a39Sopenharmony_ci	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
1088e1051a39Sopenharmony_ci	bl	_vpaes_schedule_low_round
1089e1051a39Sopenharmony_ci	vmr	v7, v5			# vmovdqa	%xmm5,	%xmm7
1090e1051a39Sopenharmony_ci
1091e1051a39Sopenharmony_ci	b	Loop_schedule_256
1092e1051a39Sopenharmony_ci##
1093e1051a39Sopenharmony_ci##  .aes_schedule_mangle_last
1094e1051a39Sopenharmony_ci##
1095e1051a39Sopenharmony_ci##  Mangler for last round of key schedule
1096e1051a39Sopenharmony_ci##  Mangles %xmm0
1097e1051a39Sopenharmony_ci##    when encrypting, outputs out(%xmm0) ^ 63
1098e1051a39Sopenharmony_ci##    when decrypting, outputs unskew(%xmm0)
1099e1051a39Sopenharmony_ci##
1100e1051a39Sopenharmony_ci##  Always called right before return... jumps to cleanup and exits
1101e1051a39Sopenharmony_ci##
##  Reached by branch (bdz) from the three schedule loops, not by
##  call.  The final blr returns to the caller of
##  _vpaes_schedule_core through the LR value parked in r7.  r12 is
##  used as the table base until the last table load, then reused as
##  a store offset.
##
1102e1051a39Sopenharmony_ci.align	4
1103e1051a39Sopenharmony_ciLschedule_mangle_last:
1104e1051a39Sopenharmony_ci	# schedule last round key from xmm0
	# Default table offsets are for the decrypt (deskew) case; the
	# encrypt branch below overrides them with the Lk_opt offsets.
1105e1051a39Sopenharmony_ci	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
1106e1051a39Sopenharmony_ci	li	r9,  0x2f0
1107e1051a39Sopenharmony_ci	bne	$dir, Lschedule_mangle_last_dec
1108e1051a39Sopenharmony_ci
1109e1051a39Sopenharmony_ci	# encrypting
1110e1051a39Sopenharmony_ci	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),%xmm1
1111e1051a39Sopenharmony_ci	li	r11, 0x2c0		# lea		.Lk_opt(%rip),	%r11	# prepare to output transform
1112e1051a39Sopenharmony_ci	li	r9,  0x2d0		# prepare to output transform
1113e1051a39Sopenharmony_ci	vperm	v0, v0, v0, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute
1114e1051a39Sopenharmony_ci
	# The ipt registers are overwritten with the output-transform table
	# so that _vpaes_schedule_transform applies it instead.
1115e1051a39Sopenharmony_ci	lvx	$iptlo, r11, r12	# reload $ipt
1116e1051a39Sopenharmony_ci	lvx	$ipthi, r9, r12
1117e1051a39Sopenharmony_ci	addi	$out, $out, 16		# add	\$16,	%rdx
1118e1051a39Sopenharmony_ci	vxor	v0, v0, v26		# vpxor		.Lk_s63(%rip),	%xmm0,	%xmm0
1119e1051a39Sopenharmony_ci	bl	_vpaes_schedule_transform	# output transform
1120e1051a39Sopenharmony_ci
	# v2 merges the pending outhead bytes with the new key for one
	# aligned store; the four word stores then write the rotated key
	# itself at offsets 0/4/8/12.
1121e1051a39Sopenharmony_ci	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
1122e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
1123e1051a39Sopenharmony_ci	li	r10, 4
1124e1051a39Sopenharmony_ci	vsel	v2, $outhead, v0, $outmask
1125e1051a39Sopenharmony_ci	li	r11, 8
1126e1051a39Sopenharmony_ci	stvx	v2, 0, $out
1127e1051a39Sopenharmony_ci	li	r12, 12
1128e1051a39Sopenharmony_ci	stvewx	v0, 0, $out		# some (or all) are redundant
1129e1051a39Sopenharmony_ci	stvewx	v0, r10, $out
1130e1051a39Sopenharmony_ci	stvewx	v0, r11, $out
1131e1051a39Sopenharmony_ci	stvewx	v0, r12, $out
1132e1051a39Sopenharmony_ci	b	Lschedule_mangle_done
1133e1051a39Sopenharmony_ci
1134e1051a39Sopenharmony_ci.align	4
1135e1051a39Sopenharmony_ciLschedule_mangle_last_dec:
1136e1051a39Sopenharmony_ci	lvx	$iptlo, r11, r12	# reload $ipt
1137e1051a39Sopenharmony_ci	lvx	$ipthi, r9,  r12
1138e1051a39Sopenharmony_ci	addi	$out, $out, -16		# add	\$-16,	%rdx
1139e1051a39Sopenharmony_ci	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
1140e1051a39Sopenharmony_ci	bl	_vpaes_schedule_transform	# output transform
1141e1051a39Sopenharmony_ci
	# r9 removes the +15 bias that the decrypt path added to the output
	# pointer at schedule start, giving the true address of this key for
	# the word stores.
1142e1051a39Sopenharmony_ci	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
1143e1051a39Sopenharmony_ci	addi	r9, $out, -15		# -15 is not typo
1144e1051a39Sopenharmony_ci	vperm	v0, v0, v0, $outperm	# rotate right/left
1145e1051a39Sopenharmony_ci	li	r10, 4
1146e1051a39Sopenharmony_ci	vsel	v2, $outhead, v0, $outmask
1147e1051a39Sopenharmony_ci	li	r11, 8
1148e1051a39Sopenharmony_ci	stvx	v2, 0, $out
1149e1051a39Sopenharmony_ci	li	r12, 12
1150e1051a39Sopenharmony_ci	stvewx	v0, 0, r9		# some (or all) are redundant
1151e1051a39Sopenharmony_ci	stvewx	v0, r10, r9
1152e1051a39Sopenharmony_ci	stvewx	v0, r11, r9
1153e1051a39Sopenharmony_ci	stvewx	v0, r12, r9
1154e1051a39Sopenharmony_ci
1155e1051a39Sopenharmony_ci
1156e1051a39Sopenharmony_ciLschedule_mangle_done:
	# Restore the return address saved at entry to _vpaes_schedule_core
	# and zero the working registers v0-v7 before returning.
1157e1051a39Sopenharmony_ci	mtlr	r7
1158e1051a39Sopenharmony_ci	# cleanup
1159e1051a39Sopenharmony_ci	vxor	v0, v0, v0		# vpxor		%xmm0,	%xmm0,	%xmm0
1160e1051a39Sopenharmony_ci	vxor	v1, v1, v1		# vpxor		%xmm1,	%xmm1,	%xmm1
1161e1051a39Sopenharmony_ci	vxor	v2, v2, v2		# vpxor		%xmm2,	%xmm2,	%xmm2
1162e1051a39Sopenharmony_ci	vxor	v3, v3, v3		# vpxor		%xmm3,	%xmm3,	%xmm3
1163e1051a39Sopenharmony_ci	vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
1164e1051a39Sopenharmony_ci	vxor	v5, v5, v5		# vpxor		%xmm5,	%xmm5,	%xmm5
1165e1051a39Sopenharmony_ci	vxor	v6, v6, v6		# vpxor		%xmm6,	%xmm6,	%xmm6
1166e1051a39Sopenharmony_ci	vxor	v7, v7, v7		# vpxor		%xmm7,	%xmm7,	%xmm7
1167e1051a39Sopenharmony_ci
1168e1051a39Sopenharmony_ci	blr
1169e1051a39Sopenharmony_ci	.long	0
1170e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
1171e1051a39Sopenharmony_ci
1172e1051a39Sopenharmony_ci##
1173e1051a39Sopenharmony_ci##  .aes_schedule_192_smear
1174e1051a39Sopenharmony_ci##
1175e1051a39Sopenharmony_ci##  Smear the short, low side in the 192-bit key schedule.
1176e1051a39Sopenharmony_ci##
1177e1051a39Sopenharmony_ci##  Inputs:
1178e1051a39Sopenharmony_ci##    %xmm7: high side, b  a  x  y
1179e1051a39Sopenharmony_ci##    %xmm6:  low side, d  c  0  0
1180e1051a39Sopenharmony_ci##    %xmm13: 0
1181e1051a39Sopenharmony_ci##
1182e1051a39Sopenharmony_ci##  Outputs:
1183e1051a39Sopenharmony_ci##    %xmm6: b+c+d  b+c  0  0
1184e1051a39Sopenharmony_ci##    %xmm0: b+c+d  b+c  b  a
1185e1051a39Sopenharmony_ci##
##  Register mapping in this port: the high side is v7, the low side
##  is v6, the result is returned in v0, and the zero vector is v9
##  (the x86 comments call it %xmm13).  v1 is used as scratch.
##
1186e1051a39Sopenharmony_ci.align	4
1187e1051a39Sopenharmony_ci_vpaes_schedule_192_smear:
1188e1051a39Sopenharmony_ci	?vspltw	v0, v7, 3
1189e1051a39Sopenharmony_ci	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
1190e1051a39Sopenharmony_ci	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
1191e1051a39Sopenharmony_ci	vxor	v6, v6, v1		# vpxor		%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
1192e1051a39Sopenharmony_ci	vxor	v6, v6, v0		# vpxor		%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	# v0 takes the full smeared value; the two shifts against the zero
	# vector then clear one half of v6.
1193e1051a39Sopenharmony_ci	vmr	v0, v6
1194e1051a39Sopenharmony_ci	?vsldoi	v6, v6, v9, 8
1195e1051a39Sopenharmony_ci	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
1196e1051a39Sopenharmony_ci	blr
1197e1051a39Sopenharmony_ci	.long	0
1198e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
1199e1051a39Sopenharmony_ci
1200e1051a39Sopenharmony_ci##
1201e1051a39Sopenharmony_ci##  .aes_schedule_round
1202e1051a39Sopenharmony_ci##
1203e1051a39Sopenharmony_ci##  Runs one main round of the key schedule on %xmm0, %xmm7
1204e1051a39Sopenharmony_ci##
1205e1051a39Sopenharmony_ci##  Specifically, runs subbytes on the high dword of %xmm0
1206e1051a39Sopenharmony_ci##  then rotates it by one byte and xors into the low dword of
1207e1051a39Sopenharmony_ci##  %xmm7.
1208e1051a39Sopenharmony_ci##
1209e1051a39Sopenharmony_ci##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1210e1051a39Sopenharmony_ci##  next rcon.
1211e1051a39Sopenharmony_ci##
1212e1051a39Sopenharmony_ci##  Smears the dwords of %xmm7 by xoring the low into the
1213e1051a39Sopenharmony_ci##  second low, result into third, result into highest.
1214e1051a39Sopenharmony_ci##
1215e1051a39Sopenharmony_ci##  Returns results in %xmm7 = %xmm0.
1216e1051a39Sopenharmony_ci##  Clobbers %xmm1-%xmm4, %r11.
1217e1051a39Sopenharmony_ci##
##  PowerPC port: xmm0/xmm7 are v0/v7 and the scratch registers are
##  v1-v4; no general-purpose register is modified here.  The rcon
##  vector is rotated in place each time the round entry point runs
##  (the low_round entry point skips that part).  v9 is used as an
##  all-zero vector and v8 as the per-byte shift count standing in for
##  the x86 immediate 4 (both presumably set up by the caller - TODO
##  confirm).  The two instructions indented by an extra space operate
##  on v7 and are interleaved with the independent subbytes sequence.
##
1218e1051a39Sopenharmony_ci.align	4
1219e1051a39Sopenharmony_ci_vpaes_schedule_round:
1220e1051a39Sopenharmony_ci	# extract rcon from xmm8
1221e1051a39Sopenharmony_ci	#vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
1222e1051a39Sopenharmony_ci	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
1223e1051a39Sopenharmony_ci	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
1224e1051a39Sopenharmony_ci	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
1225e1051a39Sopenharmony_ci
1226e1051a39Sopenharmony_ci	# rotate
1227e1051a39Sopenharmony_ci	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
1228e1051a39Sopenharmony_ci	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
1229e1051a39Sopenharmony_ci
1230e1051a39Sopenharmony_ci	# fall through...
1231e1051a39Sopenharmony_ci
1232e1051a39Sopenharmony_ci	# low round: same as high round, but no rotation and no rcon.
1233e1051a39Sopenharmony_ci_vpaes_schedule_low_round:
1234e1051a39Sopenharmony_ci	# smear xmm7
1235e1051a39Sopenharmony_ci	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
1236e1051a39Sopenharmony_ci	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
1237e1051a39Sopenharmony_ci	vspltisb	v1, 0x0f	# 0x0f..0f
1238e1051a39Sopenharmony_ci	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
1239e1051a39Sopenharmony_ci
1240e1051a39Sopenharmony_ci	# subbytes
1241e1051a39Sopenharmony_ci	vand	v1, v1, v0		# vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
1242e1051a39Sopenharmony_ci	vsrb	v0, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
1243e1051a39Sopenharmony_ci	 vxor	v7, v7, v4		# vpxor		%xmm4,	%xmm7,	%xmm7
1244e1051a39Sopenharmony_ci	vperm	v2, $invhi, v9, v1	# vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
1245e1051a39Sopenharmony_ci	vxor	v1, v1, v0		# vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
1246e1051a39Sopenharmony_ci	vperm	v3, $invlo, v9, v0	# vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
1247e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
1248e1051a39Sopenharmony_ci	vperm	v4, $invlo, v9, v1	# vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
1249e1051a39Sopenharmony_ci	 vxor	v7, v7, v26		# vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
1250e1051a39Sopenharmony_ci	vperm	v3, $invlo, v9, v3	# vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
1251e1051a39Sopenharmony_ci	vxor	v4, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
1252e1051a39Sopenharmony_ci	vperm	v2, $invlo, v9, v4	# vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
1253e1051a39Sopenharmony_ci	vxor	v3, v3, v1		# vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
1254e1051a39Sopenharmony_ci	vxor	v2, v2, v0		# vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
1255e1051a39Sopenharmony_ci	vperm	v4, v15, v9, v3		# vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
1256e1051a39Sopenharmony_ci	vperm	v1, v14, v9, v2		# vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
1257e1051a39Sopenharmony_ci	vxor	v1, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
1258e1051a39Sopenharmony_ci
1259e1051a39Sopenharmony_ci	# add in smeared stuff
1260e1051a39Sopenharmony_ci	vxor	v0, v1, v7		# vpxor		%xmm7,	%xmm1,	%xmm0
1261e1051a39Sopenharmony_ci	vxor	v7, v1, v7		# vmovdqa	%xmm0,	%xmm7	# same xor repeated in place of a copy, so v7 = v0
1262e1051a39Sopenharmony_ci	blr
1263e1051a39Sopenharmony_ci	.long	0
1264e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
1265e1051a39Sopenharmony_ci
1266e1051a39Sopenharmony_ci##
1267e1051a39Sopenharmony_ci##  .aes_schedule_transform
1268e1051a39Sopenharmony_ci##
1269e1051a39Sopenharmony_ci##  Linear-transform %xmm0 according to tables at (%r11)
1270e1051a39Sopenharmony_ci##
1271e1051a39Sopenharmony_ci##  Requires that %xmm9 = 0x0F0F... as in preheat
1272e1051a39Sopenharmony_ci##  Output in %xmm0
1273e1051a39Sopenharmony_ci##  Clobbers %xmm2
1274e1051a39Sopenharmony_ci##
##  PowerPC port: the low and high tables are held in the registers
##  named by the iptlo and ipthi variables instead of being loaded from
##  memory; xmm0 is v0 and the clobbered scratch register is v2.  v8 is
##  the per-byte shift count standing in for the x86 immediate 4
##  (presumably preloaded by the caller - TODO confirm).  No explicit
##  low-nibble mask is applied (the vand below is commented out): each
##  table is passed as both vperm sources, so only the low four bits of
##  a selector byte affect the result.
##
1275e1051a39Sopenharmony_ci.align	4
1276e1051a39Sopenharmony_ci_vpaes_schedule_transform:
1277e1051a39Sopenharmony_ci	#vand	v1, v0, v9		# vpand		%xmm9,	%xmm0,	%xmm1
1278e1051a39Sopenharmony_ci	vsrb	v2, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
1279e1051a39Sopenharmony_ci					# vmovdqa	(%r11),	%xmm2 	# lo
1280e1051a39Sopenharmony_ci	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
1281e1051a39Sopenharmony_ci					# vmovdqa	16(%r11),	%xmm1 # hi
1282e1051a39Sopenharmony_ci	vperm	v2, $ipthi, $ipthi, v2	# vpshufb	%xmm0,	%xmm1,	%xmm0
1283e1051a39Sopenharmony_ci	vxor	v0, v0, v2		# vpxor		%xmm2,	%xmm0,	%xmm0
1284e1051a39Sopenharmony_ci	blr
1285e1051a39Sopenharmony_ci	.long	0
1286e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
1287e1051a39Sopenharmony_ci
1288e1051a39Sopenharmony_ci##
1289e1051a39Sopenharmony_ci##  .aes_schedule_mangle
1290e1051a39Sopenharmony_ci##
1291e1051a39Sopenharmony_ci##  Mangle xmm0 from (basis-transformed) standard version
1292e1051a39Sopenharmony_ci##  to our version.
1293e1051a39Sopenharmony_ci##
1294e1051a39Sopenharmony_ci##  On encrypt,
1295e1051a39Sopenharmony_ci##    xor with 0x63
1296e1051a39Sopenharmony_ci##    multiply by circulant 0,1,1,1
1297e1051a39Sopenharmony_ci##    apply shiftrows transform
1298e1051a39Sopenharmony_ci##
1299e1051a39Sopenharmony_ci##  On decrypt,
1300e1051a39Sopenharmony_ci##    xor with 0x63
1301e1051a39Sopenharmony_ci##    multiply by "inverse mixcolumns" circulant E,B,D,9
1302e1051a39Sopenharmony_ci##    deskew
1303e1051a39Sopenharmony_ci##    apply shiftrows transform
1304e1051a39Sopenharmony_ci##
1305e1051a39Sopenharmony_ci##
1306e1051a39Sopenharmony_ci##  Writes out to (%rdx), and increments or decrements it
1307e1051a39Sopenharmony_ci##  Keeps track of round number mod 4 in %r8
1308e1051a39Sopenharmony_ci##  Preserves xmm0
1309e1051a39Sopenharmony_ci##  Clobbers xmm1-xmm5
1310e1051a39Sopenharmony_ci##
##  PowerPC port: xmm0 is v0 (only read here) and the scratch registers
##  are v1-v4.  The direction is taken from the dir condition field,
##  which the set_encrypt_key and set_decrypt_key entry points set with
##  a compare before calling the schedule core: "equal" selects the
##  encrypt path, "not equal" the decrypt path.  r8 is the round-number
##  offset (stepped by -16 and masked with 0x30) into the table
##  addressed by r10.  The result is not stored directly: it is rotated
##  with outperm, merged with the previous outhead under outmask, and
##  the merged vector is stored at the updated out pointer; the rotated
##  vector is kept in outhead for the next call.
##
1311e1051a39Sopenharmony_ci.align	4
1312e1051a39Sopenharmony_ci_vpaes_schedule_mangle:
1313e1051a39Sopenharmony_ci	#vmr	v4, v0			# vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
1314e1051a39Sopenharmony_ci					# vmovdqa	.Lk_mc_forward(%rip),%xmm5
1315e1051a39Sopenharmony_ci	bne	$dir, Lschedule_mangle_dec
1316e1051a39Sopenharmony_ci
1317e1051a39Sopenharmony_ci	# encrypting
1318e1051a39Sopenharmony_ci	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
1319e1051a39Sopenharmony_ci	addi	$out, $out, 16		# add	\$16,	%rdx
1320e1051a39Sopenharmony_ci	vperm	v4, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm4
1321e1051a39Sopenharmony_ci	vperm	v1, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm1
1322e1051a39Sopenharmony_ci	vperm	v3, v1, v1, v25		# vpshufb	%xmm5,	%xmm1,	%xmm3
1323e1051a39Sopenharmony_ci	vxor	v4, v4, v1		# vpxor		%xmm1,	%xmm4,	%xmm4
1324e1051a39Sopenharmony_ci	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
1325e1051a39Sopenharmony_ci	vxor	v3, v3, v4		# vpxor		%xmm4,	%xmm3,	%xmm3
1326e1051a39Sopenharmony_ci
1327e1051a39Sopenharmony_ci	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
1328e1051a39Sopenharmony_ci	addi	r8, r8, -16		# add	\$-16,	%r8
1329e1051a39Sopenharmony_ci	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
1330e1051a39Sopenharmony_ci
1331e1051a39Sopenharmony_ci	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
1332e1051a39Sopenharmony_ci	vperm	v1, v3, v3, $outperm	# rotate right/left
1333e1051a39Sopenharmony_ci	vsel	v2, $outhead, v1, $outmask	# merge with what the previous call left in outhead
1334e1051a39Sopenharmony_ci	vmr	$outhead, v1		# keep the rotated vector for the next call
1335e1051a39Sopenharmony_ci	stvx	v2, 0, $out
1336e1051a39Sopenharmony_ci	blr
1337e1051a39Sopenharmony_ci
1338e1051a39Sopenharmony_ci.align	4
1339e1051a39Sopenharmony_ciLschedule_mangle_dec:
1340e1051a39Sopenharmony_ci	# inverse mix columns
1341e1051a39Sopenharmony_ci					# lea	.Lk_dksd(%rip),%r11
1342e1051a39Sopenharmony_ci	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
1343e1051a39Sopenharmony_ci	#and	v4, v0, v9		# vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
1344e1051a39Sopenharmony_ci
1345e1051a39Sopenharmony_ci					# vmovdqa	0x00(%r11),	%xmm2
1346e1051a39Sopenharmony_ci	vperm	v2, v16, v16, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1347e1051a39Sopenharmony_ci					# vmovdqa	0x10(%r11),	%xmm3
1348e1051a39Sopenharmony_ci	vperm	v3, v17, v17, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1349e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1350e1051a39Sopenharmony_ci	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1351e1051a39Sopenharmony_ci
1352e1051a39Sopenharmony_ci					# vmovdqa	0x20(%r11),	%xmm2
1353e1051a39Sopenharmony_ci	vperm	v2, v18, v18, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1354e1051a39Sopenharmony_ci	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1355e1051a39Sopenharmony_ci					# vmovdqa	0x30(%r11),	%xmm3
1356e1051a39Sopenharmony_ci	vperm	v3, v19, v19, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1357e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1358e1051a39Sopenharmony_ci	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1359e1051a39Sopenharmony_ci
1360e1051a39Sopenharmony_ci					# vmovdqa	0x40(%r11),	%xmm2
1361e1051a39Sopenharmony_ci	vperm	v2, v20, v20, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1362e1051a39Sopenharmony_ci	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1363e1051a39Sopenharmony_ci					# vmovdqa	0x50(%r11),	%xmm3
1364e1051a39Sopenharmony_ci	vperm	v3, v21, v21, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1365e1051a39Sopenharmony_ci	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1366e1051a39Sopenharmony_ci
1367e1051a39Sopenharmony_ci					# vmovdqa	0x60(%r11),	%xmm2
1368e1051a39Sopenharmony_ci	vperm	v2, v22, v22, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1369e1051a39Sopenharmony_ci	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1370e1051a39Sopenharmony_ci					# vmovdqa	0x70(%r11),	%xmm4
1371e1051a39Sopenharmony_ci	vperm	v4, v23, v23, v1	# vpshufb	%xmm1,	%xmm4,	%xmm4
1372e1051a39Sopenharmony_ci	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
1373e1051a39Sopenharmony_ci	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1374e1051a39Sopenharmony_ci	vxor	v3, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm3
1375e1051a39Sopenharmony_ci
1376e1051a39Sopenharmony_ci	addi	$out, $out, -16		# add	\$-16,	%rdx
1377e1051a39Sopenharmony_ci
1378e1051a39Sopenharmony_ci	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
1379e1051a39Sopenharmony_ci	addi	r8, r8, -16		# add	\$-16,	%r8
1380e1051a39Sopenharmony_ci	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
1381e1051a39Sopenharmony_ci
1382e1051a39Sopenharmony_ci	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
1383e1051a39Sopenharmony_ci	vperm	v1, v3, v3, $outperm	# rotate right/left
1384e1051a39Sopenharmony_ci	vsel	v2, $outhead, v1, $outmask	# merge with what the previous call left in outhead
1385e1051a39Sopenharmony_ci	vmr	$outhead, v1		# keep the rotated vector for the next call
1386e1051a39Sopenharmony_ci	stvx	v2, 0, $out
1387e1051a39Sopenharmony_ci	blr
1388e1051a39Sopenharmony_ci	.long	0
1389e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,0,0
1390e1051a39Sopenharmony_ci
1391e1051a39Sopenharmony_ci.globl	.vpaes_set_encrypt_key
1392e1051a39Sopenharmony_ci.align	5
1393e1051a39Sopenharmony_ci.vpaes_set_encrypt_key:
	# Entry point that builds the encryption key schedule.  It reads the
	# key length in bits and the AES_KEY pointer (the bits and out
	# arguments), records the round count, selects the encrypt direction
	# and hands over to _vpaes_schedule_core.  Always returns 0 in r3.
	# NOTE(review): the user-key argument is not touched here and is
	# presumably consumed by the core - confirm.
1394e1051a39Sopenharmony_ci	$STU	$sp,-$FRAME($sp)		# open a stack frame (popped just before the final blr)
1395e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
1396e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
1397e1051a39Sopenharmony_ci	mflr	r0
1398e1051a39Sopenharmony_ci	mfspr	r6, 256			# save vrsave
	# save v20-v31: r10 and r11 start 16 bytes apart and each advances
	# by 32, so together they walk consecutive 16-byte slots
1399e1051a39Sopenharmony_ci	stvx	v20,r10,$sp
1400e1051a39Sopenharmony_ci	addi	r10,r10,32
1401e1051a39Sopenharmony_ci	stvx	v21,r11,$sp
1402e1051a39Sopenharmony_ci	addi	r11,r11,32
1403e1051a39Sopenharmony_ci	stvx	v22,r10,$sp
1404e1051a39Sopenharmony_ci	addi	r10,r10,32
1405e1051a39Sopenharmony_ci	stvx	v23,r11,$sp
1406e1051a39Sopenharmony_ci	addi	r11,r11,32
1407e1051a39Sopenharmony_ci	stvx	v24,r10,$sp
1408e1051a39Sopenharmony_ci	addi	r10,r10,32
1409e1051a39Sopenharmony_ci	stvx	v25,r11,$sp
1410e1051a39Sopenharmony_ci	addi	r11,r11,32
1411e1051a39Sopenharmony_ci	stvx	v26,r10,$sp
1412e1051a39Sopenharmony_ci	addi	r10,r10,32
1413e1051a39Sopenharmony_ci	stvx	v27,r11,$sp
1414e1051a39Sopenharmony_ci	addi	r11,r11,32
1415e1051a39Sopenharmony_ci	stvx	v28,r10,$sp
1416e1051a39Sopenharmony_ci	addi	r10,r10,32
1417e1051a39Sopenharmony_ci	stvx	v29,r11,$sp
1418e1051a39Sopenharmony_ci	addi	r11,r11,32
1419e1051a39Sopenharmony_ci	stvx	v30,r10,$sp
1420e1051a39Sopenharmony_ci	stvx	v31,r11,$sp
1421e1051a39Sopenharmony_ci	stw	r6,`$FRAME-4`($sp)	# save vrsave
1422e1051a39Sopenharmony_ci	li	r7, -1
1423e1051a39Sopenharmony_ci	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
1424e1051a39Sopenharmony_ci	mtspr	256, r7			# preserve all AltiVec registers
1425e1051a39Sopenharmony_ci
	# Round count: this code stores nbits/32+6 (note the addi of 6); the
	# "+5" in the trailing comments is the x86 code this was
	# transliterated from.
1426e1051a39Sopenharmony_ci	srwi	r9, $bits, 5		# shr	\$5,%eax
1427e1051a39Sopenharmony_ci	addi	r9, r9, 6		# add	\$5,%eax
1428e1051a39Sopenharmony_ci	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1429e1051a39Sopenharmony_ci
	# A register compared with itself always reads "equal", which
	# _vpaes_schedule_mangle treats as the encrypt path.
1430e1051a39Sopenharmony_ci	cmplw	$dir, $bits, $bits	# set encrypt direction
1431e1051a39Sopenharmony_ci	li	r8, 0x30		# mov	\$0x30,%r8d
1432e1051a39Sopenharmony_ci	bl	_vpaes_schedule_core
1433e1051a39Sopenharmony_ci
	# Epilogue: reload LR, put back vrsave, restore v20-v31, pop the frame.
	# NOTE(review): vrsave is restored from r6, not from the stack copy,
	# so this relies on r6 surviving the call above - confirm.
1434e1051a39Sopenharmony_ci	$POP	r0, `$FRAME+$LRSAVE`($sp)
1435e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
1436e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
1437e1051a39Sopenharmony_ci	mtspr	256, r6			# restore vrsave
1438e1051a39Sopenharmony_ci	mtlr	r0
1439e1051a39Sopenharmony_ci	xor	r3, r3, r3		# return value 0
1440e1051a39Sopenharmony_ci	lvx	v20,r10,$sp
1441e1051a39Sopenharmony_ci	addi	r10,r10,32
1442e1051a39Sopenharmony_ci	lvx	v21,r11,$sp
1443e1051a39Sopenharmony_ci	addi	r11,r11,32
1444e1051a39Sopenharmony_ci	lvx	v22,r10,$sp
1445e1051a39Sopenharmony_ci	addi	r10,r10,32
1446e1051a39Sopenharmony_ci	lvx	v23,r11,$sp
1447e1051a39Sopenharmony_ci	addi	r11,r11,32
1448e1051a39Sopenharmony_ci	lvx	v24,r10,$sp
1449e1051a39Sopenharmony_ci	addi	r10,r10,32
1450e1051a39Sopenharmony_ci	lvx	v25,r11,$sp
1451e1051a39Sopenharmony_ci	addi	r11,r11,32
1452e1051a39Sopenharmony_ci	lvx	v26,r10,$sp
1453e1051a39Sopenharmony_ci	addi	r10,r10,32
1454e1051a39Sopenharmony_ci	lvx	v27,r11,$sp
1455e1051a39Sopenharmony_ci	addi	r11,r11,32
1456e1051a39Sopenharmony_ci	lvx	v28,r10,$sp
1457e1051a39Sopenharmony_ci	addi	r10,r10,32
1458e1051a39Sopenharmony_ci	lvx	v29,r11,$sp
1459e1051a39Sopenharmony_ci	addi	r11,r11,32
1460e1051a39Sopenharmony_ci	lvx	v30,r10,$sp
1461e1051a39Sopenharmony_ci	lvx	v31,r11,$sp
1462e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
1463e1051a39Sopenharmony_ci	blr
1464e1051a39Sopenharmony_ci	.long	0
1465e1051a39Sopenharmony_ci	.byte	0,12,0x04,1,0x80,0,3,0
1466e1051a39Sopenharmony_ci	.long	0
1467e1051a39Sopenharmony_ci.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1468e1051a39Sopenharmony_ci
1469e1051a39Sopenharmony_ci.globl	.vpaes_set_decrypt_key
1470e1051a39Sopenharmony_ci.align	4
1471e1051a39Sopenharmony_ci.vpaes_set_decrypt_key:
	# Entry point that builds the decryption key schedule.  Same frame
	# handling as the encrypt entry point, but the out pointer is first
	# advanced to the far end of the schedule and the decrypt direction
	# is selected before calling _vpaes_schedule_core.  Always returns
	# 0 in r3.
	# NOTE(review): the user-key argument is not touched here and is
	# presumably consumed by the core - confirm.
1472e1051a39Sopenharmony_ci	$STU	$sp,-$FRAME($sp)		# open a stack frame (popped just before the final blr)
1473e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
1474e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
1475e1051a39Sopenharmony_ci	mflr	r0
1476e1051a39Sopenharmony_ci	mfspr	r6, 256			# save vrsave
	# save v20-v31: r10 and r11 start 16 bytes apart and each advances
	# by 32, so together they walk consecutive 16-byte slots
1477e1051a39Sopenharmony_ci	stvx	v20,r10,$sp
1478e1051a39Sopenharmony_ci	addi	r10,r10,32
1479e1051a39Sopenharmony_ci	stvx	v21,r11,$sp
1480e1051a39Sopenharmony_ci	addi	r11,r11,32
1481e1051a39Sopenharmony_ci	stvx	v22,r10,$sp
1482e1051a39Sopenharmony_ci	addi	r10,r10,32
1483e1051a39Sopenharmony_ci	stvx	v23,r11,$sp
1484e1051a39Sopenharmony_ci	addi	r11,r11,32
1485e1051a39Sopenharmony_ci	stvx	v24,r10,$sp
1486e1051a39Sopenharmony_ci	addi	r10,r10,32
1487e1051a39Sopenharmony_ci	stvx	v25,r11,$sp
1488e1051a39Sopenharmony_ci	addi	r11,r11,32
1489e1051a39Sopenharmony_ci	stvx	v26,r10,$sp
1490e1051a39Sopenharmony_ci	addi	r10,r10,32
1491e1051a39Sopenharmony_ci	stvx	v27,r11,$sp
1492e1051a39Sopenharmony_ci	addi	r11,r11,32
1493e1051a39Sopenharmony_ci	stvx	v28,r10,$sp
1494e1051a39Sopenharmony_ci	addi	r10,r10,32
1495e1051a39Sopenharmony_ci	stvx	v29,r11,$sp
1496e1051a39Sopenharmony_ci	addi	r11,r11,32
1497e1051a39Sopenharmony_ci	stvx	v30,r10,$sp
1498e1051a39Sopenharmony_ci	stvx	v31,r11,$sp
1499e1051a39Sopenharmony_ci	stw	r6,`$FRAME-4`($sp)	# save vrsave
1500e1051a39Sopenharmony_ci	li	r7, -1
1501e1051a39Sopenharmony_ci	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
1502e1051a39Sopenharmony_ci	mtspr	256, r7			# preserve all AltiVec registers
1503e1051a39Sopenharmony_ci
	# Round count: this code stores nbits/32+6 (note the addi of 6); the
	# "+5" in the trailing comments is the x86 code this was
	# transliterated from.
1504e1051a39Sopenharmony_ci	srwi	r9, $bits, 5		# shr	\$5,%eax
1505e1051a39Sopenharmony_ci	addi	r9, r9, 6		# add	\$5,%eax
1506e1051a39Sopenharmony_ci	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1507e1051a39Sopenharmony_ci
	# Advance out by 16 bytes per round; _vpaes_schedule_mangle steps
	# out by -16 on the decrypt path, so the schedule is written from
	# the far end towards the start.
1508e1051a39Sopenharmony_ci	slwi	r9, r9, 4		# shl	\$4,%eax
1509e1051a39Sopenharmony_ci	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx
1510e1051a39Sopenharmony_ci
	# Comparing the key length with 0 reads "not equal" for any non-zero
	# length, which _vpaes_schedule_mangle treats as the decrypt path.
	# r8 = ((nbits >> 1) & 32) ^ 32, i.e. 0 for 192-bit keys, else 32.
1511e1051a39Sopenharmony_ci	cmplwi	$dir, $bits, 0		# set decrypt direction
1512e1051a39Sopenharmony_ci	srwi	r8, $bits, 1		# shr	\$1,%r8d
1513e1051a39Sopenharmony_ci	andi.	r8, r8, 32		# and	\$32,%r8d
1514e1051a39Sopenharmony_ci	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
1515e1051a39Sopenharmony_ci	bl	_vpaes_schedule_core
1516e1051a39Sopenharmony_ci
	# Epilogue: reload LR, put back vrsave, restore v20-v31, pop the frame.
	# NOTE(review): vrsave is restored from r6, not from the stack copy,
	# so this relies on r6 surviving the call above - confirm.
1517e1051a39Sopenharmony_ci	$POP	r0,  `$FRAME+$LRSAVE`($sp)
1518e1051a39Sopenharmony_ci	li	r10,`15+6*$SIZE_T`
1519e1051a39Sopenharmony_ci	li	r11,`31+6*$SIZE_T`
1520e1051a39Sopenharmony_ci	mtspr	256, r6			# restore vrsave
1521e1051a39Sopenharmony_ci	mtlr	r0
1522e1051a39Sopenharmony_ci	xor	r3, r3, r3		# return value 0
1523e1051a39Sopenharmony_ci	lvx	v20,r10,$sp
1524e1051a39Sopenharmony_ci	addi	r10,r10,32
1525e1051a39Sopenharmony_ci	lvx	v21,r11,$sp
1526e1051a39Sopenharmony_ci	addi	r11,r11,32
1527e1051a39Sopenharmony_ci	lvx	v22,r10,$sp
1528e1051a39Sopenharmony_ci	addi	r10,r10,32
1529e1051a39Sopenharmony_ci	lvx	v23,r11,$sp
1530e1051a39Sopenharmony_ci	addi	r11,r11,32
1531e1051a39Sopenharmony_ci	lvx	v24,r10,$sp
1532e1051a39Sopenharmony_ci	addi	r10,r10,32
1533e1051a39Sopenharmony_ci	lvx	v25,r11,$sp
1534e1051a39Sopenharmony_ci	addi	r11,r11,32
1535e1051a39Sopenharmony_ci	lvx	v26,r10,$sp
1536e1051a39Sopenharmony_ci	addi	r10,r10,32
1537e1051a39Sopenharmony_ci	lvx	v27,r11,$sp
1538e1051a39Sopenharmony_ci	addi	r11,r11,32
1539e1051a39Sopenharmony_ci	lvx	v28,r10,$sp
1540e1051a39Sopenharmony_ci	addi	r10,r10,32
1541e1051a39Sopenharmony_ci	lvx	v29,r11,$sp
1542e1051a39Sopenharmony_ci	addi	r11,r11,32
1543e1051a39Sopenharmony_ci	lvx	v30,r10,$sp
1544e1051a39Sopenharmony_ci	lvx	v31,r11,$sp
1545e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
1546e1051a39Sopenharmony_ci	blr
1547e1051a39Sopenharmony_ci	.long	0
1548e1051a39Sopenharmony_ci	.byte	0,12,0x04,1,0x80,0,3,0
1549e1051a39Sopenharmony_ci	.long	0
1550e1051a39Sopenharmony_ci.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1551e1051a39Sopenharmony_ci___
1552e1051a39Sopenharmony_ci}
1553e1051a39Sopenharmony_ci
# Post-process the accumulated $code one line at a time and print the
# result to STDOUT.  $consts stays true until a line containing
# "Lconsts:" has been seen; while it is true, ".long ... ?tag" lines are
# treated as constant-table entries and re-emitted as .byte lists.
1554e1051a39Sopenharmony_cimy $consts=1;
1555e1051a39Sopenharmony_ciforeach  (split("\n",$code)) {
	# replace every backquoted span with the result of evaluating its
	# contents as Perl (e.g. the frame-offset arithmetic in the template)
1556e1051a39Sopenharmony_ci	s/\`([^\`]*)\`/eval $1/geo;
1557e1051a39Sopenharmony_ci
1558e1051a39Sopenharmony_ci	# constants table endian-specific conversion
1559e1051a39Sopenharmony_ci	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1560e1051a39Sopenharmony_ci	    my $conv=$2;		# trailing ?tag selects the little-endian fix-up
1561e1051a39Sopenharmony_ci	    my @bytes=();
1562e1051a39Sopenharmony_ci
1563e1051a39Sopenharmony_ci	    # convert to endian-agnostic format: each 32-bit value becomes
	    # four bytes, most significant byte first
1564e1051a39Sopenharmony_ci	    foreach (split(/,\s+/,$1)) {
1565e1051a39Sopenharmony_ci		my $l = /^0/?oct:int;	# leading 0: parse with oct() (covers 0x...), else int()
1566e1051a39Sopenharmony_ci		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1567e1051a39Sopenharmony_ci	    }
1568e1051a39Sopenharmony_ci
1569e1051a39Sopenharmony_ci	    # little-endian conversion (only when $flavour ends in "le");
	    # a tag other than ?inv or ?rev leaves @bytes unchanged
1570e1051a39Sopenharmony_ci	    if ($flavour =~ /le$/o) {
1571e1051a39Sopenharmony_ci		SWITCH: for($conv)  {
1572e1051a39Sopenharmony_ci		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };	# xor every byte with 0xf
1573e1051a39Sopenharmony_ci		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };	# reverse the byte order
1574e1051a39Sopenharmony_ci		}
1575e1051a39Sopenharmony_ci	    }
1576e1051a39Sopenharmony_ci
1577e1051a39Sopenharmony_ci	    # emit the entry as a .byte list and skip the instruction fix-ups below
1578e1051a39Sopenharmony_ci	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1579e1051a39Sopenharmony_ci	    next;
1580e1051a39Sopenharmony_ci	}
1581e1051a39Sopenharmony_ci	$consts=0 if (m/Lconsts:/o);	# end of table
1582e1051a39Sopenharmony_ci
1583e1051a39Sopenharmony_ci	# instructions prefixed with '?' are endian-specific and need
1584e1051a39Sopenharmony_ci	# to be adjusted accordingly...
1585e1051a39Sopenharmony_ci	if ($flavour =~ /le$/o) {	# little-endian
	    # At most one rewrite applies per line (the "or" chain stops at
	    # the first substitution that succeeds):
	    #   ?lvsr <-> ?lvsl are exchanged,
	    #   ?vperm  gets its two source operands swapped,
	    #   ?vsldoi gets its sources swapped and its count n turned
	    #           into the text "16-n",
	    #   ?vspltw gets its word index n turned into the text "3-n".
1586e1051a39Sopenharmony_ci	    s/\?lvsr/lvsl/o or
1587e1051a39Sopenharmony_ci	    s/\?lvsl/lvsr/o or
1588e1051a39Sopenharmony_ci	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1589e1051a39Sopenharmony_ci	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1590e1051a39Sopenharmony_ci	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1591e1051a39Sopenharmony_ci	} else {			# big-endian
1592e1051a39Sopenharmony_ci	    s/\?([a-z]+)/$1/o;		# just drop the '?' marker from the first mnemonic
1593e1051a39Sopenharmony_ci	}
1594e1051a39Sopenharmony_ci
1595e1051a39Sopenharmony_ci	print $_,"\n";
1596e1051a39Sopenharmony_ci}
1597e1051a39Sopenharmony_ci
# fail the run if STDOUT cannot be closed cleanly
1598e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1599