#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
# details.

# Command-line handling and output redirection.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 argument-register order is selected by a [nm]asm/mingw64 flavour
# or by an output file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path, trailing separator included; empty
# when the script was started without any directory component.  Written
# as an explicit conditional so a failed match can never leave a stale
# capture in $dir.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

# The translator is looked up next to this script first, then in
# ../../crypto/perlasm relative to it.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator, run
# with the same perl binary ($^X) that is executing this script.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
     or die "can't call $xlate: $!";
*STDOUT=*OUT;

# $code accumulates the whole assembly text; it is printed at the very end.
$code=".text\n";

# Per-mode prefetch distance in bytes used by the page-boundary checks in
# generate_mode; modes without an entry get no such checks.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Enforce the constraint stated above at generation time: the emitted code
# masks with PADLOCK_CHUNK-1, which is only meaningful for powers of two.
die "PADLOCK_CHUNK must be a power of 2 between 32 and 2^20"
    unless $PADLOCK_CHUNK >= 32 && $PADLOCK_CHUNK <= (1<<20)
	&& ($PADLOCK_CHUNK & ($PADLOCK_CHUNK-1)) == 0;

# Register roles inside the generated routines.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

# Raw argument registers, for the routines declared \@abi-omnipotent that
# read their arguments directly.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order

# ---------------------------------------------------------------------
# Stand-alone helper routines.  Each routine is appended to $code from
# its own here-document so that it can carry a Perl-level description;
# the concatenated assembly text is the same as a single here-document
# would produce.  Back-tick expressions are evaluated just before the
# text is printed.
# ---------------------------------------------------------------------

# padlock_capability: probes the CPU with CPUID and returns a feature
# word in %eax.
#   - Leaf 0 must report the vendor string "CentaurHauls" or
#     "  Shanghai  " (the cmp immediates are those strings, four bytes
#     at a time, in %ebx/%edx/%ecx); otherwise 0 is returned.
#   - Leaf 0xC0000000 must report at least 0xC0000001 in %eax;
#     otherwise 0 is returned.
#   - The result is %edx of leaf 0xC0000001 with bit 4 forced to 1.
# %rbx is overwritten by cpuid and therefore parked in %r8.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

___

# padlock_key_bswap: byte-swaps 32-bit words in place, starting at the
# address in the first argument.  The word count is (n+1)*4, where n is
# the 32-bit value stored at offset 240 from that address.
# NOTE(review): offset 240 presumably is the round count of an AES key
# schedule structure -- confirm against the C caller.
$code.=<<___;
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

___

# padlock_verify_context: public wrapper around _padlock_verify_ctx.
# It moves the first argument into the context register, pushes the
# flags, points %rax at .Lpadlock_saved_context, calls the helper and
# finally drops the pushed flags from the stack.
$code.=<<___;
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

___

# _padlock_verify_ctx: internal helper.  Expects the context pointer in
# the context register, %rax pointing at the saved-context slot and the
# caller's pushed flags just above the return address.  When bit 30 of
# those flags is set and the saved context differs from the current one
# it executes pushf/popf (the same sequence padlock_reload_key uses);
# in every case the current context is recorded in the slot.
$code.=<<___;
.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

___

# padlock_reload_key: executes pushf/popf and returns.
# NOTE(review): going by the routine name, rewriting the flags register
# is what makes the unit reload its key -- confirm in the PadLock
# programming guide.
$code.=<<___;
.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

___

# padlock_aes_block: three-argument routine that runs "rep xcryptecb"
# with a count of 1.  %rbx is pointed at offset 32 of the context (key)
# and the context register at offset 16 (control word); the first two
# argument registers are passed through untouched.  %rbx is preserved
# via %r8.
$code.=<<___;
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

___

# padlock_xstore: two-argument routine; copies the second argument into
# %edx and executes xstore.  Whatever the instruction leaves in %eax is
# the return value.
$code.=<<___;
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

___

# padlock_sha1_oneshot: three-argument routine.  The third argument is
# moved to %rcx, the 20-byte state addressed by the first argument is
# copied to a stack buffer (movaps requires it to be 16-byte aligned;
# presumably that is why 128+8 is subtracted from %rsp), "rep xsha1" is
# executed with %rax = 0 and %rdi pointing at the buffer, and the 20
# bytes are copied back.
$code.=<<___;
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

___

# padlock_sha1_blocks: same sequence as padlock_sha1_oneshot, except
# that %rax is set to -1 before "rep xsha1".
$code.=<<___;
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

___

# padlock_sha256_oneshot: like padlock_sha1_oneshot, but with a 32-byte
# state (two xmm registers) and "rep xsha256"; %rax = 0.
$code.=<<___;
.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

___

# padlock_sha256_blocks: same sequence as padlock_sha256_oneshot, except
# that %rax is set to -1 before "rep xsha256".
$code.=<<___;
.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

___

# padlock_sha512_blocks: copies the 64-byte state (four xmm registers)
# to the stack buffer, executes "rep xsha512" and copies the state back.
# Unlike the SHA-1/SHA-256 routines it does not load %rax beforehand.
$code.=<<___;
.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# generate_mode($mode, $opcode)
#
# Appends the assembly for padlock_${mode}_encrypt to the global $code.
#   $mode   - suffix used in the symbol and label names ("ecb", "cbc",
#             "cfb", "ofb" or "ctr32"); it also selects the ctr32-only
#             counter handling, the IV refresh (every mode except ecb
#             and ctr*) and, through %PADLOCK_PREFETCH, the
#             page-boundary prefetch handling.
#   $opcode - last byte of the "rep xcrypt*" instruction.  It is
#             interpolated as a number, so it shows up in decimal in
#             the generated .byte directive.
#
# The generated routine returns 0 in %eax without doing any work when
# the context pointer or the length is not a multiple of 16, and 1
# otherwise.  Data is processed either directly (".L${mode}_aligned")
# or through a bounce buffer carved out of the stack
# (".L${mode}_loop"); the bounce buffer is zeroed before returning.
#
# Relies on the file-level register aliases ($ctx, $out, $inp, $len,
# $chunk), on $PADLOCK_CHUNK and on %PADLOCK_PREFETCH.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);

# Prologue: argument checks, context verification, choice between the
# aligned and the bounce-buffer path, and sizing of the bounce buffer
# (allocated on the stack only when the output is misaligned).
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32 only: limit the first chunk so that the 32-bit counter does not
# cross a PADLOCK_CHUNK boundary within one "rep xcrypt*".
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Modes with a prefetch entry: when this chunk is the last one and the
# data ends closer to a page boundary than the prefetch distance, trim
# the chunk to a multiple of that distance; if nothing is left, go
# straight to the tail handling.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
# Bounce-buffer loop, first half: save the parameters, copy a
# misaligned input chunk to the stack and run "rep xcrypt*" on it.
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
# Chaining modes (everything but ecb and ctr*): copy the IV addressed
# by %rax after the instruction back into the context.
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# ctr32 only: when the two counter bytes tested below are both zero
# after the chunk, add 0x10000 to the byte-swapped counter, i.e. carry
# into its upper half.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
# Bounce-buffer loop, second half: copy the result from the stack to a
# misaligned output, advance the pointers and reduce the length.  The
# flags set by the final "sub" survive the following "mov" and are
# consumed by the branch emitted next.
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
					if (!$PADLOCK_PREFETCH{$mode}) {
# No prefetch handling: simply loop until the length reaches zero.
$code.=<<___;
	jnz	.L${mode}_loop
___
					} else {
# Prefetch handling: keep looping while at least a full chunk remains.
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
# ctr32 only: repeat the page-boundary trimming for the final partial
# chunk and loop once more if anything is left after trimming.
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
# Tail: copy the remaining input to the stack (allocating the space
# first if no bounce buffer exists yet) and feed it through the loop
# from there.
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
					}
# End of the bounce-buffer path: zero the stack area between %rsp and
# %rbp if one was allocated, release it and leave.  The aligned path
# starts right after.
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
# ctr32 only, aligned path: process the data in pieces that never let
# the counter cross a 2^16 boundary, adding 0x10000 to the byte-swapped
# counter after each piece.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
# Aligned path with prefetch handling: when the input ends closer to a
# page boundary than the prefetch distance, split off a remainder (kept
# in %rbp) that is handled separately below.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
# Aligned path: one "rep xcrypt*" straight on the caller's buffers.
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# Aligned path, remainder: copy it to a freshly allocated stack area
# and run it through the bounce-buffer loop, which also takes care of
# zeroing and releasing that area.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
# Epilogue: the exit label returns 1 and drops the flags pushed in the
# prologue; the abort label is reached with %eax still 0.
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

# Instantiate one padlock_<mode>_encrypt routine per cipher mode; the
# second argument is the final opcode byte of the matching instruction.
generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

# Identification string and the 8-byte slot in which _padlock_verify_ctx
# records the most recently used context pointer.
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Replace every back-tick quoted expression with its evaluated value.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe into x86_64-xlate.pl.  close() reports both write
# errors and a non-zero exit status of the translator, so failing here
# keeps a broken or truncated output file from passing as a success.
# For a pure exit-status failure $! is 0, hence the fallback to $?.
close STDOUT
    or die "error closing STDOUT: "
	 . ($! ? $! : "x86_64-xlate.pl exited with wait status $?");
588