1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# May 2011
18e1051a39Sopenharmony_ci#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and a PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# The vanilla code path is at most 20% faster than compiler-generated
# code [not very impressive], while the PCLMULQDQ one is a whole
# 85%-160% better on 163- and 571-bit ECDH benchmarks on Intel CPUs.
# Keep in mind that these coefficients are not the ones for
# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
29e1051a39Sopenharmony_ci
# Command line: [flavour] [output].
# The trailing argument becomes $output when it looks like a file name
# (it ends in an extension); the leading argument becomes $flavour when
# it contains no dot at all.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop : undef;
$flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift : undef;

# Win64 conventions apply for nasm/masm/mingw64 flavours or a ".asm" output.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless -f $xlate;
-f $xlate or die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open(OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"")
    || die "can't call $xlate: $!";
*STDOUT = *OUT;
45e1051a39Sopenharmony_ci
# Register allocation for _mul_1x1.  $a shares %rax with $lo: the first
# operand arrives in the register that ends up holding the low half of
# the result.
($lo,$hi)=("%rax","%rdx");	$a=$lo;
($i0,$i1)=("%rsi","%rdi");	# table indices (4-bit slices of b)
($t0,$t1)=("%rbx","%rcx");	# scratch for shifted table entries
($b,$mask)=("%rbp","%r8");	# second operand and the slice mask
# Six names receive %r9..%r14; the seventh generated value (%r15) is dropped.
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");	# SSE2 accumulator and temporary

# _mul_1x1: 64x64-bit polynomial multiplication helper.
#   in:   $a (%rax), $b (%rbp), $mask (%r8; the caller loads it with 0xf)
#   out:  $lo (%rax) low 64 bits, $hi (%rdx) high 64 bits of the product
#   Besides the result registers it overwrites $b, $i0, $i1, $t0, $t1,
#   %r9-%r14, %xmm0 and %xmm1, and uses 128+8 bytes of stack.
#
# This first part handles the three top bits of a separately (the
# sar/and/shl/shr sequence seeds $lo:$hi), then builds a 16-entry table at
# 0..120(%rsp) holding every XOR combination of a1, a2, a4 and a8, where
# a1 is a with the three top bits cleared and a2/a4/a8 are its multiples
# by 2/4/8.  It finishes by extracting the first 4-bit slices of b and
# loading the first table entry into $R.
$code.=<<___;
.text

.type	_mul_1x1,\@abi-omnipotent
.align	16
_mul_1x1:
.cfi_startproc
	sub	\$128+8,%rsp
.cfi_adjust_cfa_offset	128+8
	mov	\$-1,$a1
	lea	($a,$a),$i0
	shr	\$3,$a1
	lea	(,$a,4),$i1
	and	$a,$a1			# a1=a&0x1fffffffffffffff
	lea	(,$a,8),$a8
	sar	\$63,$a			# broadcast 63rd bit
	lea	($a1,$a1),$a2
	sar	\$63,$i0		# broadcast 62nd bit
	lea	(,$a1,4),$a4
	and	$b,$a
	sar	\$63,$i1		# broadcast 61st bit
	mov	$a,$hi			# $a is $lo
	shl	\$63,$lo
	and	$b,$i0
	shr	\$1,$hi
	mov	$i0,$t1
	shl	\$62,$i0
	and	$b,$i1
	shr	\$2,$t1
	xor	$i0,$lo
	mov	$i1,$t0
	shl	\$61,$i1
	xor	$t1,$hi
	shr	\$3,$t0
	xor	$i1,$lo
	xor	$t0,$hi

	mov	$a1,$a12
	movq	\$0,0(%rsp)		# tab[0]=0
	xor	$a2,$a12		# a1^a2
	mov	$a1,8(%rsp)		# tab[1]=a1
	 mov	$a4,$a48
	mov	$a2,16(%rsp)		# tab[2]=a2
	 xor	$a8,$a48		# a4^a8
	mov	$a12,24(%rsp)		# tab[3]=a1^a2

	xor	$a4,$a1
	mov	$a4,32(%rsp)		# tab[4]=a4
	xor	$a4,$a2
	mov	$a1,40(%rsp)		# tab[5]=a1^a4
	xor	$a4,$a12
	mov	$a2,48(%rsp)		# tab[6]=a2^a4
	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
	 xor	$a48,$a2		# a2^a4^a4^a8=a2^a8

	mov	$a8,64(%rsp)		# tab[8]=a8
	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
	mov	$a1,72(%rsp)		# tab[9]=a1^a8
	 xor	$a4,$a1			# a1^a8^a4
	mov	$a2,80(%rsp)		# tab[10]=a2^a8
	 xor	$a4,$a2			# a2^a8^a4
	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8

	xor	$a4,$a12		# a1^a2^a8^a4
	mov	$a48,96(%rsp)		# tab[12]=a4^a8
	 mov	$mask,$i0
	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
	 and	$b,$i0
	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
	 shr	\$4,$b
	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
	 mov	$mask,$i1
	 and	$b,$i1
	 shr	\$4,$b

	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
	mov	$mask,$i0
	and	$b,$i0
	shr	\$4,$b
___
    # Seven unrolled rounds, each consuming two 4-bit slices of b: the entry
    # selected by $i1 is shifted by 8*$n-4 bits and folded into $lo:$hi in
    # the integer unit, while the entry selected by $i0 is shifted by $n
    # bytes and folded into $R in SSE2.  $n is deliberately not lexical:
    # the tail below relies on it being 8 once the loop has finished.
    for ($n=1;$n<8;$n++) {
	$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$mask,$i1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	and	$b,$i1
	 movq	(%rsp,$i0,8),$Tx
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	 pslldq	\$$n,$Tx
	 mov	$mask,$i0
	shr	\$4,$b
	xor	$t0,$hi
	 and	$b,$i0
	 shr	\$4,$b
	 pxor	$Tx,$R
___
    }
# Tail: with $n == 8 the last table entry is shifted by 60 (and 4) bits,
# then both 64-bit halves of the SSE2 accumulator are folded into $lo:$hi
# before the table is released.
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	movq	$R,$i0
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	psrldq	\$8,$R
	xor	$t0,$hi
	movq	$R,$i1
	xor	$i0,$lo
	xor	$i1,$hi

	add	\$128+8,%rsp
.cfi_adjust_cfa_offset	-128-8
	ret
.Lend_mul_1x1:
.cfi_endproc
.size	_mul_1x1,.-_mul_1x1
___
173e1051a39Sopenharmony_ci
# Argument registers of bn_GF2m_mul_2x2, in order: $rp, $a1, $a0, $b1, $b0.
# $a1 is re-bound here; the _mul_1x1 text above was already interpolated
# with its earlier value.  On Win64 both code paths below load $b0 from the
# stack (40(%rsp) on entry), so "%r10" only names the register the vanilla
# path loads it into.
($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order

# Entry point.  The entry %rsp is copied to %rax, then bit 33 of
# OPENSSL_ia32cap_P is tested: when it is clear, control goes to the
# table-driven path at .Lvanilla_mul_2x2, otherwise it falls through to the
# PCLMULQDQ path, which starts by moving the operands into %xmm registers.
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
.cfi_startproc
	mov	%rsp,%rax
	mov	OPENSSL_ia32cap_P(%rip),%r10
	bt	\$33,%r10
	jnc	.Lvanilla_mul_2x2

	movq		$a1,%xmm0
	movq		$b1,%xmm1
	movq		$a0,%xmm2
___
# Win64: b0 is read from the caller's stack.
$code.=<<___ if ($win64);
	movq		40(%rsp),%xmm3
___
# Elsewhere: b0 is read from its argument register.
$code.=<<___ if (!$win64);
	movq		$b0,%xmm3
___
# PCLMULQDQ path: three multiplications (a1·b1, a0·b0, (a0+a1)·(b0+b1))
# are combined and the 32-byte result is stored at $rp.  After its `ret`
# comes the vanilla path's frame allocation (8*17 bytes).
$code.=<<___;
	movdqa		%xmm0,%xmm4
	movdqa		%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
	pxor		%xmm2,%xmm4
	pxor		%xmm3,%xmm5
	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
	xorps		%xmm0,%xmm4
	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
	movdqa		%xmm4,%xmm5
	pslldq		\$8,%xmm4
	psrldq		\$8,%xmm5
	pxor		%xmm4,%xmm2
	pxor		%xmm5,%xmm0
	movdqu		%xmm2,0($rp)
	movdqu		%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	8*17
___
# Win64 vanilla path: fetch b0 from above the freshly allocated frame and
# save %rdi/%rsi in the frame (both are used as $i1/$i0 by _mul_1x1).
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
# Save %r14,%r13,%r12,%rbp,%rbx in the frame (all of them are overwritten
# by _mul_1x1), spill the five arguments, and call _mul_1x1 three times.
# The a1·b1 product is kept at 16/24(%rsp) and a0·b0 at 0/8(%rsp); the third
# product stays in $lo:$hi.
$code.=<<___;
	mov	%r14,8*10(%rsp)
.cfi_rel_offset	%r14,8*10
	mov	%r13,8*11(%rsp)
.cfi_rel_offset	%r13,8*11
	mov	%r12,8*12(%rsp)
.cfi_rel_offset	%r12,8*12
	mov	%rbp,8*13(%rsp)
.cfi_rel_offset	%rbp,8*13
	mov	%rbx,8*14(%rsp)
.cfi_rel_offset	%rbx,8*14
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1·b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0·b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a
	xor	64(%rsp),$b
	call	_mul_1x1		# (a0+a1)·(b0+b1)
___
	# @r[0],@r[1] receive the stored a0·b0 halves (low, high) and
	# @r[2],@r[3] the stored a1·b1 halves; %rbp receives the saved $rp.
	@r=("%rbx","%rcx","%rdi","%rsi");
# Combine the three products into the four result words written at
# 0..24(%rbp), then restore the registers saved in the frame.
$code.=<<___;
	mov	0(%rsp),@r[0]
	mov	8(%rsp),@r[1]
	mov	16(%rsp),@r[2]
	mov	24(%rsp),@r[3]
	mov	32(%rsp),%rbp

	xor	$hi,$lo
	xor	@r[1],$hi
	xor	@r[0],$lo
	mov	@r[0],0(%rbp)
	xor	@r[2],$hi
	mov	@r[3],24(%rbp)
	xor	@r[3],$lo
	xor	@r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14
.cfi_restore	%r14
	mov	8*11(%rsp),%r13
.cfi_restore	%r13
	mov	8*12(%rsp),%r12
.cfi_restore	%r12
	mov	8*13(%rsp),%rbp
.cfi_restore	%rbp
	mov	8*14(%rsp),%rbx
.cfi_restore	%rbx
___
# Win64 only: restore %rdi/%rsi saved at the top of the vanilla path.
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
# Release the frame and return.  .Lepilogue_mul_2x2 and .Lend_mul_2x2 are
# referenced by the Win64 exception handler and its .pdata entry.
$code.=<<___;
	lea	8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	-8*17
.Lepilogue_mul_2x2:
	ret
.Lend_mul_2x2:
.cfi_endproc
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
311e1051a39Sopenharmony_ci
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: exception-unwind support.  se_handler is attached (through
# .LSEH_info_2x2) to the .Lvanilla_mul_2x2 ... .Lend_mul_2x2 range, while
# _mul_1x1 is described by static unwind codes in .LSEH_info_1x1.
if ($win64) {
# Handler argument registers; only $context and $disp are referenced in
# the generated text below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Handler outline, as implemented below:
#  - Rip before .Lbody_mul_2x2: context->Rsp is set from context->Rax
#    (bn_GF2m_mul_2x2 copies its entry %rsp into %rax);
#  - Rip at or after .Lepilogue_mul_2x2: context->Rsp is left as it is;
#  - otherwise: %r14,%r13,%r12,%rbp,%rbx,%rdi,%rsi are reloaded from the
#    8*17-byte frame into the context and Rsp is advanced past the frame.
# The context is then copied to disp->ContextRecord, RtlVirtualUnwind is
# called, and ExceptionContinueSearch (1) is returned.
$code.=<<___;
.extern __imp_RtlVirtualUnwind

.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	mov	8*10(%rax),%r14		# mimic epilogue
	mov	8*11(%rax),%r13
	mov	8*12(%rax),%r12
	mov	8*13(%rax),%rbp
	mov	8*14(%rax),%rbx
	mov	8*15(%rax),%rdi
	mov	8*16(%rax),%rsi

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	lea	8*17(%rax),%rax

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	_mul_1x1
	.rva	.Lend_mul_1x1
	.rva	.LSEH_info_1x1

	.rva	.Lvanilla_mul_2x2
	.rva	.Lend_mul_2x2
	.rva	.LSEH_info_2x2
.section	.xdata
.align	8
.LSEH_info_1x1:
	.byte	0x01,0x07,0x02,0x00
	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
.LSEH_info_2x2:
	.byte	9,0,0,0
	.rva	se_handler
___
}
423e1051a39Sopenharmony_ci
# Replace every `...` expression embedded in the generated text with its
# evaluated value, send the result to the translator attached to STDOUT,
# and fail loudly if the pipe cannot be closed cleanly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print STDOUT $code;
close(STDOUT) || die "error closing STDOUT: $!";
427