# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# |         | OpenSSL 3.0.0-alpha9 | this         | Unit        |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659            | 1 015 625    | cycles/sign |
# |         | 611                  | 1280 / +109% | sign/s      |
# |---------+----------------------+--------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

# Locate the perlasm translator: next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Assembler capability probes.  $avx512ifma ends up true when one of the
# probed tools reports a version at or above the threshold tested below.
# Each later probe is consulted only while $avx512ifma is still false, so a
# positive answer from an earlier probe is never overwritten.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001);
    } else {
        $avx512ifma = ($3>=7.0);
    }
}

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# ossl_rsaz_avx512ifma_eligible: loads the dword at OPENSSL_ia32cap_P+8,
# keeps bits 31, 21, 17 and 16, and returns that mask in %eax when all four
# bits are set; otherwise %eax stays zero.
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
.align	32
ossl_rsaz_avx512ifma_eligible:
    mov OPENSSL_ia32cap_P+8(%rip), %ecx
    xor %eax,%eax
    and \$`1<<31|1<<21|1<<17|1<<16`, %ecx     # avx512vl + avx512ifma + avx512dq + avx512f
    cmp \$`1<<31|1<<21|1<<17|1<<16`, %ecx
    cmove %ecx,%eax
    ret
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
___

###############################################################################
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
#
# AMM is defined as presented in the paper
# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron.
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#        (note, the implementation counts only 52 bits from it).
#
# NB: the AMM implementation does not perform "conditional" subtraction step as
# specified in the original algorithm as according to the paper "Enhanced Montgomery
# Multiplication" by Shay Gueron (see Lemma 1), the result will be always < 2*2^1024
# and can be used as a direct input to the next AMM iteration.
# This post-condition is true, provided the correct parameter |s| is chosen, i.e.
# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
#
# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res,
#                                const BN_ULONG *a,
#                                const BN_ULONG *b,
#                                const BN_ULONG *m,
#                                BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23)));
my $Bi = "%ymm3";
my $Yi = "%ymm4";

# Registers mapping for normalization.
# We can reuse Bi, Yi registers here.
my $TMP = $Bi;
my $mask52x4 = $Yi;
my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28));

# Appends to $code the instruction sequence for one AMM step, i.e. the
# processing of a single digit b[i].
#
# Arguments (all are strings/numbers substituted into the emitted assembly):
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
# _acc         - 64-bit GPR holding the scalar accumulator;
# _R0.._R2     - five ymm accumulators (_R0,_R0h,_R1,_R1h,_R2);
# _k0          - operand (register or memory reference) supplying k0.
#
# The emitted code uses %r10, %r12, %r13 and %rdx as scratch.
sub amm52x20_x1 {
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
# vmovq needs the xmm alias of the lowest accumulator register.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13             # b[i]

    vpbroadcastq    %r13, $Bi                    # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                          # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                            # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                          # acc * k0
    andq    $mask52, %r13                        # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                    # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                          # acc += t0
    adcq    %r12, %r10                           # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq     \$1, $_R0, $_R0h, $_R0
    valignq     \$1, $_R0h, $_R1, $_R0h
    valignq     \$1, $_R1, $_R1h, $_R1
    valignq     \$1, $_R1h, $_R2, $_R1h
    valignq     \$1, $_R2, $zero, $_R2

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}

# Normalization routine: handles carry bits in R0..R2 QWs and
# gets R0..R2 back to normalized 2^52 representation.
#
# Arguments: _acc (scalar accumulator GPR) followed by the five ymm
# accumulators (_R0,_R0h,_R1,_R1h,_R2).  The emitted code expects $mask52x4
# to already hold the 52-bit mask in every qword and $zero to be all zeroes.
#
# Uses %r8-14,%e[bcd]x
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $TMP
    vpblendd        \$3, $TMP, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of R0..R2
    # Save them to LSB of QWs in T0..T2
    vpsrlq    \$52, $_R0,   $T0
    vpsrlq    \$52, $_R0h,  $T0h
    vpsrlq    \$52, $_R1,   $T1
    vpsrlq    \$52, $_R1h,  $T1h
    vpsrlq    \$52, $_R2,   $T2

    # "Shift left" T0..T2 by 1 QW
    valignq   \$3, $T1h,  $T2,  $T2
    valignq   \$3, $T1,   $T1h, $T1h
    valignq   \$3, $T0h,  $T1,  $T1
    valignq   \$3, $T0,   $T0h, $T0h
    valignq   \$3, $zero, $T0,  $T0

    # Drop "carries" from R0..R2 QWs
    vpandq    $mask52x4, $_R0,  $_R0
    vpandq    $mask52x4, $_R0h, $_R0h
    vpandq    $mask52x4, $_R1,  $_R1
    vpandq    $mask52x4, $_R1h, $_R1h
    vpandq    $mask52x4, $_R2,  $_R2

    # Sum R0..R2 with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2

    # Now handle carry bits from this addition
    # Get mask of QWs which 52-bit parts overflow...
    vpcmpuq   \$1, $_R0,  $mask52x4, %k1 # OP=lt
    vpcmpuq   \$1, $_R0h, $mask52x4, %k2
    vpcmpuq   \$1, $_R1,  $mask52x4, %k3
    vpcmpuq   \$1, $_R1h, $mask52x4, %k4
    vpcmpuq   \$1, $_R2,  $mask52x4, %k5
    kmovb   %k1, %r14d                   # k1
    kmovb   %k2, %r13d                   # k1h
    kmovb   %k3, %r12d                   # k2
    kmovb   %k4, %r11d                   # k2h
    kmovb   %k5, %r10d                   # k3

    # ...or saturated
    vpcmpuq   \$0, $_R0,  $mask52x4, %k1 # OP=eq
    vpcmpuq   \$0, $_R0h, $mask52x4, %k2
    vpcmpuq   \$0, $_R1,  $mask52x4, %k3
    vpcmpuq   \$0, $_R1h, $mask52x4, %k4
    vpcmpuq   \$0, $_R2,  $mask52x4, %k5
    kmovb   %k1, %r9d                    # k4
    kmovb   %k2, %r8d                    # k4h
    kmovb   %k3, %ebx                    # k5
    kmovb   %k4, %ecx                    # k5h
    kmovb   %k5, %edx                    # k6

    # Get mask of QWs where carries shall be propagated to.
    # Merge 4-bit masks to 8-bit values to use add with carry.
    shl   \$4, %r13b
    or    %r13b, %r14b
    shl   \$4, %r11b
    or    %r11b, %r12b

    add   %r14b, %r14b
    adc   %r12b, %r12b
    adc   %r10b, %r10b

    shl   \$4, %r8b
    or    %r8b,%r9b
    shl   \$4, %cl
    or    %cl, %bl

    add   %r9b, %r14b
    adc   %bl, %r12b
    adc   %dl, %r10b

    xor   %r9b, %r14b
    xor   %bl, %r12b
    xor   %dl, %r10b

    kmovb   %r14d, %k1
    shr     \$4, %r14b
    kmovb   %r14d, %k2
    kmovb   %r12d, %k3
    shr     \$4, %r12b
    kmovb   %r12d, %k4
    kmovb   %r10d, %k5

    # Add carries according to the obtained mask
    vpsubq  $mask52x4, $_R0,  ${_R0}{%k1}
    vpsubq  $mask52x4, $_R0h, ${_R0h}{%k2}
    vpsubq  $mask52x4, $_R1,  ${_R1}{%k3}
    vpsubq  $mask52x4, $_R1h, ${_R1h}{%k4}
    vpsubq  $mask52x4, $_R2,  ${_R2}{%k5}

    vpandq  $mask52x4, $_R0,  $_R0
    vpandq  $mask52x4, $_R0h, $_R0h
    vpandq  $mask52x4, $_R1,  $_R1
    vpandq  $mask52x4, $_R1h, $_R1h
    vpandq  $mask52x4, $_R2,  $_R2
___
}

# Emit ossl_rsaz_amm52x20_x1_256: prologue saving the callee-saved GPRs,
# accumulator zeroing, the multiplication loop, normalization, result store
# and epilogue.
$code.=<<___;
.text

.globl	ossl_rsaz_amm52x20_x1_256
.type	ossl_rsaz_amm52x20_x1_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc
    endbranch
    push %rbx
.cfi_push   %rbx
    push %rbp
.cfi_push   %rbp
    push %r12
.cfi_push   %r12
    push %r13
.cfi_push   %r13
    push %r14
.cfi_push   %r14
    push %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x1_256_body:

    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 20 digits unrolled by 4
    mov    \$5, $iter

.align 32
.Lloop5:
___
    # Four AMM steps per loop iteration, reading b at byte offsets 0, 8, 16
    # and 24 from $b_ptr; the loop itself advances $b_ptr by 4*8 each pass.
    foreach my $idx (0..3) {
        &amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
    }
$code.=<<___;
    lea    `4*8`($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop5

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vzeroupper
    mov  0(%rsp),%r15
.cfi_restore    %r15
    mov  8(%rsp),%r14
.cfi_restore    %r14
    mov  16(%rsp),%r13
.cfi_restore    %r13
    mov  24(%rsp),%r12
.cfi_restore    %r12
    mov  32(%rsp),%rbp
.cfi_restore    %rbp
    mov  40(%rsp),%rbx
.cfi_restore    %rbx
    lea  48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x1_256_epilogue:
    ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
___

# Constant: the 52-bit mask replicated into four qwords (one ymm register).
$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
#
# See description of ossl_rsaz_amm52x20_x1_256() above for details about Almost
# Montgomery Multiplication algorithm and function input parameters description.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20],
#                                const BN_ULONG a[2][20],
#                                const BN_ULONG b[2][20],
#                                const BN_ULONG m[2][20],
#                                const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
.text

.globl	ossl_rsaz_amm52x20_x2_256
.type	ossl_rsaz_amm52x20_x2_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc
    endbranch
    push %rbx
.cfi_push   %rbx
    push %rbp
.cfi_push   %rbp
    push %r12
.cfi_push   %r12
    push %r13
.cfi_push   %r13
    push %r14
.cfi_push   %r14
    push %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x2_256_body:

    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1

    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov    \$20, $iter

.align 32
.Lloop20:
___
    # One AMM step for each of the two independent inputs per iteration.
    # Here k0 is passed as a memory operand: ($k0) and 8($k0).
    &amm52x20_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
    # 20*8 = offset of the next dimension in two-dimension array
    &amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
    lea    8($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop20

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
    &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vmovdqu64   $R0_1, 160($res)
    vmovdqu64   $R0_1h, 192($res)
    vmovdqu64   $R1_1, 224($res)
    vmovdqu64   $R1_1h, 256($res)
    vmovdqu64   $R2_1, 288($res)

    vzeroupper
    mov  0(%rsp),%r15
.cfi_restore    %r15
    mov  8(%rsp),%r14
.cfi_restore    %r14
    mov  16(%rsp),%r13
.cfi_restore    %r13
    mov  24(%rsp),%r12
.cfi_restore    %r12
    mov  32(%rsp),%rbp
.cfi_restore    %rbp
    mov  40(%rsp),%rbx
.cfi_restore    %rbx
    lea  48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x2_256_epilogue:
    ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values,
# so the |tbl_idx| indicates for which base shall we extract the value.
# |red_table_idx| is a power index.
#
# Extracted value (output) is 20 digit number in 2^52 radix.
#
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
#                                        int red_table_idx,
#                                        int tbl_idx);           # 0 or 1
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI;

my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4));
# xmm alias of t4, used by the vpxor that zeroes it.
my $t4xmm = $t4;
$t4xmm =~ s/%y/%x/;
my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20));
my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23));

# The emitted loop walks every 320-byte table entry, loading 160 bytes from
# each, and keeps the loaded data only for the entry whose running index
# equals |red_table_idx| (selected through mask register %k1).
$code.=<<___;
.text

.align 32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,\@function,4
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
    endbranch
    leaq    ($tbl_idx,$tbl_idx,4), %rax
    salq    \$5, %rax
    addq    %rax, $red_tbl

    vmovdqa64   .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx, $idx
    leaq   `(1<<5)*2*20*8`($red_tbl), %rax  # holds end of the tbl

    vpxor   $t4xmm, $t4xmm, $t4xmm
    vmovdqa64   $t4, $t3                    # zeroing t0..4, cur_idx
    vmovdqa64   $t4, $t2
    vmovdqa64   $t4, $t1
    vmovdqa64   $t4, $t0
    vmovdqa64   $t4, $cur_idx

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx, %k1        # mask of (idx == cur_idx)
    addq    \$320, $red_tbl                 # 320 = 2 * 20 digits * 8 bytes
    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
    vmovdqu64  -320($red_tbl), $tmp0        # load data from red_tbl
    vmovdqu64  -288($red_tbl), $tmp1
    vmovdqu64  -256($red_tbl), $tmp2
    vmovdqu64  -224($red_tbl), $tmp3
    vmovdqu64  -192($red_tbl), $tmp4
    vpblendmq  $tmp0, $t0, ${t0}{%k1}       # extract data when mask is not zero
    vpblendmq  $tmp1, $t1, ${t1}{%k1}
    vpblendmq  $tmp2, $t2, ${t2}{%k1}
    vpblendmq  $tmp3, $t3, ${t3}{%k1}
    vpblendmq  $tmp4, $t4, ${t4}{%k1}
    cmpq    $red_tbl, %rax
    jne .Lloop

    vmovdqu64   $t0, ($out)                 # store t0..4
    vmovdqu64   $t1, 32($out)
    vmovdqu64   $t2, 64($out)
    vmovdqu64   $t3, 96($out)
    vmovdqu64   $t4, 128($out)

    ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
___
}

# Win64 structured-exception-handling support: unwind handler followed by
# the .pdata/.xdata tables.
# NOTE(review): this heredoc continues past the end of the reviewed section;
# the text below is reproduced as found and deliberately left open.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern     __imp_RtlVirtualUnwind
.type   rsaz_def_handler,\@abi-omnipotent
.align  16
rsaz_def_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax # pull context->Rax
    mov     248($context),%rbx # pull context->Rip

    mov     8($disp),%rsi      # disp->ImageBase
    mov     56($disp),%r11     # disp->HandlerData

    mov     0(%r11),%r10d      # HandlerData[0]
    lea     (%rsi,%r10),%r10   # prologue label
    cmp     %r10,%rbx          # context->Rip<.Lprologue
    jb  .Lcommon_seh_tail

    mov     152($context),%rax # pull context->Rsp

    mov     4(%r11),%r10d      # HandlerData[1]
    lea     (%rsi,%r10),%r10   # epilogue label
    cmp     %r10,%rbx          # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    lea     48(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context) # restore context->Rbx
    mov     %rbp,160($context) # restore context->Rbp
    mov     %r12,216($context) # restore context->R12
    mov     %r13,224($context) # restore context->R13
    mov     %r14,232($context) # restore context->R14
    mov     %r15,240($context) # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context) # restore context->Rsp
    mov     %rsi,168($context) # restore context->Rsi
    mov     %rdi,176($context) # restore context->Rdi

    mov     40($disp),%rdi     # disp->ContextRecord
    mov     $context,%rsi      # context
    mov     \$154,%ecx         # sizeof(CONTEXT)
    .long   0xa548f3fc         # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx       # arg2, disp->ImageBase
    mov     0(%rsi),%r8        # arg3, disp->ControlPc
    mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10      # disp->ContextRecord
    lea     56(%rsi),%r11      # &disp->HandlerData
    lea     24(%rsi),%r12      # &disp->EstablisherFrame
    mov     %r10,32(%rsp)      # arg5
    mov     %r11,40(%rsp)      # arg6
    mov     %r12,48(%rsp)      # arg7
    mov     %rcx,56(%rsp)      # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax           # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_def_handler,.-rsaz_def_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_256

    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_256

    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_end_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_info_ossl_extract_multiplier_2x20_win5

.section    .xdata
.align  8
714e1051a39Sopenharmony_ci.LSEH_info_ossl_rsaz_amm52x20_x1_256: 715e1051a39Sopenharmony_ci .byte 9,0,0,0 716e1051a39Sopenharmony_ci .rva rsaz_def_handler 717e1051a39Sopenharmony_ci .rva .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue 718e1051a39Sopenharmony_ci.LSEH_info_ossl_rsaz_amm52x20_x2_256: 719e1051a39Sopenharmony_ci .byte 9,0,0,0 720e1051a39Sopenharmony_ci .rva rsaz_def_handler 721e1051a39Sopenharmony_ci .rva .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue 722e1051a39Sopenharmony_ci.LSEH_info_ossl_extract_multiplier_2x20_win5: 723e1051a39Sopenharmony_ci .byte 9,0,0,0 724e1051a39Sopenharmony_ci .rva rsaz_def_handler 725e1051a39Sopenharmony_ci .rva .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5 726e1051a39Sopenharmony_ci___ 727e1051a39Sopenharmony_ci} 728e1051a39Sopenharmony_ci}}} else {{{ # fallback for old assembler 729e1051a39Sopenharmony_ci$code.=<<___; 730e1051a39Sopenharmony_ci.text 731e1051a39Sopenharmony_ci 732e1051a39Sopenharmony_ci.globl ossl_rsaz_avx512ifma_eligible 733e1051a39Sopenharmony_ci.type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent 734e1051a39Sopenharmony_ciossl_rsaz_avx512ifma_eligible: 735e1051a39Sopenharmony_ci xor %eax,%eax 736e1051a39Sopenharmony_ci ret 737e1051a39Sopenharmony_ci.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible 738e1051a39Sopenharmony_ci 739e1051a39Sopenharmony_ci.globl ossl_rsaz_amm52x20_x1_256 740e1051a39Sopenharmony_ci.globl ossl_rsaz_amm52x20_x2_256 741e1051a39Sopenharmony_ci.globl ossl_extract_multiplier_2x20_win5 742e1051a39Sopenharmony_ci.type ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent 743e1051a39Sopenharmony_ciossl_rsaz_amm52x20_x1_256: 744e1051a39Sopenharmony_ciossl_rsaz_amm52x20_x2_256: 745e1051a39Sopenharmony_ciossl_extract_multiplier_2x20_win5: 746e1051a39Sopenharmony_ci .byte 0x0f,0x0b # ud2 747e1051a39Sopenharmony_ci ret 748e1051a39Sopenharmony_ci.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 
749e1051a39Sopenharmony_ci___ 750e1051a39Sopenharmony_ci}}} 751e1051a39Sopenharmony_ci 752e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval $1/gem; 753e1051a39Sopenharmony_ciprint $code; 754e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; 755