#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
# respectable 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 conventions are selected either by the flavour name or by an
# output file ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path, including the trailing separator.
# When $0 has no directory component the match fails; use an explicit
# empty prefix instead of whatever $1 happens to hold from an earlier match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the
# ../../perlasm directory relative to it.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything subsequently printed to STDOUT is piped through the
# translator, which receives the flavour and the output file name.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Decide whether the toolchain is recent enough for the code guarded by
# $addx (see the "August 2013" note above).  Each probe only runs if the
# previous ones did not already enable it.

# GNU assembler, as reported through the C compiler driver: 2.23 or later.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

# NASM on Win64: 2.10 or later.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

# Microsoft ml64 on Win64: major version 12 or later.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# clang/LLVM: 3.3 or later.  The minor number is scaled by 1/100 so that
# two-digit minors compare correctly.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# Register aliases for the arguments of the generated routine.
# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
# Remaining argument registers of bn_mul_mont, followed by the scratch
# registers used by the generic (non-unrolled) code path below.
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

# Emit bn_mul_mont.
#
# Entry dispatch, as coded below:
#   - num not a multiple of 4, or num < 8  -> generic path at .Lmul_enter;
#   - otherwise, ap != bp                  -> .Lmul4x_enter;
#   - otherwise, num a multiple of 8       -> .Lsqr8x_enter
#     (label is not defined in this part of the file);
#   - otherwise                            -> .Lmul4x_enter.
#
# The generic path saves the six callee-saved registers, carves a
# temporary vector tp[] out of the stack (rounded down to a 1024-byte
# boundary and touched one 4096-byte page at a time), stores the caller's
# stack pointer in tp[num+1], runs the first pass (.L1st) and the outer
# and inner loops (.Louter/.Linner), then subtracts np[] from tp[] into
# rp[] (.Lsub) and uses the outcome of the final borrow as an AND mask to
# keep either rp[i] or tp[i] (.Lcopy) while overwriting tp[].  It returns
# 1 in %rax.
#
# Text inside the heredocs is emitted verbatim, so explanatory notes are
# kept out here as Perl comments.
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
# Only when the assembler supports it: load the capability word that is
# tested right after .Lmul4x_enter.
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		# The emitted code has just copied bp into %r12; from here
		# on $bp must name that register.
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
# Register pairs used by the 4x-unrolled code that follows.
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
369e1051a39Sopenharmony_ci$code.=<<___; 370e1051a39Sopenharmony_ci.type bn_mul4x_mont,\@function,6 371e1051a39Sopenharmony_ci.align 16 372e1051a39Sopenharmony_cibn_mul4x_mont: 373e1051a39Sopenharmony_ci.cfi_startproc 374e1051a39Sopenharmony_ci mov ${num}d,${num}d 375e1051a39Sopenharmony_ci mov %rsp,%rax 376e1051a39Sopenharmony_ci.cfi_def_cfa_register %rax 377e1051a39Sopenharmony_ci.Lmul4x_enter: 378e1051a39Sopenharmony_ci___ 379e1051a39Sopenharmony_ci$code.=<<___ if ($addx); 380e1051a39Sopenharmony_ci and \$0x80100,%r11d 381e1051a39Sopenharmony_ci cmp \$0x80100,%r11d 382e1051a39Sopenharmony_ci je .Lmulx4x_enter 383e1051a39Sopenharmony_ci___ 384e1051a39Sopenharmony_ci$code.=<<___; 385e1051a39Sopenharmony_ci push %rbx 386e1051a39Sopenharmony_ci.cfi_push %rbx 387e1051a39Sopenharmony_ci push %rbp 388e1051a39Sopenharmony_ci.cfi_push %rbp 389e1051a39Sopenharmony_ci push %r12 390e1051a39Sopenharmony_ci.cfi_push %r12 391e1051a39Sopenharmony_ci push %r13 392e1051a39Sopenharmony_ci.cfi_push %r13 393e1051a39Sopenharmony_ci push %r14 394e1051a39Sopenharmony_ci.cfi_push %r14 395e1051a39Sopenharmony_ci push %r15 396e1051a39Sopenharmony_ci.cfi_push %r15 397e1051a39Sopenharmony_ci 398e1051a39Sopenharmony_ci neg $num 399e1051a39Sopenharmony_ci mov %rsp,%r11 400e1051a39Sopenharmony_ci lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) 401e1051a39Sopenharmony_ci neg $num # restore 402e1051a39Sopenharmony_ci and \$-1024,%r10 # minimize TLB usage 403e1051a39Sopenharmony_ci 404e1051a39Sopenharmony_ci sub %r10,%r11 405e1051a39Sopenharmony_ci and \$-4096,%r11 406e1051a39Sopenharmony_ci lea (%r10,%r11),%rsp 407e1051a39Sopenharmony_ci mov (%rsp),%r11 408e1051a39Sopenharmony_ci cmp %r10,%rsp 409e1051a39Sopenharmony_ci ja .Lmul4x_page_walk 410e1051a39Sopenharmony_ci jmp .Lmul4x_page_walk_done 411e1051a39Sopenharmony_ci 412e1051a39Sopenharmony_ci.Lmul4x_page_walk: 413e1051a39Sopenharmony_ci lea -4096(%rsp),%rsp 414e1051a39Sopenharmony_ci mov (%rsp),%r11 415e1051a39Sopenharmony_ci cmp 
%r10,%rsp 416e1051a39Sopenharmony_ci ja .Lmul4x_page_walk 417e1051a39Sopenharmony_ci.Lmul4x_page_walk_done: 418e1051a39Sopenharmony_ci 419e1051a39Sopenharmony_ci mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 420e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 421e1051a39Sopenharmony_ci.Lmul4x_body: 422e1051a39Sopenharmony_ci mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 423e1051a39Sopenharmony_ci mov %rdx,%r12 # reassign $bp 424e1051a39Sopenharmony_ci___ 425e1051a39Sopenharmony_ci $bp="%r12"; 426e1051a39Sopenharmony_ci$code.=<<___; 427e1051a39Sopenharmony_ci mov ($n0),$n0 # pull n0[0] value 428e1051a39Sopenharmony_ci mov ($bp),$m0 # m0=bp[0] 429e1051a39Sopenharmony_ci mov ($ap),%rax 430e1051a39Sopenharmony_ci 431e1051a39Sopenharmony_ci xor $i,$i # i=0 432e1051a39Sopenharmony_ci xor $j,$j # j=0 433e1051a39Sopenharmony_ci 434e1051a39Sopenharmony_ci mov $n0,$m1 435e1051a39Sopenharmony_ci mulq $m0 # ap[0]*bp[0] 436e1051a39Sopenharmony_ci mov %rax,$A[0] 437e1051a39Sopenharmony_ci mov ($np),%rax 438e1051a39Sopenharmony_ci 439e1051a39Sopenharmony_ci imulq $A[0],$m1 # "tp[0]"*n0 440e1051a39Sopenharmony_ci mov %rdx,$A[1] 441e1051a39Sopenharmony_ci 442e1051a39Sopenharmony_ci mulq $m1 # np[0]*m1 443e1051a39Sopenharmony_ci add %rax,$A[0] # discarded 444e1051a39Sopenharmony_ci mov 8($ap),%rax 445e1051a39Sopenharmony_ci adc \$0,%rdx 446e1051a39Sopenharmony_ci mov %rdx,$N[1] 447e1051a39Sopenharmony_ci 448e1051a39Sopenharmony_ci mulq $m0 449e1051a39Sopenharmony_ci add %rax,$A[1] 450e1051a39Sopenharmony_ci mov 8($np),%rax 451e1051a39Sopenharmony_ci adc \$0,%rdx 452e1051a39Sopenharmony_ci mov %rdx,$A[0] 453e1051a39Sopenharmony_ci 454e1051a39Sopenharmony_ci mulq $m1 455e1051a39Sopenharmony_ci add %rax,$N[1] 456e1051a39Sopenharmony_ci mov 16($ap),%rax 457e1051a39Sopenharmony_ci adc \$0,%rdx 458e1051a39Sopenharmony_ci add $A[1],$N[1] 459e1051a39Sopenharmony_ci lea 4($j),$j # j++ 460e1051a39Sopenharmony_ci adc \$0,%rdx 461e1051a39Sopenharmony_ci mov $N[1],(%rsp) 
462e1051a39Sopenharmony_ci mov %rdx,$N[0] 463e1051a39Sopenharmony_ci jmp .L1st4x 464e1051a39Sopenharmony_ci.align 16 465e1051a39Sopenharmony_ci.L1st4x: 466e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 467e1051a39Sopenharmony_ci add %rax,$A[0] 468e1051a39Sopenharmony_ci mov -16($np,$j,8),%rax 469e1051a39Sopenharmony_ci adc \$0,%rdx 470e1051a39Sopenharmony_ci mov %rdx,$A[1] 471e1051a39Sopenharmony_ci 472e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 473e1051a39Sopenharmony_ci add %rax,$N[0] 474e1051a39Sopenharmony_ci mov -8($ap,$j,8),%rax 475e1051a39Sopenharmony_ci adc \$0,%rdx 476e1051a39Sopenharmony_ci add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 477e1051a39Sopenharmony_ci adc \$0,%rdx 478e1051a39Sopenharmony_ci mov $N[0],-24(%rsp,$j,8) # tp[j-1] 479e1051a39Sopenharmony_ci mov %rdx,$N[1] 480e1051a39Sopenharmony_ci 481e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 482e1051a39Sopenharmony_ci add %rax,$A[1] 483e1051a39Sopenharmony_ci mov -8($np,$j,8),%rax 484e1051a39Sopenharmony_ci adc \$0,%rdx 485e1051a39Sopenharmony_ci mov %rdx,$A[0] 486e1051a39Sopenharmony_ci 487e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 488e1051a39Sopenharmony_ci add %rax,$N[1] 489e1051a39Sopenharmony_ci mov ($ap,$j,8),%rax 490e1051a39Sopenharmony_ci adc \$0,%rdx 491e1051a39Sopenharmony_ci add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 492e1051a39Sopenharmony_ci adc \$0,%rdx 493e1051a39Sopenharmony_ci mov $N[1],-16(%rsp,$j,8) # tp[j-1] 494e1051a39Sopenharmony_ci mov %rdx,$N[0] 495e1051a39Sopenharmony_ci 496e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 497e1051a39Sopenharmony_ci add %rax,$A[0] 498e1051a39Sopenharmony_ci mov ($np,$j,8),%rax 499e1051a39Sopenharmony_ci adc \$0,%rdx 500e1051a39Sopenharmony_ci mov %rdx,$A[1] 501e1051a39Sopenharmony_ci 502e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 503e1051a39Sopenharmony_ci add %rax,$N[0] 504e1051a39Sopenharmony_ci mov 8($ap,$j,8),%rax 505e1051a39Sopenharmony_ci adc \$0,%rdx 506e1051a39Sopenharmony_ci add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 507e1051a39Sopenharmony_ci 
adc \$0,%rdx 508e1051a39Sopenharmony_ci mov $N[0],-8(%rsp,$j,8) # tp[j-1] 509e1051a39Sopenharmony_ci mov %rdx,$N[1] 510e1051a39Sopenharmony_ci 511e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 512e1051a39Sopenharmony_ci add %rax,$A[1] 513e1051a39Sopenharmony_ci mov 8($np,$j,8),%rax 514e1051a39Sopenharmony_ci adc \$0,%rdx 515e1051a39Sopenharmony_ci lea 4($j),$j # j++ 516e1051a39Sopenharmony_ci mov %rdx,$A[0] 517e1051a39Sopenharmony_ci 518e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 519e1051a39Sopenharmony_ci add %rax,$N[1] 520e1051a39Sopenharmony_ci mov -16($ap,$j,8),%rax 521e1051a39Sopenharmony_ci adc \$0,%rdx 522e1051a39Sopenharmony_ci add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 523e1051a39Sopenharmony_ci adc \$0,%rdx 524e1051a39Sopenharmony_ci mov $N[1],-32(%rsp,$j,8) # tp[j-1] 525e1051a39Sopenharmony_ci mov %rdx,$N[0] 526e1051a39Sopenharmony_ci cmp $num,$j 527e1051a39Sopenharmony_ci jb .L1st4x 528e1051a39Sopenharmony_ci 529e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 530e1051a39Sopenharmony_ci add %rax,$A[0] 531e1051a39Sopenharmony_ci mov -16($np,$j,8),%rax 532e1051a39Sopenharmony_ci adc \$0,%rdx 533e1051a39Sopenharmony_ci mov %rdx,$A[1] 534e1051a39Sopenharmony_ci 535e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 536e1051a39Sopenharmony_ci add %rax,$N[0] 537e1051a39Sopenharmony_ci mov -8($ap,$j,8),%rax 538e1051a39Sopenharmony_ci adc \$0,%rdx 539e1051a39Sopenharmony_ci add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 540e1051a39Sopenharmony_ci adc \$0,%rdx 541e1051a39Sopenharmony_ci mov $N[0],-24(%rsp,$j,8) # tp[j-1] 542e1051a39Sopenharmony_ci mov %rdx,$N[1] 543e1051a39Sopenharmony_ci 544e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[0] 545e1051a39Sopenharmony_ci add %rax,$A[1] 546e1051a39Sopenharmony_ci mov -8($np,$j,8),%rax 547e1051a39Sopenharmony_ci adc \$0,%rdx 548e1051a39Sopenharmony_ci mov %rdx,$A[0] 549e1051a39Sopenharmony_ci 550e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 551e1051a39Sopenharmony_ci add %rax,$N[1] 552e1051a39Sopenharmony_ci mov ($ap),%rax # ap[0] 
553e1051a39Sopenharmony_ci adc \$0,%rdx 554e1051a39Sopenharmony_ci add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 555e1051a39Sopenharmony_ci adc \$0,%rdx 556e1051a39Sopenharmony_ci mov $N[1],-16(%rsp,$j,8) # tp[j-1] 557e1051a39Sopenharmony_ci mov %rdx,$N[0] 558e1051a39Sopenharmony_ci 559e1051a39Sopenharmony_ci xor $N[1],$N[1] 560e1051a39Sopenharmony_ci add $A[0],$N[0] 561e1051a39Sopenharmony_ci adc \$0,$N[1] 562e1051a39Sopenharmony_ci mov $N[0],-8(%rsp,$j,8) 563e1051a39Sopenharmony_ci mov $N[1],(%rsp,$j,8) # store upmost overflow bit 564e1051a39Sopenharmony_ci 565e1051a39Sopenharmony_ci lea 1($i),$i # i++ 566e1051a39Sopenharmony_ci.align 4 567e1051a39Sopenharmony_ci.Louter4x: 568e1051a39Sopenharmony_ci mov ($bp,$i,8),$m0 # m0=bp[i] 569e1051a39Sopenharmony_ci xor $j,$j # j=0 570e1051a39Sopenharmony_ci mov (%rsp),$A[0] 571e1051a39Sopenharmony_ci mov $n0,$m1 572e1051a39Sopenharmony_ci mulq $m0 # ap[0]*bp[i] 573e1051a39Sopenharmony_ci add %rax,$A[0] # ap[0]*bp[i]+tp[0] 574e1051a39Sopenharmony_ci mov ($np),%rax 575e1051a39Sopenharmony_ci adc \$0,%rdx 576e1051a39Sopenharmony_ci 577e1051a39Sopenharmony_ci imulq $A[0],$m1 # tp[0]*n0 578e1051a39Sopenharmony_ci mov %rdx,$A[1] 579e1051a39Sopenharmony_ci 580e1051a39Sopenharmony_ci mulq $m1 # np[0]*m1 581e1051a39Sopenharmony_ci add %rax,$A[0] # "$N[0]", discarded 582e1051a39Sopenharmony_ci mov 8($ap),%rax 583e1051a39Sopenharmony_ci adc \$0,%rdx 584e1051a39Sopenharmony_ci mov %rdx,$N[1] 585e1051a39Sopenharmony_ci 586e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 587e1051a39Sopenharmony_ci add %rax,$A[1] 588e1051a39Sopenharmony_ci mov 8($np),%rax 589e1051a39Sopenharmony_ci adc \$0,%rdx 590e1051a39Sopenharmony_ci add 8(%rsp),$A[1] # +tp[1] 591e1051a39Sopenharmony_ci adc \$0,%rdx 592e1051a39Sopenharmony_ci mov %rdx,$A[0] 593e1051a39Sopenharmony_ci 594e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 595e1051a39Sopenharmony_ci add %rax,$N[1] 596e1051a39Sopenharmony_ci mov 16($ap),%rax 597e1051a39Sopenharmony_ci adc \$0,%rdx 
598e1051a39Sopenharmony_ci add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 599e1051a39Sopenharmony_ci lea 4($j),$j # j+=2 600e1051a39Sopenharmony_ci adc \$0,%rdx 601e1051a39Sopenharmony_ci mov $N[1],(%rsp) # tp[j-1] 602e1051a39Sopenharmony_ci mov %rdx,$N[0] 603e1051a39Sopenharmony_ci jmp .Linner4x 604e1051a39Sopenharmony_ci.align 16 605e1051a39Sopenharmony_ci.Linner4x: 606e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 607e1051a39Sopenharmony_ci add %rax,$A[0] 608e1051a39Sopenharmony_ci mov -16($np,$j,8),%rax 609e1051a39Sopenharmony_ci adc \$0,%rdx 610e1051a39Sopenharmony_ci add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 611e1051a39Sopenharmony_ci adc \$0,%rdx 612e1051a39Sopenharmony_ci mov %rdx,$A[1] 613e1051a39Sopenharmony_ci 614e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 615e1051a39Sopenharmony_ci add %rax,$N[0] 616e1051a39Sopenharmony_ci mov -8($ap,$j,8),%rax 617e1051a39Sopenharmony_ci adc \$0,%rdx 618e1051a39Sopenharmony_ci add $A[0],$N[0] 619e1051a39Sopenharmony_ci adc \$0,%rdx 620e1051a39Sopenharmony_ci mov $N[0],-24(%rsp,$j,8) # tp[j-1] 621e1051a39Sopenharmony_ci mov %rdx,$N[1] 622e1051a39Sopenharmony_ci 623e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 624e1051a39Sopenharmony_ci add %rax,$A[1] 625e1051a39Sopenharmony_ci mov -8($np,$j,8),%rax 626e1051a39Sopenharmony_ci adc \$0,%rdx 627e1051a39Sopenharmony_ci add -8(%rsp,$j,8),$A[1] 628e1051a39Sopenharmony_ci adc \$0,%rdx 629e1051a39Sopenharmony_ci mov %rdx,$A[0] 630e1051a39Sopenharmony_ci 631e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 632e1051a39Sopenharmony_ci add %rax,$N[1] 633e1051a39Sopenharmony_ci mov ($ap,$j,8),%rax 634e1051a39Sopenharmony_ci adc \$0,%rdx 635e1051a39Sopenharmony_ci add $A[1],$N[1] 636e1051a39Sopenharmony_ci adc \$0,%rdx 637e1051a39Sopenharmony_ci mov $N[1],-16(%rsp,$j,8) # tp[j-1] 638e1051a39Sopenharmony_ci mov %rdx,$N[0] 639e1051a39Sopenharmony_ci 640e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 641e1051a39Sopenharmony_ci add %rax,$A[0] 642e1051a39Sopenharmony_ci mov ($np,$j,8),%rax 
643e1051a39Sopenharmony_ci adc \$0,%rdx 644e1051a39Sopenharmony_ci add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 645e1051a39Sopenharmony_ci adc \$0,%rdx 646e1051a39Sopenharmony_ci mov %rdx,$A[1] 647e1051a39Sopenharmony_ci 648e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 649e1051a39Sopenharmony_ci add %rax,$N[0] 650e1051a39Sopenharmony_ci mov 8($ap,$j,8),%rax 651e1051a39Sopenharmony_ci adc \$0,%rdx 652e1051a39Sopenharmony_ci add $A[0],$N[0] 653e1051a39Sopenharmony_ci adc \$0,%rdx 654e1051a39Sopenharmony_ci mov $N[0],-8(%rsp,$j,8) # tp[j-1] 655e1051a39Sopenharmony_ci mov %rdx,$N[1] 656e1051a39Sopenharmony_ci 657e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 658e1051a39Sopenharmony_ci add %rax,$A[1] 659e1051a39Sopenharmony_ci mov 8($np,$j,8),%rax 660e1051a39Sopenharmony_ci adc \$0,%rdx 661e1051a39Sopenharmony_ci add 8(%rsp,$j,8),$A[1] 662e1051a39Sopenharmony_ci adc \$0,%rdx 663e1051a39Sopenharmony_ci lea 4($j),$j # j++ 664e1051a39Sopenharmony_ci mov %rdx,$A[0] 665e1051a39Sopenharmony_ci 666e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 667e1051a39Sopenharmony_ci add %rax,$N[1] 668e1051a39Sopenharmony_ci mov -16($ap,$j,8),%rax 669e1051a39Sopenharmony_ci adc \$0,%rdx 670e1051a39Sopenharmony_ci add $A[1],$N[1] 671e1051a39Sopenharmony_ci adc \$0,%rdx 672e1051a39Sopenharmony_ci mov $N[1],-32(%rsp,$j,8) # tp[j-1] 673e1051a39Sopenharmony_ci mov %rdx,$N[0] 674e1051a39Sopenharmony_ci cmp $num,$j 675e1051a39Sopenharmony_ci jb .Linner4x 676e1051a39Sopenharmony_ci 677e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 678e1051a39Sopenharmony_ci add %rax,$A[0] 679e1051a39Sopenharmony_ci mov -16($np,$j,8),%rax 680e1051a39Sopenharmony_ci adc \$0,%rdx 681e1051a39Sopenharmony_ci add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] 682e1051a39Sopenharmony_ci adc \$0,%rdx 683e1051a39Sopenharmony_ci mov %rdx,$A[1] 684e1051a39Sopenharmony_ci 685e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 686e1051a39Sopenharmony_ci add %rax,$N[0] 687e1051a39Sopenharmony_ci mov -8($ap,$j,8),%rax 688e1051a39Sopenharmony_ci adc 
\$0,%rdx 689e1051a39Sopenharmony_ci add $A[0],$N[0] 690e1051a39Sopenharmony_ci adc \$0,%rdx 691e1051a39Sopenharmony_ci mov $N[0],-24(%rsp,$j,8) # tp[j-1] 692e1051a39Sopenharmony_ci mov %rdx,$N[1] 693e1051a39Sopenharmony_ci 694e1051a39Sopenharmony_ci mulq $m0 # ap[j]*bp[i] 695e1051a39Sopenharmony_ci add %rax,$A[1] 696e1051a39Sopenharmony_ci mov -8($np,$j,8),%rax 697e1051a39Sopenharmony_ci adc \$0,%rdx 698e1051a39Sopenharmony_ci add -8(%rsp,$j,8),$A[1] 699e1051a39Sopenharmony_ci adc \$0,%rdx 700e1051a39Sopenharmony_ci lea 1($i),$i # i++ 701e1051a39Sopenharmony_ci mov %rdx,$A[0] 702e1051a39Sopenharmony_ci 703e1051a39Sopenharmony_ci mulq $m1 # np[j]*m1 704e1051a39Sopenharmony_ci add %rax,$N[1] 705e1051a39Sopenharmony_ci mov ($ap),%rax # ap[0] 706e1051a39Sopenharmony_ci adc \$0,%rdx 707e1051a39Sopenharmony_ci add $A[1],$N[1] 708e1051a39Sopenharmony_ci adc \$0,%rdx 709e1051a39Sopenharmony_ci mov $N[1],-16(%rsp,$j,8) # tp[j-1] 710e1051a39Sopenharmony_ci mov %rdx,$N[0] 711e1051a39Sopenharmony_ci 712e1051a39Sopenharmony_ci xor $N[1],$N[1] 713e1051a39Sopenharmony_ci add $A[0],$N[0] 714e1051a39Sopenharmony_ci adc \$0,$N[1] 715e1051a39Sopenharmony_ci add (%rsp,$num,8),$N[0] # pull upmost overflow bit 716e1051a39Sopenharmony_ci adc \$0,$N[1] 717e1051a39Sopenharmony_ci mov $N[0],-8(%rsp,$j,8) 718e1051a39Sopenharmony_ci mov $N[1],(%rsp,$j,8) # store upmost overflow bit 719e1051a39Sopenharmony_ci 720e1051a39Sopenharmony_ci cmp $num,$i 721e1051a39Sopenharmony_ci jb .Louter4x 722e1051a39Sopenharmony_ci___ 723e1051a39Sopenharmony_ci{ 724e1051a39Sopenharmony_cimy @ri=("%rax","%rdx",$m0,$m1); 725e1051a39Sopenharmony_ci$code.=<<___; 726e1051a39Sopenharmony_ci mov 16(%rsp,$num,8),$rp # restore $rp 727e1051a39Sopenharmony_ci lea -4($num),$j 728e1051a39Sopenharmony_ci mov 0(%rsp),@ri[0] # tp[0] 729e1051a39Sopenharmony_ci mov 8(%rsp),@ri[1] # tp[1] 730e1051a39Sopenharmony_ci shr \$2,$j # j=num/4-1 731e1051a39Sopenharmony_ci lea (%rsp),$ap # borrow ap for tp 732e1051a39Sopenharmony_ci xor 
$i,$i # i=0 and clear CF! 733e1051a39Sopenharmony_ci 734e1051a39Sopenharmony_ci sub 0($np),@ri[0] 735e1051a39Sopenharmony_ci mov 16($ap),@ri[2] # tp[2] 736e1051a39Sopenharmony_ci mov 24($ap),@ri[3] # tp[3] 737e1051a39Sopenharmony_ci sbb 8($np),@ri[1] 738e1051a39Sopenharmony_ci 739e1051a39Sopenharmony_ci.Lsub4x: 740e1051a39Sopenharmony_ci mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 741e1051a39Sopenharmony_ci mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 742e1051a39Sopenharmony_ci sbb 16($np,$i,8),@ri[2] 743e1051a39Sopenharmony_ci mov 32($ap,$i,8),@ri[0] # tp[i+1] 744e1051a39Sopenharmony_ci mov 40($ap,$i,8),@ri[1] 745e1051a39Sopenharmony_ci sbb 24($np,$i,8),@ri[3] 746e1051a39Sopenharmony_ci mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 747e1051a39Sopenharmony_ci mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 748e1051a39Sopenharmony_ci sbb 32($np,$i,8),@ri[0] 749e1051a39Sopenharmony_ci mov 48($ap,$i,8),@ri[2] 750e1051a39Sopenharmony_ci mov 56($ap,$i,8),@ri[3] 751e1051a39Sopenharmony_ci sbb 40($np,$i,8),@ri[1] 752e1051a39Sopenharmony_ci lea 4($i),$i # i++ 753e1051a39Sopenharmony_ci dec $j # doesn't affect CF! 
754e1051a39Sopenharmony_ci jnz .Lsub4x 755e1051a39Sopenharmony_ci 756e1051a39Sopenharmony_ci mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] 757e1051a39Sopenharmony_ci mov 32($ap,$i,8),@ri[0] # load overflow bit 758e1051a39Sopenharmony_ci sbb 16($np,$i,8),@ri[2] 759e1051a39Sopenharmony_ci mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] 760e1051a39Sopenharmony_ci sbb 24($np,$i,8),@ri[3] 761e1051a39Sopenharmony_ci mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] 762e1051a39Sopenharmony_ci 763e1051a39Sopenharmony_ci sbb \$0,@ri[0] # handle upmost overflow bit 764e1051a39Sopenharmony_ci mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] 765e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 766e1051a39Sopenharmony_ci movq @ri[0],%xmm4 767e1051a39Sopenharmony_ci pcmpeqd %xmm5,%xmm5 768e1051a39Sopenharmony_ci pshufd \$0,%xmm4,%xmm4 769e1051a39Sopenharmony_ci mov $num,$j 770e1051a39Sopenharmony_ci pxor %xmm4,%xmm5 771e1051a39Sopenharmony_ci shr \$2,$j # j=num/4 772e1051a39Sopenharmony_ci xor %eax,%eax # i=0 773e1051a39Sopenharmony_ci 774e1051a39Sopenharmony_ci jmp .Lcopy4x 775e1051a39Sopenharmony_ci.align 16 776e1051a39Sopenharmony_ci.Lcopy4x: # conditional copy 777e1051a39Sopenharmony_ci movdqa (%rsp,%rax),%xmm1 778e1051a39Sopenharmony_ci movdqu ($rp,%rax),%xmm2 779e1051a39Sopenharmony_ci pand %xmm4,%xmm1 780e1051a39Sopenharmony_ci pand %xmm5,%xmm2 781e1051a39Sopenharmony_ci movdqa 16(%rsp,%rax),%xmm3 782e1051a39Sopenharmony_ci movdqa %xmm0,(%rsp,%rax) 783e1051a39Sopenharmony_ci por %xmm2,%xmm1 784e1051a39Sopenharmony_ci movdqu 16($rp,%rax),%xmm2 785e1051a39Sopenharmony_ci movdqu %xmm1,($rp,%rax) 786e1051a39Sopenharmony_ci pand %xmm4,%xmm3 787e1051a39Sopenharmony_ci pand %xmm5,%xmm2 788e1051a39Sopenharmony_ci movdqa %xmm0,16(%rsp,%rax) 789e1051a39Sopenharmony_ci por %xmm2,%xmm3 790e1051a39Sopenharmony_ci movdqu %xmm3,16($rp,%rax) 791e1051a39Sopenharmony_ci lea 32(%rax),%rax 792e1051a39Sopenharmony_ci dec $j 793e1051a39Sopenharmony_ci jnz .Lcopy4x 794e1051a39Sopenharmony_ci___ 795e1051a39Sopenharmony_ci} 
# ----------------------------------------------------------------------
# Reader's guide to the text emitted below.  The original lines are kept
# byte-for-byte; only comment lines have been added.
#
# 1. Tail of bn_mul4x_mont: reload the caller's stack pointer that was
#    saved at 8(%rsp,$num,8), put 1 in %rax, reload %r15, %r14, %r13,
#    %r12, %rbp and %rbx from the six slots just below that pointer,
#    switch %rsp back and return.
#
# 2. bn_sqr8x_mont, with rptr=%rdi, aptr=%rsi, bptr=%rdx (not used),
#    nptr=%rcx, n0=%r8 and num=%r9.  It is a wrapper: the squaring itself
#    is done by bn_sqr8x_internal (or bn_sqrx8x_internal), both declared
#    .extern and living in the x86_64-mont5 module.
#      - Prologue: copy %rsp to %rax, push six callee-saved registers,
#        turn num into a byte count (shl 3), dereference n0 once, and
#        choose a 64-byte-aligned frame base in %rbp that does not alias
#        aptr modulo 4096.
#      - %rsp is then lowered to %rbp in 4096-byte steps, reading one word
#        per step.  NOTE(review): presumably a stack probe so that guard
#        pages are touched in order -- confirm.
#      - Frame slots: 32(%rsp) = the n0 value, 40(%rsp) = original %rsp.
#        nptr, rptr and the negated byte count are parked in %xmm2, %xmm1
#        and %xmm3.
#      - When $addx is set, OPENSSL_ia32cap_P+8 is tested against 0x80100
#        at run time; only if both bits are set is bn_sqrx8x_internal
#        used, otherwise control goes to .Lsqr8x_nox.  NOTE(review): the
#        mask presumably selects the BMI2 and ADX feature bits -- confirm.
#      - .Lsqr8x_sub stores tp - np into rp, four limbs per pass, then
#        folds the final borrow into the top-most carry in %rax.
#      - .Lsqr8x_cond_copy uses that value as a mask to choose, without a
#        branch, between tp and the difference already in rp, and zeroes
#        the temporary area as it goes.
#      - Epilogue: 1 in %rax, registers restored via the pointer saved at
#        40(%rsp).
#    NOTE(review): the inline signature comment marks rptr as const, but
#    the routine stores through it.  The locals $i, $j, $tptr, @A0, @A1,
#    $a0, $a1 and $ai are declared but never referenced in this unit.
#
# 3. The opening of bn_mulx4x_mont (emitted only when $addx is true): the
#    entry label, %rsp copied to %rax, and .Lmulx4x_enter.
# ----------------------------------------------------------------------
796e1051a39Sopenharmony_ci$code.=<<___; 797e1051a39Sopenharmony_ci mov 8(%rsp,$num,8),%rsi # restore %rsp 798e1051a39Sopenharmony_ci.cfi_def_cfa %rsi, 8 799e1051a39Sopenharmony_ci mov \$1,%rax 800e1051a39Sopenharmony_ci mov -48(%rsi),%r15 801e1051a39Sopenharmony_ci.cfi_restore %r15 802e1051a39Sopenharmony_ci mov -40(%rsi),%r14 803e1051a39Sopenharmony_ci.cfi_restore %r14 804e1051a39Sopenharmony_ci mov -32(%rsi),%r13 805e1051a39Sopenharmony_ci.cfi_restore %r13 806e1051a39Sopenharmony_ci mov -24(%rsi),%r12 807e1051a39Sopenharmony_ci.cfi_restore %r12 808e1051a39Sopenharmony_ci mov -16(%rsi),%rbp 809e1051a39Sopenharmony_ci.cfi_restore %rbp 810e1051a39Sopenharmony_ci mov -8(%rsi),%rbx 811e1051a39Sopenharmony_ci.cfi_restore %rbx 812e1051a39Sopenharmony_ci lea (%rsi),%rsp 813e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 814e1051a39Sopenharmony_ci.Lmul4x_epilogue: 815e1051a39Sopenharmony_ci ret 816e1051a39Sopenharmony_ci.cfi_endproc 817e1051a39Sopenharmony_ci.size bn_mul4x_mont,.-bn_mul4x_mont 818e1051a39Sopenharmony_ci___ 819e1051a39Sopenharmony_ci}}} 820e1051a39Sopenharmony_ci{{{ 821e1051a39Sopenharmony_ci###################################################################### 822e1051a39Sopenharmony_ci# void bn_sqr8x_mont( 823e1051a39Sopenharmony_cimy $rptr="%rdi"; # const BN_ULONG *rptr, 824e1051a39Sopenharmony_cimy $aptr="%rsi"; # const BN_ULONG *aptr, 825e1051a39Sopenharmony_cimy $bptr="%rdx"; # not used 826e1051a39Sopenharmony_cimy $nptr="%rcx"; # const BN_ULONG *nptr, 827e1051a39Sopenharmony_cimy $n0 ="%r8"; # const BN_ULONG *n0); 828e1051a39Sopenharmony_cimy $num ="%r9"; # int num, has to be divisible by 8 829e1051a39Sopenharmony_ci 830e1051a39Sopenharmony_cimy ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 831e1051a39Sopenharmony_cimy @A0=("%r10","%r11"); 832e1051a39Sopenharmony_cimy @A1=("%r12","%r13"); 833e1051a39Sopenharmony_cimy ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 834e1051a39Sopenharmony_ci 835e1051a39Sopenharmony_ci$code.=<<___ if ($addx);
836e1051a39Sopenharmony_ci.extern bn_sqrx8x_internal # see x86_64-mont5 module 837e1051a39Sopenharmony_ci___ 838e1051a39Sopenharmony_ci$code.=<<___; 839e1051a39Sopenharmony_ci.extern bn_sqr8x_internal # see x86_64-mont5 module 840e1051a39Sopenharmony_ci 841e1051a39Sopenharmony_ci.type bn_sqr8x_mont,\@function,6 842e1051a39Sopenharmony_ci.align 32 843e1051a39Sopenharmony_cibn_sqr8x_mont: 844e1051a39Sopenharmony_ci.cfi_startproc 845e1051a39Sopenharmony_ci mov %rsp,%rax 846e1051a39Sopenharmony_ci.cfi_def_cfa_register %rax 847e1051a39Sopenharmony_ci.Lsqr8x_enter: 848e1051a39Sopenharmony_ci push %rbx 849e1051a39Sopenharmony_ci.cfi_push %rbx 850e1051a39Sopenharmony_ci push %rbp 851e1051a39Sopenharmony_ci.cfi_push %rbp 852e1051a39Sopenharmony_ci push %r12 853e1051a39Sopenharmony_ci.cfi_push %r12 854e1051a39Sopenharmony_ci push %r13 855e1051a39Sopenharmony_ci.cfi_push %r13 856e1051a39Sopenharmony_ci push %r14 857e1051a39Sopenharmony_ci.cfi_push %r14 858e1051a39Sopenharmony_ci push %r15 859e1051a39Sopenharmony_ci.cfi_push %r15 860e1051a39Sopenharmony_ci.Lsqr8x_prologue: 861e1051a39Sopenharmony_ci 862e1051a39Sopenharmony_ci mov ${num}d,%r10d 863e1051a39Sopenharmony_ci shl \$3,${num}d # convert $num to bytes 864e1051a39Sopenharmony_ci shl \$3+2,%r10 # 4*$num 865e1051a39Sopenharmony_ci neg $num 866e1051a39Sopenharmony_ci 867e1051a39Sopenharmony_ci ############################################################## 868e1051a39Sopenharmony_ci # ensure that stack frame doesn't alias with $aptr modulo 869e1051a39Sopenharmony_ci # 4096. this is done to allow memory disambiguation logic 870e1051a39Sopenharmony_ci # do its job.
871e1051a39Sopenharmony_ci # 872e1051a39Sopenharmony_ci lea -64(%rsp,$num,2),%r11 873e1051a39Sopenharmony_ci mov %rsp,%rbp 874e1051a39Sopenharmony_ci mov ($n0),$n0 # *n0 875e1051a39Sopenharmony_ci sub $aptr,%r11 876e1051a39Sopenharmony_ci and \$4095,%r11 877e1051a39Sopenharmony_ci cmp %r11,%r10 878e1051a39Sopenharmony_ci jb .Lsqr8x_sp_alt 879e1051a39Sopenharmony_ci sub %r11,%rbp # align with $aptr 880e1051a39Sopenharmony_ci lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) 881e1051a39Sopenharmony_ci jmp .Lsqr8x_sp_done 882e1051a39Sopenharmony_ci 883e1051a39Sopenharmony_ci.align 32 884e1051a39Sopenharmony_ci.Lsqr8x_sp_alt: 885e1051a39Sopenharmony_ci lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num 886e1051a39Sopenharmony_ci lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) 887e1051a39Sopenharmony_ci sub %r10,%r11 888e1051a39Sopenharmony_ci mov \$0,%r10 889e1051a39Sopenharmony_ci cmovc %r10,%r11 890e1051a39Sopenharmony_ci sub %r11,%rbp 891e1051a39Sopenharmony_ci.Lsqr8x_sp_done: 892e1051a39Sopenharmony_ci and \$-64,%rbp 893e1051a39Sopenharmony_ci mov %rsp,%r11 894e1051a39Sopenharmony_ci sub %rbp,%r11 895e1051a39Sopenharmony_ci and \$-4096,%r11 896e1051a39Sopenharmony_ci lea (%rbp,%r11),%rsp 897e1051a39Sopenharmony_ci mov (%rsp),%r10 898e1051a39Sopenharmony_ci cmp %rbp,%rsp 899e1051a39Sopenharmony_ci ja .Lsqr8x_page_walk 900e1051a39Sopenharmony_ci jmp .Lsqr8x_page_walk_done 901e1051a39Sopenharmony_ci 902e1051a39Sopenharmony_ci.align 16 903e1051a39Sopenharmony_ci.Lsqr8x_page_walk: 904e1051a39Sopenharmony_ci lea -4096(%rsp),%rsp 905e1051a39Sopenharmony_ci mov (%rsp),%r10 906e1051a39Sopenharmony_ci cmp %rbp,%rsp 907e1051a39Sopenharmony_ci ja .Lsqr8x_page_walk 908e1051a39Sopenharmony_ci.Lsqr8x_page_walk_done: 909e1051a39Sopenharmony_ci 910e1051a39Sopenharmony_ci mov $num,%r10 911e1051a39Sopenharmony_ci neg $num 912e1051a39Sopenharmony_ci 913e1051a39Sopenharmony_ci mov $n0, 32(%rsp) 914e1051a39Sopenharmony_ci mov %rax, 40(%rsp) # save original %rsp
915e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp+40,deref,+8 916e1051a39Sopenharmony_ci.Lsqr8x_body: 917e1051a39Sopenharmony_ci 918e1051a39Sopenharmony_ci movq $nptr, %xmm2 # save pointer to modulus 919e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 920e1051a39Sopenharmony_ci movq $rptr,%xmm1 # save $rptr 921e1051a39Sopenharmony_ci movq %r10, %xmm3 # -$num 922e1051a39Sopenharmony_ci___ 923e1051a39Sopenharmony_ci$code.=<<___ if ($addx); 924e1051a39Sopenharmony_ci mov OPENSSL_ia32cap_P+8(%rip),%eax 925e1051a39Sopenharmony_ci and \$0x80100,%eax 926e1051a39Sopenharmony_ci cmp \$0x80100,%eax 927e1051a39Sopenharmony_ci jne .Lsqr8x_nox 928e1051a39Sopenharmony_ci 929e1051a39Sopenharmony_ci call bn_sqrx8x_internal # see x86_64-mont5 module 930e1051a39Sopenharmony_ci # %rax top-most carry 931e1051a39Sopenharmony_ci # %rbp nptr 932e1051a39Sopenharmony_ci # %rcx -8*num 933e1051a39Sopenharmony_ci # %r8 end of tp[2*num] 934e1051a39Sopenharmony_ci lea (%r8,%rcx),%rbx 935e1051a39Sopenharmony_ci mov %rcx,$num 936e1051a39Sopenharmony_ci mov %rcx,%rdx 937e1051a39Sopenharmony_ci movq %xmm1,$rptr 938e1051a39Sopenharmony_ci sar \$3+2,%rcx # %cf=0 939e1051a39Sopenharmony_ci jmp .Lsqr8x_sub 940e1051a39Sopenharmony_ci 941e1051a39Sopenharmony_ci.align 32 942e1051a39Sopenharmony_ci.Lsqr8x_nox: 943e1051a39Sopenharmony_ci___ 944e1051a39Sopenharmony_ci$code.=<<___; 945e1051a39Sopenharmony_ci call bn_sqr8x_internal # see x86_64-mont5 module 946e1051a39Sopenharmony_ci # %rax top-most carry 947e1051a39Sopenharmony_ci # %rbp nptr 948e1051a39Sopenharmony_ci # %r8 -8*num 949e1051a39Sopenharmony_ci # %rdi end of tp[2*num] 950e1051a39Sopenharmony_ci lea (%rdi,$num),%rbx 951e1051a39Sopenharmony_ci mov $num,%rcx 952e1051a39Sopenharmony_ci mov $num,%rdx 953e1051a39Sopenharmony_ci movq %xmm1,$rptr 954e1051a39Sopenharmony_ci sar \$3+2,%rcx # %cf=0 955e1051a39Sopenharmony_ci jmp .Lsqr8x_sub 956e1051a39Sopenharmony_ci 957e1051a39Sopenharmony_ci.align 32 958e1051a39Sopenharmony_ci.Lsqr8x_sub:
# Body of .Lsqr8x_sub: rp[] = tp[] - np[], four limbs per pass, with tp
# read through %rbx and np through %rbp.  The borrow is carried in CF from
# one pass to the next, which is why the counter in %rcx is stepped with
# inc (see the "preserves %cf" note further down).
959e1051a39Sopenharmony_ci mov 8*0(%rbx),%r12 960e1051a39Sopenharmony_ci mov 8*1(%rbx),%r13 961e1051a39Sopenharmony_ci mov 8*2(%rbx),%r14 962e1051a39Sopenharmony_ci mov 8*3(%rbx),%r15 963e1051a39Sopenharmony_ci lea 8*4(%rbx),%rbx 964e1051a39Sopenharmony_ci sbb 8*0(%rbp),%r12 965e1051a39Sopenharmony_ci sbb 8*1(%rbp),%r13 966e1051a39Sopenharmony_ci sbb 8*2(%rbp),%r14 967e1051a39Sopenharmony_ci sbb 8*3(%rbp),%r15 968e1051a39Sopenharmony_ci lea 8*4(%rbp),%rbp 969e1051a39Sopenharmony_ci mov %r12,8*0($rptr) 970e1051a39Sopenharmony_ci mov %r13,8*1($rptr) 971e1051a39Sopenharmony_ci mov %r14,8*2($rptr) 972e1051a39Sopenharmony_ci mov %r15,8*3($rptr) 973e1051a39Sopenharmony_ci lea 8*4($rptr),$rptr 974e1051a39Sopenharmony_ci inc %rcx # preserves %cf 975e1051a39Sopenharmony_ci jnz .Lsqr8x_sub 976e1051a39Sopenharmony_ci 977e1051a39Sopenharmony_ci sbb \$0,%rax # top-most carry 978e1051a39Sopenharmony_ci lea (%rbx,$num),%rbx # rewind 979e1051a39Sopenharmony_ci lea ($rptr,$num),$rptr # rewind 980e1051a39Sopenharmony_ci 981e1051a39Sopenharmony_ci movq %rax,%xmm1 982e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 983e1051a39Sopenharmony_ci pshufd \$0,%xmm1,%xmm1 984e1051a39Sopenharmony_ci mov 40(%rsp),%rsi # restore %rsp 985e1051a39Sopenharmony_ci.cfi_def_cfa %rsi,8 986e1051a39Sopenharmony_ci jmp .Lsqr8x_cond_copy 987e1051a39Sopenharmony_ci 988e1051a39Sopenharmony_ci.align 32 989e1051a39Sopenharmony_ci.Lsqr8x_cond_copy: 990e1051a39Sopenharmony_ci movdqa 16*0(%rbx),%xmm2 991e1051a39Sopenharmony_ci movdqa 16*1(%rbx),%xmm3 992e1051a39Sopenharmony_ci lea 16*2(%rbx),%rbx 993e1051a39Sopenharmony_ci movdqu 16*0($rptr),%xmm4 994e1051a39Sopenharmony_ci movdqu 16*1($rptr),%xmm5 995e1051a39Sopenharmony_ci lea 16*2($rptr),$rptr 996e1051a39Sopenharmony_ci movdqa %xmm0,-16*2(%rbx) # zero tp 997e1051a39Sopenharmony_ci movdqa %xmm0,-16*1(%rbx) 998e1051a39Sopenharmony_ci movdqa %xmm0,-16*2(%rbx,%rdx) 999e1051a39Sopenharmony_ci movdqa %xmm0,-16*1(%rbx,%rdx) 1000e1051a39Sopenharmony_ci pcmpeqd %xmm1,%xmm0
# Rest of the .Lsqr8x_cond_copy body.  %xmm1 is the low dword of %rax
# replicated to all four lanes, and %xmm0 is now all-ones in every lane
# where that dword is zero.  The stores below write
# (tp AND xmm1) OR (rp AND xmm0) to rp, so no branch is taken on the
# outcome of the subtraction.
1001e1051a39Sopenharmony_ci pand %xmm1,%xmm2 1002e1051a39Sopenharmony_ci pand %xmm1,%xmm3 1003e1051a39Sopenharmony_ci pand %xmm0,%xmm4 1004e1051a39Sopenharmony_ci pand %xmm0,%xmm5 1005e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 1006e1051a39Sopenharmony_ci por %xmm2,%xmm4 1007e1051a39Sopenharmony_ci por %xmm3,%xmm5 1008e1051a39Sopenharmony_ci movdqu %xmm4,-16*2($rptr) 1009e1051a39Sopenharmony_ci movdqu %xmm5,-16*1($rptr) 1010e1051a39Sopenharmony_ci add \$32,$num 1011e1051a39Sopenharmony_ci jnz .Lsqr8x_cond_copy 1012e1051a39Sopenharmony_ci 1013e1051a39Sopenharmony_ci mov \$1,%rax 1014e1051a39Sopenharmony_ci mov -48(%rsi),%r15 1015e1051a39Sopenharmony_ci.cfi_restore %r15 1016e1051a39Sopenharmony_ci mov -40(%rsi),%r14 1017e1051a39Sopenharmony_ci.cfi_restore %r14 1018e1051a39Sopenharmony_ci mov -32(%rsi),%r13 1019e1051a39Sopenharmony_ci.cfi_restore %r13 1020e1051a39Sopenharmony_ci mov -24(%rsi),%r12 1021e1051a39Sopenharmony_ci.cfi_restore %r12 1022e1051a39Sopenharmony_ci mov -16(%rsi),%rbp 1023e1051a39Sopenharmony_ci.cfi_restore %rbp 1024e1051a39Sopenharmony_ci mov -8(%rsi),%rbx 1025e1051a39Sopenharmony_ci.cfi_restore %rbx 1026e1051a39Sopenharmony_ci lea (%rsi),%rsp 1027e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 1028e1051a39Sopenharmony_ci.Lsqr8x_epilogue: 1029e1051a39Sopenharmony_ci ret 1030e1051a39Sopenharmony_ci.cfi_endproc 1031e1051a39Sopenharmony_ci.size bn_sqr8x_mont,.-bn_sqr8x_mont 1032e1051a39Sopenharmony_ci___ 1033e1051a39Sopenharmony_ci}}} 1034e1051a39Sopenharmony_ci 1035e1051a39Sopenharmony_ciif ($addx) {{{ 1036e1051a39Sopenharmony_cimy $bp="%rdx"; # original value 1037e1051a39Sopenharmony_ci 1038e1051a39Sopenharmony_ci$code.=<<___; 1039e1051a39Sopenharmony_ci.type bn_mulx4x_mont,\@function,6 1040e1051a39Sopenharmony_ci.align 32 1041e1051a39Sopenharmony_cibn_mulx4x_mont: 1042e1051a39Sopenharmony_ci.cfi_startproc 1043e1051a39Sopenharmony_ci mov %rsp,%rax 1044e1051a39Sopenharmony_ci.cfi_def_cfa_register %rax 1045e1051a39Sopenharmony_ci.Lmulx4x_enter:
# --- bn_mulx4x_mont, continued from the entry label just above --------
# Prologue: push the six callee-saved registers, scale num to a byte
# count, pick a 128-byte-aligned frame base at least num+72 bytes below
# the current stack pointer, then lower the stack pointer to it in
# 4096-byte steps, reading one word per step.  The frame slots are listed
# in the "Stack layout" table that follows.  After the prologue the Perl
# side remaps its register names: num becomes %rax, and the b pointer
# becomes %rdi, which is reused as the result pointer at the very end.
# Each multiply pass handles the first four limbs straight-line and the
# remaining limbs four at a time in a counted loop (.Lmulx4x_1st for
# b[0], .Lmulx4x_inner afterwards); the count comes from 48(%rsp).
1046e1051a39Sopenharmony_ci push %rbx 1047e1051a39Sopenharmony_ci.cfi_push %rbx 1048e1051a39Sopenharmony_ci push %rbp 1049e1051a39Sopenharmony_ci.cfi_push %rbp 1050e1051a39Sopenharmony_ci push %r12 1051e1051a39Sopenharmony_ci.cfi_push %r12 1052e1051a39Sopenharmony_ci push %r13 1053e1051a39Sopenharmony_ci.cfi_push %r13 1054e1051a39Sopenharmony_ci push %r14 1055e1051a39Sopenharmony_ci.cfi_push %r14 1056e1051a39Sopenharmony_ci push %r15 1057e1051a39Sopenharmony_ci.cfi_push %r15 1058e1051a39Sopenharmony_ci.Lmulx4x_prologue: 1059e1051a39Sopenharmony_ci 1060e1051a39Sopenharmony_ci shl \$3,${num}d # convert $num to bytes 1061e1051a39Sopenharmony_ci xor %r10,%r10 1062e1051a39Sopenharmony_ci sub $num,%r10 # -$num 1063e1051a39Sopenharmony_ci mov ($n0),$n0 # *n0 1064e1051a39Sopenharmony_ci lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) 1065e1051a39Sopenharmony_ci and \$-128,%rbp 1066e1051a39Sopenharmony_ci mov %rsp,%r11 1067e1051a39Sopenharmony_ci sub %rbp,%r11 1068e1051a39Sopenharmony_ci and \$-4096,%r11 1069e1051a39Sopenharmony_ci lea (%rbp,%r11),%rsp 1070e1051a39Sopenharmony_ci mov (%rsp),%r10 1071e1051a39Sopenharmony_ci cmp %rbp,%rsp 1072e1051a39Sopenharmony_ci ja .Lmulx4x_page_walk 1073e1051a39Sopenharmony_ci jmp .Lmulx4x_page_walk_done 1074e1051a39Sopenharmony_ci 1075e1051a39Sopenharmony_ci.align 16 1076e1051a39Sopenharmony_ci.Lmulx4x_page_walk: 1077e1051a39Sopenharmony_ci lea -4096(%rsp),%rsp 1078e1051a39Sopenharmony_ci mov (%rsp),%r10 1079e1051a39Sopenharmony_ci cmp %rbp,%rsp 1080e1051a39Sopenharmony_ci ja .Lmulx4x_page_walk 1081e1051a39Sopenharmony_ci.Lmulx4x_page_walk_done: 1082e1051a39Sopenharmony_ci 1083e1051a39Sopenharmony_ci lea ($bp,$num),%r10 1084e1051a39Sopenharmony_ci ############################################################## 1085e1051a39Sopenharmony_ci # Stack layout 1086e1051a39Sopenharmony_ci # +0 num 1087e1051a39Sopenharmony_ci # +8 off-loaded &b[i] 1088e1051a39Sopenharmony_ci # +16 end of b[num] 1089e1051a39Sopenharmony_ci # +24 saved n0
1090e1051a39Sopenharmony_ci # +32 saved rp 1091e1051a39Sopenharmony_ci # +40 saved %rsp 1092e1051a39Sopenharmony_ci # +48 inner counter 1093e1051a39Sopenharmony_ci # +56 1094e1051a39Sopenharmony_ci # +64 tmp[num+1] 1095e1051a39Sopenharmony_ci # 1096e1051a39Sopenharmony_ci mov $num,0(%rsp) # save $num 1097e1051a39Sopenharmony_ci shr \$5,$num 1098e1051a39Sopenharmony_ci mov %r10,16(%rsp) # end of b[num] 1099e1051a39Sopenharmony_ci sub \$1,$num 1100e1051a39Sopenharmony_ci mov $n0, 24(%rsp) # save *n0 1101e1051a39Sopenharmony_ci mov $rp, 32(%rsp) # save $rp 1102e1051a39Sopenharmony_ci mov %rax,40(%rsp) # save original %rsp 1103e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp+40,deref,+8 1104e1051a39Sopenharmony_ci mov $num,48(%rsp) # inner counter 1105e1051a39Sopenharmony_ci jmp .Lmulx4x_body 1106e1051a39Sopenharmony_ci 1107e1051a39Sopenharmony_ci.align 32 1108e1051a39Sopenharmony_ci.Lmulx4x_body: 1109e1051a39Sopenharmony_ci___ 1110e1051a39Sopenharmony_cimy ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 1111e1051a39Sopenharmony_ci ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 1112e1051a39Sopenharmony_cimy $rptr=$bptr; 1113e1051a39Sopenharmony_ci$code.=<<___; 1114e1051a39Sopenharmony_ci lea 8($bp),$bptr 1115e1051a39Sopenharmony_ci mov ($bp),%rdx # b[0], $bp==%rdx actually 1116e1051a39Sopenharmony_ci lea 64+32(%rsp),$tptr 1117e1051a39Sopenharmony_ci mov %rdx,$bi 1118e1051a39Sopenharmony_ci 1119e1051a39Sopenharmony_ci mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 1120e1051a39Sopenharmony_ci mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] 1121e1051a39Sopenharmony_ci add %rax,%r11 1122e1051a39Sopenharmony_ci mov $bptr,8(%rsp) # off-load &b[i] 1123e1051a39Sopenharmony_ci mulx 2*8($aptr),%r12,%r13 # ...
1124e1051a39Sopenharmony_ci adc %r14,%r12 1125e1051a39Sopenharmony_ci adc \$0,%r13 1126e1051a39Sopenharmony_ci 1127e1051a39Sopenharmony_ci mov $mi,$bptr # borrow $bptr 1128e1051a39Sopenharmony_ci imulq 24(%rsp),$mi # "t[0]"*n0 1129e1051a39Sopenharmony_ci xor $zero,$zero # cf=0, of=0 1130e1051a39Sopenharmony_ci 1131e1051a39Sopenharmony_ci mulx 3*8($aptr),%rax,%r14 1132e1051a39Sopenharmony_ci mov $mi,%rdx 1133e1051a39Sopenharmony_ci lea 4*8($aptr),$aptr 1134e1051a39Sopenharmony_ci adcx %rax,%r13 1135e1051a39Sopenharmony_ci adcx $zero,%r14 # cf=0 1136e1051a39Sopenharmony_ci 1137e1051a39Sopenharmony_ci mulx 0*8($nptr),%rax,%r10 1138e1051a39Sopenharmony_ci adcx %rax,$bptr # discarded 1139e1051a39Sopenharmony_ci adox %r11,%r10 1140e1051a39Sopenharmony_ci mulx 1*8($nptr),%rax,%r11 1141e1051a39Sopenharmony_ci adcx %rax,%r10 1142e1051a39Sopenharmony_ci adox %r12,%r11 1143e1051a39Sopenharmony_ci .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 1144e1051a39Sopenharmony_ci mov 48(%rsp),$bptr # counter value 1145e1051a39Sopenharmony_ci mov %r10,-4*8($tptr) 1146e1051a39Sopenharmony_ci adcx %rax,%r11 1147e1051a39Sopenharmony_ci adox %r13,%r12 1148e1051a39Sopenharmony_ci mulx 3*8($nptr),%rax,%r15 1149e1051a39Sopenharmony_ci mov $bi,%rdx 1150e1051a39Sopenharmony_ci mov %r11,-3*8($tptr) 1151e1051a39Sopenharmony_ci adcx %rax,%r12 1152e1051a39Sopenharmony_ci adox $zero,%r15 # of=0 1153e1051a39Sopenharmony_ci lea 4*8($nptr),$nptr 1154e1051a39Sopenharmony_ci mov %r12,-2*8($tptr) 1155e1051a39Sopenharmony_ci 1156e1051a39Sopenharmony_ci jmp .Lmulx4x_1st 1157e1051a39Sopenharmony_ci 1158e1051a39Sopenharmony_ci.align 32 1159e1051a39Sopenharmony_ci.Lmulx4x_1st: 1160e1051a39Sopenharmony_ci adcx $zero,%r15 # cf=0, modulo-scheduled 1161e1051a39Sopenharmony_ci mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 1162e1051a39Sopenharmony_ci adcx %r14,%r10 1163e1051a39Sopenharmony_ci mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 1164e1051a39Sopenharmony_ci adcx %rax,%r11
1165e1051a39Sopenharmony_ci mulx 2*8($aptr),%r12,%rax # ... 1166e1051a39Sopenharmony_ci adcx %r14,%r12 1167e1051a39Sopenharmony_ci mulx 3*8($aptr),%r13,%r14 1168e1051a39Sopenharmony_ci .byte 0x67,0x67 1169e1051a39Sopenharmony_ci mov $mi,%rdx 1170e1051a39Sopenharmony_ci adcx %rax,%r13 1171e1051a39Sopenharmony_ci adcx $zero,%r14 # cf=0 1172e1051a39Sopenharmony_ci lea 4*8($aptr),$aptr 1173e1051a39Sopenharmony_ci lea 4*8($tptr),$tptr 1174e1051a39Sopenharmony_ci 1175e1051a39Sopenharmony_ci adox %r15,%r10 1176e1051a39Sopenharmony_ci mulx 0*8($nptr),%rax,%r15 1177e1051a39Sopenharmony_ci adcx %rax,%r10 1178e1051a39Sopenharmony_ci adox %r15,%r11 1179e1051a39Sopenharmony_ci mulx 1*8($nptr),%rax,%r15 1180e1051a39Sopenharmony_ci adcx %rax,%r11 1181e1051a39Sopenharmony_ci adox %r15,%r12 1182e1051a39Sopenharmony_ci mulx 2*8($nptr),%rax,%r15 1183e1051a39Sopenharmony_ci mov %r10,-5*8($tptr) 1184e1051a39Sopenharmony_ci adcx %rax,%r12 1185e1051a39Sopenharmony_ci mov %r11,-4*8($tptr) 1186e1051a39Sopenharmony_ci adox %r15,%r13 1187e1051a39Sopenharmony_ci mulx 3*8($nptr),%rax,%r15 1188e1051a39Sopenharmony_ci mov $bi,%rdx 1189e1051a39Sopenharmony_ci mov %r12,-3*8($tptr) 1190e1051a39Sopenharmony_ci adcx %rax,%r13 1191e1051a39Sopenharmony_ci adox $zero,%r15 1192e1051a39Sopenharmony_ci lea 4*8($nptr),$nptr 1193e1051a39Sopenharmony_ci mov %r13,-2*8($tptr) 1194e1051a39Sopenharmony_ci 1195e1051a39Sopenharmony_ci dec $bptr # of=0, pass cf 1196e1051a39Sopenharmony_ci jnz .Lmulx4x_1st 1197e1051a39Sopenharmony_ci 1198e1051a39Sopenharmony_ci mov 0(%rsp),$num # load num 1199e1051a39Sopenharmony_ci mov 8(%rsp),$bptr # re-load &b[i] 1200e1051a39Sopenharmony_ci adc $zero,%r15 # modulo-scheduled 1201e1051a39Sopenharmony_ci add %r15,%r14 1202e1051a39Sopenharmony_ci sbb %r15,%r15 # top-most carry 1203e1051a39Sopenharmony_ci mov %r14,-1*8($tptr) 1204e1051a39Sopenharmony_ci jmp .Lmulx4x_outer 1205e1051a39Sopenharmony_ci 1206e1051a39Sopenharmony_ci.align 32 1207e1051a39Sopenharmony_ci.Lmulx4x_outer:
# Outer loop, one pass per remaining word of b.  %rdx is loaded with b[i]
# for the mulx instructions, the a and n pointers are rewound by num
# bytes, and the carry word left in %r15 by the previous pass is stored at
# the current tp position before tp is reset to 64+32(%rsp).  The pass
# repeats until the b pointer equals the end-of-b value kept at 16(%rsp).
1208e1051a39Sopenharmony_ci mov ($bptr),%rdx # b[i] 1209e1051a39Sopenharmony_ci lea 8($bptr),$bptr # b++ 1210e1051a39Sopenharmony_ci sub $num,$aptr # rewind $aptr 1211e1051a39Sopenharmony_ci mov %r15,($tptr) # save top-most carry 1212e1051a39Sopenharmony_ci lea 64+4*8(%rsp),$tptr 1213e1051a39Sopenharmony_ci sub $num,$nptr # rewind $nptr 1214e1051a39Sopenharmony_ci 1215e1051a39Sopenharmony_ci mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 1216e1051a39Sopenharmony_ci xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 1217e1051a39Sopenharmony_ci mov %rdx,$bi 1218e1051a39Sopenharmony_ci mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 1219e1051a39Sopenharmony_ci adox -4*8($tptr),$mi 1220e1051a39Sopenharmony_ci adcx %r14,%r11 1221e1051a39Sopenharmony_ci mulx 2*8($aptr),%r15,%r13 # ... 1222e1051a39Sopenharmony_ci adox -3*8($tptr),%r11 1223e1051a39Sopenharmony_ci adcx %r15,%r12 1224e1051a39Sopenharmony_ci adox -2*8($tptr),%r12 1225e1051a39Sopenharmony_ci adcx $zero,%r13 1226e1051a39Sopenharmony_ci adox $zero,%r13 1227e1051a39Sopenharmony_ci 1228e1051a39Sopenharmony_ci mov $bptr,8(%rsp) # off-load &b[i] 1229e1051a39Sopenharmony_ci mov $mi,%r15 1230e1051a39Sopenharmony_ci imulq 24(%rsp),$mi # "t[0]"*n0 1231e1051a39Sopenharmony_ci xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 1232e1051a39Sopenharmony_ci 1233e1051a39Sopenharmony_ci mulx 3*8($aptr),%rax,%r14 1234e1051a39Sopenharmony_ci mov $mi,%rdx 1235e1051a39Sopenharmony_ci adcx %rax,%r13 1236e1051a39Sopenharmony_ci adox -1*8($tptr),%r13 1237e1051a39Sopenharmony_ci adcx $zero,%r14 1238e1051a39Sopenharmony_ci lea 4*8($aptr),$aptr 1239e1051a39Sopenharmony_ci adox $zero,%r14 1240e1051a39Sopenharmony_ci 1241e1051a39Sopenharmony_ci mulx 0*8($nptr),%rax,%r10 1242e1051a39Sopenharmony_ci adcx %rax,%r15 # discarded 1243e1051a39Sopenharmony_ci adox %r11,%r10 1244e1051a39Sopenharmony_ci mulx 1*8($nptr),%rax,%r11 1245e1051a39Sopenharmony_ci adcx %rax,%r10 1246e1051a39Sopenharmony_ci adox %r12,%r11 1247e1051a39Sopenharmony_ci mulx 2*8($nptr),%rax,%r12
1248e1051a39Sopenharmony_ci mov %r10,-4*8($tptr) 1249e1051a39Sopenharmony_ci adcx %rax,%r11 1250e1051a39Sopenharmony_ci adox %r13,%r12 1251e1051a39Sopenharmony_ci mulx 3*8($nptr),%rax,%r15 1252e1051a39Sopenharmony_ci mov $bi,%rdx 1253e1051a39Sopenharmony_ci mov %r11,-3*8($tptr) 1254e1051a39Sopenharmony_ci lea 4*8($nptr),$nptr 1255e1051a39Sopenharmony_ci adcx %rax,%r12 1256e1051a39Sopenharmony_ci adox $zero,%r15 # of=0 1257e1051a39Sopenharmony_ci mov 48(%rsp),$bptr # counter value 1258e1051a39Sopenharmony_ci mov %r12,-2*8($tptr) 1259e1051a39Sopenharmony_ci 1260e1051a39Sopenharmony_ci jmp .Lmulx4x_inner 1261e1051a39Sopenharmony_ci 1262e1051a39Sopenharmony_ci.align 32 1263e1051a39Sopenharmony_ci.Lmulx4x_inner: 1264e1051a39Sopenharmony_ci mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 1265e1051a39Sopenharmony_ci adcx $zero,%r15 # cf=0, modulo-scheduled 1266e1051a39Sopenharmony_ci adox %r14,%r10 1267e1051a39Sopenharmony_ci mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 1268e1051a39Sopenharmony_ci adcx 0*8($tptr),%r10 1269e1051a39Sopenharmony_ci adox %rax,%r11 1270e1051a39Sopenharmony_ci mulx 2*8($aptr),%r12,%rax # ...
1271e1051a39Sopenharmony_ci adcx 1*8($tptr),%r11 1272e1051a39Sopenharmony_ci adox %r14,%r12 1273e1051a39Sopenharmony_ci mulx 3*8($aptr),%r13,%r14 1274e1051a39Sopenharmony_ci mov $mi,%rdx 1275e1051a39Sopenharmony_ci adcx 2*8($tptr),%r12 1276e1051a39Sopenharmony_ci adox %rax,%r13 1277e1051a39Sopenharmony_ci adcx 3*8($tptr),%r13 1278e1051a39Sopenharmony_ci adox $zero,%r14 # of=0 1279e1051a39Sopenharmony_ci lea 4*8($aptr),$aptr 1280e1051a39Sopenharmony_ci lea 4*8($tptr),$tptr 1281e1051a39Sopenharmony_ci adcx $zero,%r14 # cf=0 1282e1051a39Sopenharmony_ci 1283e1051a39Sopenharmony_ci adox %r15,%r10 1284e1051a39Sopenharmony_ci mulx 0*8($nptr),%rax,%r15 1285e1051a39Sopenharmony_ci adcx %rax,%r10 1286e1051a39Sopenharmony_ci adox %r15,%r11 1287e1051a39Sopenharmony_ci mulx 1*8($nptr),%rax,%r15 1288e1051a39Sopenharmony_ci adcx %rax,%r11 1289e1051a39Sopenharmony_ci adox %r15,%r12 1290e1051a39Sopenharmony_ci mulx 2*8($nptr),%rax,%r15 1291e1051a39Sopenharmony_ci mov %r10,-5*8($tptr) 1292e1051a39Sopenharmony_ci adcx %rax,%r12 1293e1051a39Sopenharmony_ci adox %r15,%r13 1294e1051a39Sopenharmony_ci mulx 3*8($nptr),%rax,%r15 1295e1051a39Sopenharmony_ci mov $bi,%rdx 1296e1051a39Sopenharmony_ci mov %r11,-4*8($tptr) 1297e1051a39Sopenharmony_ci mov %r12,-3*8($tptr) 1298e1051a39Sopenharmony_ci adcx %rax,%r13 1299e1051a39Sopenharmony_ci adox $zero,%r15 1300e1051a39Sopenharmony_ci lea 4*8($nptr),$nptr 1301e1051a39Sopenharmony_ci mov %r13,-2*8($tptr) 1302e1051a39Sopenharmony_ci 1303e1051a39Sopenharmony_ci dec $bptr # of=0, pass cf 1304e1051a39Sopenharmony_ci jnz .Lmulx4x_inner 1305e1051a39Sopenharmony_ci 1306e1051a39Sopenharmony_ci mov 0(%rsp),$num # load num 1307e1051a39Sopenharmony_ci mov 8(%rsp),$bptr # re-load &b[i] 1308e1051a39Sopenharmony_ci adc $zero,%r15 # modulo-scheduled 1309e1051a39Sopenharmony_ci sub 0*8($tptr),$zero # pull top-most carry 1310e1051a39Sopenharmony_ci adc %r15,%r14 1311e1051a39Sopenharmony_ci sbb %r15,%r15 # top-most carry 1312e1051a39Sopenharmony_ci mov
%r14,-1*8($tptr) 1313e1051a39Sopenharmony_ci 1314e1051a39Sopenharmony_ci cmp 16(%rsp),$bptr 1315e1051a39Sopenharmony_ci jne .Lmulx4x_outer 1316e1051a39Sopenharmony_ci 1317e1051a39Sopenharmony_ci lea 64(%rsp),$tptr 1318e1051a39Sopenharmony_ci sub $num,$nptr # rewind $nptr 1319e1051a39Sopenharmony_ci neg %r15 1320e1051a39Sopenharmony_ci mov $num,%rdx 1321e1051a39Sopenharmony_ci shr \$3+2,$num # %cf=0 1322e1051a39Sopenharmony_ci mov 32(%rsp),$rptr # restore rp 1323e1051a39Sopenharmony_ci jmp .Lmulx4x_sub 1324e1051a39Sopenharmony_ci 1325e1051a39Sopenharmony_ci.align 32 1326e1051a39Sopenharmony_ci.Lmulx4x_sub: 1327e1051a39Sopenharmony_ci mov 8*0($tptr),%r11 1328e1051a39Sopenharmony_ci mov 8*1($tptr),%r12 1329e1051a39Sopenharmony_ci mov 8*2($tptr),%r13 1330e1051a39Sopenharmony_ci mov 8*3($tptr),%r14 1331e1051a39Sopenharmony_ci lea 8*4($tptr),$tptr 1332e1051a39Sopenharmony_ci sbb 8*0($nptr),%r11 1333e1051a39Sopenharmony_ci sbb 8*1($nptr),%r12 1334e1051a39Sopenharmony_ci sbb 8*2($nptr),%r13 1335e1051a39Sopenharmony_ci sbb 8*3($nptr),%r14 1336e1051a39Sopenharmony_ci lea 8*4($nptr),$nptr 1337e1051a39Sopenharmony_ci mov %r11,8*0($rptr) 1338e1051a39Sopenharmony_ci mov %r12,8*1($rptr) 1339e1051a39Sopenharmony_ci mov %r13,8*2($rptr) 1340e1051a39Sopenharmony_ci mov %r14,8*3($rptr) 1341e1051a39Sopenharmony_ci lea 8*4($rptr),$rptr 1342e1051a39Sopenharmony_ci dec $num # preserves %cf 1343e1051a39Sopenharmony_ci jnz .Lmulx4x_sub 1344e1051a39Sopenharmony_ci 1345e1051a39Sopenharmony_ci sbb \$0,%r15 # top-most carry 1346e1051a39Sopenharmony_ci lea 64(%rsp),$tptr 1347e1051a39Sopenharmony_ci sub %rdx,$rptr # rewind 1348e1051a39Sopenharmony_ci 1349e1051a39Sopenharmony_ci movq %r15,%xmm1 1350e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 1351e1051a39Sopenharmony_ci pshufd \$0,%xmm1,%xmm1 1352e1051a39Sopenharmony_ci mov 40(%rsp),%rsi # restore %rsp 1353e1051a39Sopenharmony_ci.cfi_def_cfa %rsi,8 1354e1051a39Sopenharmony_ci jmp .Lmulx4x_cond_copy 1355e1051a39Sopenharmony_ci
# Final select without a branch.  %xmm1 carries the low dword of %r15
# replicated to all four lanes; inside the loop %xmm0 becomes all-ones in
# every lane where that dword is zero.  Each pass reads 32 bytes of tp and
# 32 bytes of rp, overwrites those tp bytes with zeros, and stores
# (tp AND xmm1) OR (rp AND xmm0) back to rp.  %rdx counts the remaining
# bytes down by 32; once it reaches zero it is stored to the word that
# follows the cleared tp area, and the epilogue leaves 1 in %rax.
1356e1051a39Sopenharmony_ci.align 32 1357e1051a39Sopenharmony_ci.Lmulx4x_cond_copy: 1358e1051a39Sopenharmony_ci movdqa 16*0($tptr),%xmm2 1359e1051a39Sopenharmony_ci movdqa 16*1($tptr),%xmm3 1360e1051a39Sopenharmony_ci lea 16*2($tptr),$tptr 1361e1051a39Sopenharmony_ci movdqu 16*0($rptr),%xmm4 1362e1051a39Sopenharmony_ci movdqu 16*1($rptr),%xmm5 1363e1051a39Sopenharmony_ci lea 16*2($rptr),$rptr 1364e1051a39Sopenharmony_ci movdqa %xmm0,-16*2($tptr) # zero tp 1365e1051a39Sopenharmony_ci movdqa %xmm0,-16*1($tptr) 1366e1051a39Sopenharmony_ci pcmpeqd %xmm1,%xmm0 1367e1051a39Sopenharmony_ci pand %xmm1,%xmm2 1368e1051a39Sopenharmony_ci pand %xmm1,%xmm3 1369e1051a39Sopenharmony_ci pand %xmm0,%xmm4 1370e1051a39Sopenharmony_ci pand %xmm0,%xmm5 1371e1051a39Sopenharmony_ci pxor %xmm0,%xmm0 1372e1051a39Sopenharmony_ci por %xmm2,%xmm4 1373e1051a39Sopenharmony_ci por %xmm3,%xmm5 1374e1051a39Sopenharmony_ci movdqu %xmm4,-16*2($rptr) 1375e1051a39Sopenharmony_ci movdqu %xmm5,-16*1($rptr) 1376e1051a39Sopenharmony_ci sub \$32,%rdx 1377e1051a39Sopenharmony_ci jnz .Lmulx4x_cond_copy 1378e1051a39Sopenharmony_ci 1379e1051a39Sopenharmony_ci mov %rdx,($tptr) 1380e1051a39Sopenharmony_ci 1381e1051a39Sopenharmony_ci mov \$1,%rax 1382e1051a39Sopenharmony_ci mov -48(%rsi),%r15 1383e1051a39Sopenharmony_ci.cfi_restore %r15 1384e1051a39Sopenharmony_ci mov -40(%rsi),%r14 1385e1051a39Sopenharmony_ci.cfi_restore %r14 1386e1051a39Sopenharmony_ci mov -32(%rsi),%r13 1387e1051a39Sopenharmony_ci.cfi_restore %r13 1388e1051a39Sopenharmony_ci mov -24(%rsi),%r12 1389e1051a39Sopenharmony_ci.cfi_restore %r12 1390e1051a39Sopenharmony_ci mov -16(%rsi),%rbp 1391e1051a39Sopenharmony_ci.cfi_restore %rbp 1392e1051a39Sopenharmony_ci mov -8(%rsi),%rbx 1393e1051a39Sopenharmony_ci.cfi_restore %rbx 1394e1051a39Sopenharmony_ci lea (%rsi),%rsp 1395e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 1396e1051a39Sopenharmony_ci.Lmulx4x_epilogue: 1397e1051a39Sopenharmony_ci ret 1398e1051a39Sopenharmony_ci.cfi_endproc
1399e1051a39Sopenharmony_ci.size bn_mulx4x_mont,.-bn_mulx4x_mont 1400e1051a39Sopenharmony_ci___ 1401e1051a39Sopenharmony_ci}}} 1402e1051a39Sopenharmony_ci$code.=<<___; 1403e1051a39Sopenharmony_ci.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1404e1051a39Sopenharmony_ci.align 16 1405e1051a39Sopenharmony_ci___ 1406e1051a39Sopenharmony_ci 1407e1051a39Sopenharmony_ci# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1408e1051a39Sopenharmony_ci# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1409e1051a39Sopenharmony_ciif ($win64) { 1410e1051a39Sopenharmony_ci$rec="%rcx"; 1411e1051a39Sopenharmony_ci$frame="%rdx"; 1412e1051a39Sopenharmony_ci$context="%r8"; 1413e1051a39Sopenharmony_ci$disp="%r9"; 1414e1051a39Sopenharmony_ci 1415e1051a39Sopenharmony_ci$code.=<<___; 1416e1051a39Sopenharmony_ci.extern __imp_RtlVirtualUnwind 1417e1051a39Sopenharmony_ci.type mul_handler,\@abi-omnipotent 1418e1051a39Sopenharmony_ci.align 16 1419e1051a39Sopenharmony_cimul_handler: 1420e1051a39Sopenharmony_ci push %rsi 1421e1051a39Sopenharmony_ci push %rdi 1422e1051a39Sopenharmony_ci push %rbx 1423e1051a39Sopenharmony_ci push %rbp 1424e1051a39Sopenharmony_ci push %r12 1425e1051a39Sopenharmony_ci push %r13 1426e1051a39Sopenharmony_ci push %r14 1427e1051a39Sopenharmony_ci push %r15 1428e1051a39Sopenharmony_ci pushfq 1429e1051a39Sopenharmony_ci sub \$64,%rsp 1430e1051a39Sopenharmony_ci 1431e1051a39Sopenharmony_ci mov 120($context),%rax # pull context->Rax 1432e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 1433e1051a39Sopenharmony_ci 1434e1051a39Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 1435e1051a39Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 1436e1051a39Sopenharmony_ci 1437e1051a39Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 1438e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # end of prologue label 1439e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<end of prologue label 1440e1051a39Sopenharmony_ci jb
.Lcommon_seh_tail 1441e1051a39Sopenharmony_ci 1442e1051a39Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 1443e1051a39Sopenharmony_ci 1444e1051a39Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 1445e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 1446e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip>=epilogue label 1447e1051a39Sopenharmony_ci jae .Lcommon_seh_tail 1448e1051a39Sopenharmony_ci 1449e1051a39Sopenharmony_ci mov 192($context),%r10 # pull $num 1450e1051a39Sopenharmony_ci mov 8(%rax,%r10,8),%rax # pull saved stack pointer 1451e1051a39Sopenharmony_ci 1452e1051a39Sopenharmony_ci jmp .Lcommon_pop_regs 1453e1051a39Sopenharmony_ci.size mul_handler,.-mul_handler 1454e1051a39Sopenharmony_ci 1455e1051a39Sopenharmony_ci.type sqr_handler,\@abi-omnipotent 1456e1051a39Sopenharmony_ci.align 16 1457e1051a39Sopenharmony_cisqr_handler: 1458e1051a39Sopenharmony_ci push %rsi 1459e1051a39Sopenharmony_ci push %rdi 1460e1051a39Sopenharmony_ci push %rbx 1461e1051a39Sopenharmony_ci push %rbp 1462e1051a39Sopenharmony_ci push %r12 1463e1051a39Sopenharmony_ci push %r13 1464e1051a39Sopenharmony_ci push %r14 1465e1051a39Sopenharmony_ci push %r15 1466e1051a39Sopenharmony_ci pushfq 1467e1051a39Sopenharmony_ci sub \$64,%rsp 1468e1051a39Sopenharmony_ci 1469e1051a39Sopenharmony_ci mov 120($context),%rax # pull context->Rax 1470e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 1471e1051a39Sopenharmony_ci 1472e1051a39Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 1473e1051a39Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 1474e1051a39Sopenharmony_ci 1475e1051a39Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 1476e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # end of prologue label 1477e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<.Lsqr_prologue 1478e1051a39Sopenharmony_ci jb .Lcommon_seh_tail 1479e1051a39Sopenharmony_ci 1480e1051a39Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 1481e1051a39Sopenharmony_ci lea 
(%rsi,%r10),%r10 # body label 1482e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<.Lsqr_body 1483e1051a39Sopenharmony_ci jb .Lcommon_pop_regs 1484e1051a39Sopenharmony_ci 1485e1051a39Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 1486e1051a39Sopenharmony_ci 1487e1051a39Sopenharmony_ci mov 8(%r11),%r10d # HandlerData[2] 1488e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 1489e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue 1490e1051a39Sopenharmony_ci jae .Lcommon_seh_tail 1491e1051a39Sopenharmony_ci 1492e1051a39Sopenharmony_ci mov 40(%rax),%rax # pull saved stack pointer 1493e1051a39Sopenharmony_ci 1494e1051a39Sopenharmony_ci.Lcommon_pop_regs: 1495e1051a39Sopenharmony_ci mov -8(%rax),%rbx 1496e1051a39Sopenharmony_ci mov -16(%rax),%rbp 1497e1051a39Sopenharmony_ci mov -24(%rax),%r12 1498e1051a39Sopenharmony_ci mov -32(%rax),%r13 1499e1051a39Sopenharmony_ci mov -40(%rax),%r14 1500e1051a39Sopenharmony_ci mov -48(%rax),%r15 1501e1051a39Sopenharmony_ci mov %rbx,144($context) # restore context->Rbx 1502e1051a39Sopenharmony_ci mov %rbp,160($context) # restore context->Rbp 1503e1051a39Sopenharmony_ci mov %r12,216($context) # restore context->R12 1504e1051a39Sopenharmony_ci mov %r13,224($context) # restore context->R13 1505e1051a39Sopenharmony_ci mov %r14,232($context) # restore context->R14 1506e1051a39Sopenharmony_ci mov %r15,240($context) # restore context->R15 1507e1051a39Sopenharmony_ci 1508e1051a39Sopenharmony_ci.Lcommon_seh_tail: 1509e1051a39Sopenharmony_ci mov 8(%rax),%rdi 1510e1051a39Sopenharmony_ci mov 16(%rax),%rsi 1511e1051a39Sopenharmony_ci mov %rax,152($context) # restore context->Rsp 1512e1051a39Sopenharmony_ci mov %rsi,168($context) # restore context->Rsi 1513e1051a39Sopenharmony_ci mov %rdi,176($context) # restore context->Rdi 1514e1051a39Sopenharmony_ci 1515e1051a39Sopenharmony_ci mov 40($disp),%rdi # disp->ContextRecord 1516e1051a39Sopenharmony_ci mov $context,%rsi # context 1517e1051a39Sopenharmony_ci mov 
\$154,%ecx # sizeof(CONTEXT) 1518e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 1519e1051a39Sopenharmony_ci 1520e1051a39Sopenharmony_ci mov $disp,%rsi 1521e1051a39Sopenharmony_ci xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1522e1051a39Sopenharmony_ci mov 8(%rsi),%rdx # arg2, disp->ImageBase 1523e1051a39Sopenharmony_ci mov 0(%rsi),%r8 # arg3, disp->ControlPc 1524e1051a39Sopenharmony_ci mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1525e1051a39Sopenharmony_ci mov 40(%rsi),%r10 # disp->ContextRecord 1526e1051a39Sopenharmony_ci lea 56(%rsi),%r11 # &disp->HandlerData 1527e1051a39Sopenharmony_ci lea 24(%rsi),%r12 # &disp->EstablisherFrame 1528e1051a39Sopenharmony_ci mov %r10,32(%rsp) # arg5 1529e1051a39Sopenharmony_ci mov %r11,40(%rsp) # arg6 1530e1051a39Sopenharmony_ci mov %r12,48(%rsp) # arg7 1531e1051a39Sopenharmony_ci mov %rcx,56(%rsp) # arg8, (NULL) 1532e1051a39Sopenharmony_ci call *__imp_RtlVirtualUnwind(%rip) 1533e1051a39Sopenharmony_ci 1534e1051a39Sopenharmony_ci mov \$1,%eax # ExceptionContinueSearch 1535e1051a39Sopenharmony_ci add \$64,%rsp 1536e1051a39Sopenharmony_ci popfq 1537e1051a39Sopenharmony_ci pop %r15 1538e1051a39Sopenharmony_ci pop %r14 1539e1051a39Sopenharmony_ci pop %r13 1540e1051a39Sopenharmony_ci pop %r12 1541e1051a39Sopenharmony_ci pop %rbp 1542e1051a39Sopenharmony_ci pop %rbx 1543e1051a39Sopenharmony_ci pop %rdi 1544e1051a39Sopenharmony_ci pop %rsi 1545e1051a39Sopenharmony_ci ret 1546e1051a39Sopenharmony_ci.size sqr_handler,.-sqr_handler 1547e1051a39Sopenharmony_ci 1548e1051a39Sopenharmony_ci.section .pdata 1549e1051a39Sopenharmony_ci.align 4 1550e1051a39Sopenharmony_ci .rva .LSEH_begin_bn_mul_mont 1551e1051a39Sopenharmony_ci .rva .LSEH_end_bn_mul_mont 1552e1051a39Sopenharmony_ci .rva .LSEH_info_bn_mul_mont 1553e1051a39Sopenharmony_ci 1554e1051a39Sopenharmony_ci .rva .LSEH_begin_bn_mul4x_mont 1555e1051a39Sopenharmony_ci .rva .LSEH_end_bn_mul4x_mont 1556e1051a39Sopenharmony_ci .rva .LSEH_info_bn_mul4x_mont 1557e1051a39Sopenharmony_ci 
1558e1051a39Sopenharmony_ci .rva .LSEH_begin_bn_sqr8x_mont 1559e1051a39Sopenharmony_ci .rva .LSEH_end_bn_sqr8x_mont 1560e1051a39Sopenharmony_ci .rva .LSEH_info_bn_sqr8x_mont 1561e1051a39Sopenharmony_ci___ 1562e1051a39Sopenharmony_ci$code.=<<___ if ($addx); 1563e1051a39Sopenharmony_ci .rva .LSEH_begin_bn_mulx4x_mont 1564e1051a39Sopenharmony_ci .rva .LSEH_end_bn_mulx4x_mont 1565e1051a39Sopenharmony_ci .rva .LSEH_info_bn_mulx4x_mont 1566e1051a39Sopenharmony_ci___ 1567e1051a39Sopenharmony_ci$code.=<<___; 1568e1051a39Sopenharmony_ci.section .xdata 1569e1051a39Sopenharmony_ci.align 8 1570e1051a39Sopenharmony_ci.LSEH_info_bn_mul_mont: 1571e1051a39Sopenharmony_ci .byte 9,0,0,0 1572e1051a39Sopenharmony_ci .rva mul_handler 1573e1051a39Sopenharmony_ci .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 1574e1051a39Sopenharmony_ci.LSEH_info_bn_mul4x_mont: 1575e1051a39Sopenharmony_ci .byte 9,0,0,0 1576e1051a39Sopenharmony_ci .rva mul_handler 1577e1051a39Sopenharmony_ci .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 1578e1051a39Sopenharmony_ci.LSEH_info_bn_sqr8x_mont: 1579e1051a39Sopenharmony_ci .byte 9,0,0,0 1580e1051a39Sopenharmony_ci .rva sqr_handler 1581e1051a39Sopenharmony_ci .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] 1582e1051a39Sopenharmony_ci.align 8 1583e1051a39Sopenharmony_ci___ 1584e1051a39Sopenharmony_ci$code.=<<___ if ($addx); 1585e1051a39Sopenharmony_ci.LSEH_info_bn_mulx4x_mont: 1586e1051a39Sopenharmony_ci .byte 9,0,0,0 1587e1051a39Sopenharmony_ci .rva sqr_handler 1588e1051a39Sopenharmony_ci .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] 1589e1051a39Sopenharmony_ci.align 8 1590e1051a39Sopenharmony_ci___ 1591e1051a39Sopenharmony_ci} 1592e1051a39Sopenharmony_ci 1593e1051a39Sopenharmony_ciprint $code; 1594e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; 1595