#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# --------------------------------------------------------------------
# Overview
#
# This script does not perform any arithmetic itself: it emits x86
# assembly for one function, bn_mul_mont, by calling the instruction
# helpers (&mov, &mul, &label, ...) loaded from x86asm.pl.
#
# Command line, as consumed below:
#   - first argument            : passed to asm_init()
#   - -DOPENSSL_IA32_SSE2       : anywhere in the list, enables emission
#                                 of the SSE2/MMX code path
#   - last argument (if any)    : output file name, STDOUT is redirected
#                                 to it
#
# Generated function, argument order as loaded from the argument block:
#   bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#               const BN_ULONG *np, const BN_ULONG *n0, int num)
# It leaves eax=0 and touches nothing when num<4, and eax=1 after the
# result has been stored to rp.  NOTE(review): the zero return is
# presumably the caller's cue to fall back to generic code - confirm
# against the C caller.
# --------------------------------------------------------------------

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# Use 3-argument open so the name is never interpreted as a mode, and
# fail loudly instead of silently writing the assembly somewhere else.
$output = pop;
if ($output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&function_begin("bn_mul_mont");

# Register assignment.  Note that two pairs share a register, so only
# one name of each pair is meaningful at any point in the code.
$i="edx";
$j="ecx";
$ap="esi";	$tp="esi";		# overlapping variables!!!
$rp="edi";	$bp="edi";		# overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");			# stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;				# size of above frame rounded up to 16n

	# Bail out with eax=0 when num<4.
	&xor	("eax","eax");
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("ebp",-64);		# align to cache line

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"edx");		# saved stack pointer!

# SSE2 path.  It is only emitted when -DOPENSSL_IA32_SSE2 was given, and
# at run time it is only taken when bit 26 of OPENSSL_ia32cap_P[0] is
# set; otherwise control falls to the integer code at "non_sse2".
if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	&movd	($mul0,&DWP(0,$bp));		# bp[0]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($car1,&DWP(0,$np));		# np[0]

	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);			# I wish movd worked for
	&pand	($acc0,$mask);			# inter-register transfers

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);				# j++
&set_label("1st",16);
	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);				# i++
&set_label("outer");
	&xor	($j,$j);			# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));		# np[0]
	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]

	&paddq	($mul1,$temp);			# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[1]

	&inc	($j);				# j++
	&dec	($num);
	# The inner loop counts $num down to zero; $num is restored from
	# $j right after the loop.
&set_label("inner");
	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[j+1]

	&dec	($num);
	&lea	($j,&DWP(1,$j));		# j++
	&jnz	(&label("inner"));

	&mov	($num,$j);
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));		# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();				# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

# Integer-only path.  The "if (0)" arm is a disabled stub that would
# make the function give up with eax=0; the "else" arm is what is
# actually emitted.
if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	# Dispatch: the dedicated squaring code is used only when num is
	# even and ap==bp (both tests are folded into $carry, whose zero
	# result sets ZF for the jz below).
	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	&jz	(&label("bn_sqr_mont"));
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	&mov	($word,$_n0);
	&add	("eax",$carry);
	&mov	($inp,$_np);
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&xor	("eax","eax");
	&mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

# Dedicated squaring code.  $sbit aliases the $num register, which is
# why num is parked in $_num and the $_bp slot is reused to hold the
# index i rather than a pointer.
&set_label("bn_sqr_mont",16);
$sbit=$num;
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

	# Two words are handled per iteration below (j advances by 2).
&set_label("3rdmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

# Shared epilogue for all paths: compute rp=tp-np with borrow
# propagation, then use the final borrow as an all-ones/all-zeros mask
# to select between tp and the difference without a data-dependent
# branch, wiping tp as it goes.
&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

&set_label("sub",16);
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	&sbb	("eax",0);			# handle upmost overflow bit
	&mov	("edx",-1);
	&xor	("edx","eax");			# edx=~eax, complementary mask
	&jmp	(&label("copy"));

&set_label("copy",16);				# conditional copy
	&mov	($tp,&DWP($frame,"esp",$num,4));
	&mov	($np,&DWP(0,$rp,$num,4));
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&and	($tp,"eax");
	&and	($np,"edx");
	&or	($np,$tp);
	&mov	(&DWP(0,$rp,$num,4),$np);
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);		# pull saved stack pointer
	&mov	("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";