#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for x86/SSE2.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#		with/without -DECP_NISTZ256_ASM
# Pentium	+66-163%
# PIII		+72-172%
# P4		+65-132%
# Core2		+90-215%
# Sandy Bridge	+105-265% (contemporary i[57]-* are all close to this)
# Atom		+65-155%
# Opteron	+54-110%
# Bulldozer	+99-240%
# VIA Nano	+93-290%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

# Directory prefix of this script (including the trailing separator),
# used to locate x86asm.pl and the precomputed table. When $0 carries
# no directory component the prefix is the empty string rather than
# whatever a previous match happened to leave in $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, if any, names the output file and
# STDOUT is redirected to it. Three-argument open keeps the file name
# from being parsed as part of the mode, and a failed redirect is
# reported instead of silently writing to the inherited STDOUT.
if ($output = pop) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# First remaining argument selects the assembler flavour; a trailing
# "386" argument is passed on as the second asm_init parameter.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 is among
# the arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);


########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table is looked up in the current directory first and then one
# level above this script's directory.
open(TABLE,'<',"ecp_nistz256_table.c")			or
open(TABLE,'<',"${dir}../ecp_nistz256_table.c")	or
die "failed to open ecp_nistz256_table.c:",$!;

# Integer arithmetic from here on; the table conversion below relies on
# integer division and shifts.
use integer;

# Collect every TOBN(hi,lo) pair of the table as two 32-bit words,
# least significant word first. The file is read line by line and the
# pattern is matched repeatedly on each line, so several TOBN() entries
# on one line are all picked up in order.
while (my $row = <TABLE>) {
	while ($row =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

&public_label("ecp_nistz256_precomputed");
&align(4096);
&set_label("ecp_nistz256_precomputed");

########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 sub-tables holds 64 points of 16 32-bit words. Output
# row $i carries byte number $i of every one of the 64 points: word
# index is $i/4 and the byte within that word is $i%4 (written as shift
# and mask so the result does not depend on integer-division mode).
# NOTE(review): @tbl, @line, $i and $j are deliberately left as package
# variables, as in the original, in case code outside this section
# shares them.
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		@line = ();
		for($j=0;$j<64;$j++) {
			push @line,($tbl[$j*16+($i>>2)]>>(($i&3)*8))&0xff;
		}
		&data_byte(join(',',map { sprintf "0x%02x",$_} @line));
	}
}

########################################################################
# Keep in mind that constants are stored least to most significant word
&static_label("RR");
&set_label("RR",64);
&data_word(3,0,-1,-5,-2,-1,-3,4);	# 2^512 mod P-256

# Constant 1 in the representation used by the Montgomery routines,
# stored least to most significant word.
&static_label("ONE_mont");
&set_label("ONE_mont");
&data_word(1,0,0,-1,-1,-1,-2,0);

# Plain constant 1, stored least to most significant word.
&static_label("ONE");
&set_label("ONE");
&data_word(1,0,0,0,0,0,0,0);
&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>");
&align(64);

########################################################################
# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Doubling is performed as a modular addition of the input to itself.
&function_begin("ecp_nistz256_mul_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&mov	("ebp","esi");
########################################################################
# common pattern for internal functions is that %edi is result pointer,
# %esi and %ebp are input ones, %ebp being optional. %edi is preserved.
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_mul_by_2");

########################################################################
# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("ecp_nistz256_mul_by_3");
	&mov	("esi",&wparam(1));
					# multiplication by 3 is performed
					# as 2*n+n, but we can't use output
					# to store 2*n, because if output
					# pointer equals to input, then
					# we'll get 2*n+2*n.
	&stack_push(8);			# therefore we need to allocate
					# 256-bit intermediate buffer.
	&mov	("edi","esp");
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_add");	# buffer = n+n
	&lea	("esi",&DWP(0,"edi"));	# %edi still points at the buffer
	&mov	("ebp",&wparam(1));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");	# result = buffer+n
	&stack_pop(8);
&function_end("ecp_nistz256_mul_by_3");

########################################################################
# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("ecp_nistz256_div_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");
&function_end("ecp_nistz256_div_by_2");

# Internal routine: %esi points at the 8-word input, %edi at the 8-word
# result. Writes %eax, %ebx, %ecx, %edx, %ebp and %esi.
&function_begin_B("_ecp_nistz256_div_by_2");
	# tmp = a is odd ? a+mod : a
	#
	# note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning least significant bit of input to one register,
	# %ebp, and its negative to another, %edx.

	&mov	("ebp",&DWP(0,"esi"));
	&xor	("edx","edx");
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("eax","ebp");
	&and	("ebp",1);
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("edx","ebp");

	&add	("eax","edx");
	&adc	("ebx","edx");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","edx");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");

	&mov	("eax",&DWP(12,"esi"));
	&mov	("ebx",&DWP(16,"esi"));
	&adc	("eax",0);
	&mov	("ecx",&DWP(20,"esi"));
	&adc	("ebx",0);
	&mov	(&DWP(12,"edi"),"eax");
	&adc	("ecx",0);
	&mov	(&DWP(16,"edi"),"ebx");
	&mov	(&DWP(20,"edi"),"ecx");

	&mov	("eax",&DWP(24,"esi"));
	&mov	("ebx",&DWP(28,"esi"));
	&adc	("eax","ebp");
	&adc	("ebx","edx");
	&mov	(&DWP(24,"edi"),"eax");
	&sbb	("esi","esi");			# broadcast carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	# ret = tmp >> 1
	#
	# Every word is shifted right by one and receives the lowest bit
	# of the next more significant word in its top position.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&mov	("edx",&DWP(12,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(0,"edi"),"eax");
	&or	("ebp","ecx");
	&mov	("eax",&DWP(16,"edi"));

	&shr	("ebx",1);
	&mov	("ecx","edx");
	&shl	("edx",31);
	&mov	(&DWP(4,"edi"),"ebp");
	&or	("ebx","edx");
	&mov	("ebp",&DWP(20,"edi"));

	&shr	("ecx",1);
	&mov	("edx","eax");
	&shl	("eax",31);
	&mov	(&DWP(8,"edi"),"ebx");
	&or	("ecx","eax");
	&mov	("ebx",&DWP(24,"edi"));

	&shr	("edx",1);
	&mov	("eax","ebp");
	&shl	("ebp",31);
	&mov	(&DWP(12,"edi"),"ecx");
	&or	("edx","ebp");
	&mov	("ecx",&DWP(28,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&mov	(&DWP(16,"edi"),"edx");
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(20,"edi"),"eax");
	&or	("ebp","ecx");

	&shr	("ebx",1);
	&shl	("esi",31);
	&mov	(&DWP(24,"edi"),"ebp");
	&or	("ebx","esi");			# handle top-most carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	&ret	();
&function_end_B("_ecp_nistz256_div_by_2");

########################################################################
# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
&function_begin("ecp_nistz256_add");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_add");

# Internal routine: adds the 8-word values at %esi and %ebp, stores the
# sum at %edi and then conditionally subtracts the modulus. Writes
# %eax, %ebx, %ecx, %edx, %ebp and %esi; %edi is left untouched.
&function_begin_B("_ecp_nistz256_add");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&add	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&adc	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&adc	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&adc	("ebx",&DWP(20,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	("esi",0);			# mov leaves the carry flag intact
	&adc	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(24,"edi"),"ecx");
	&adc	("esi",0);			# %esi = carry out of the 256-bit sum
	&mov	(&DWP(28,"edi"),"edx");

	# if a+b >= modulus, subtract modulus.
	#
	# But since comparison implies subtraction, we subtract modulus
	# to see if it borrows, and then subtract it for real if
	# subtraction didn't borrow.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax",-1);
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx",-1);
	&mov	("eax",&DWP(16,"edi"));
	&sbb	("ecx",-1);
	&mov	("ebx",&DWP(20,"edi"));
	&sbb	("edx",0);
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);

	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# using borrow.
	#
	# After the trial subtraction %esi is all-ones if it borrowed and
	# zero otherwise; inverting it yields the mask that selects the
	# real subtraction, and its top bit gives the single "1" word.

	&not	("esi");
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_add");

########################################################################
# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
# Public entry point: loads the three stack arguments into the register
# convention of the internal routine (%edi = result, %esi and %ebp =
# inputs) and calls it.
&function_begin("ecp_nistz256_sub");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_sub");
&function_end("ecp_nistz256_sub");

# Internal routine: subtracts the 8-word value at %ebp from the one at
# %esi, stores the difference at %edi and then conditionally adds the
# modulus back. Writes %eax, %ebx, %ecx, %edx, %ebp and %esi; %edi is
# left untouched.
&function_begin_B("_ecp_nistz256_sub");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&sbb	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&sbb	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&sbb	("ebx",&DWP(20,"ebp"));
	&sbb	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("esi","esi");			# broadcast borrow bit
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# if a-b borrows, add modulus.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);			# %ebp = borrow bit as 0 or 1
	&mov	("ecx",&DWP(8,"edi"));
	&add	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&adc	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&adc	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&adc	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_sub");

428e1051a39Sopenharmony_ci######################################################################## 429e1051a39Sopenharmony_ci# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); 430e1051a39Sopenharmony_ci&function_begin("ecp_nistz256_neg"); 431e1051a39Sopenharmony_ci &mov ("ebp",&wparam(1)); 432e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 433e1051a39Sopenharmony_ci 434e1051a39Sopenharmony_ci &xor ("eax","eax"); 435e1051a39Sopenharmony_ci &stack_push(8); 436e1051a39Sopenharmony_ci &mov (&DWP(0,"esp"),"eax"); 437e1051a39Sopenharmony_ci &mov ("esi","esp"); 438e1051a39Sopenharmony_ci &mov (&DWP(4,"esp"),"eax"); 439e1051a39Sopenharmony_ci &mov (&DWP(8,"esp"),"eax"); 440e1051a39Sopenharmony_ci &mov (&DWP(12,"esp"),"eax"); 441e1051a39Sopenharmony_ci &mov (&DWP(16,"esp"),"eax"); 442e1051a39Sopenharmony_ci &mov (&DWP(20,"esp"),"eax"); 443e1051a39Sopenharmony_ci &mov (&DWP(24,"esp"),"eax"); 444e1051a39Sopenharmony_ci &mov (&DWP(28,"esp"),"eax"); 445e1051a39Sopenharmony_ci 446e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); 447e1051a39Sopenharmony_ci 448e1051a39Sopenharmony_ci &stack_pop(8); 449e1051a39Sopenharmony_ci&function_end("ecp_nistz256_neg"); 450e1051a39Sopenharmony_ci 451e1051a39Sopenharmony_ci&function_begin_B("_picup_eax"); 452e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"esp")); 453e1051a39Sopenharmony_ci &ret (); 454e1051a39Sopenharmony_ci&function_end_B("_picup_eax"); 455e1051a39Sopenharmony_ci 456e1051a39Sopenharmony_ci######################################################################## 457e1051a39Sopenharmony_ci# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 458e1051a39Sopenharmony_ci&function_begin("ecp_nistz256_to_mont"); 459e1051a39Sopenharmony_ci &mov ("esi",&wparam(1)); 460e1051a39Sopenharmony_ci &call ("_picup_eax"); 461e1051a39Sopenharmony_ci &set_label("pic"); 462e1051a39Sopenharmony_ci &lea ("ebp",&DWP(&label("RR")."-".&label("pic"),"eax")); 463e1051a39Sopenharmony_ci if ($sse2) { 464e1051a39Sopenharmony_ci 
&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 465e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"eax")); } 466e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 467e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); 468e1051a39Sopenharmony_ci&function_end("ecp_nistz256_to_mont"); 469e1051a39Sopenharmony_ci 470e1051a39Sopenharmony_ci######################################################################## 471e1051a39Sopenharmony_ci# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 472e1051a39Sopenharmony_ci&function_begin("ecp_nistz256_from_mont"); 473e1051a39Sopenharmony_ci &mov ("esi",&wparam(1)); 474e1051a39Sopenharmony_ci &call ("_picup_eax"); 475e1051a39Sopenharmony_ci &set_label("pic"); 476e1051a39Sopenharmony_ci &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); 477e1051a39Sopenharmony_ci if ($sse2) { 478e1051a39Sopenharmony_ci &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 479e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"eax")); } 480e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 481e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); 482e1051a39Sopenharmony_ci&function_end("ecp_nistz256_from_mont"); 483e1051a39Sopenharmony_ci 484e1051a39Sopenharmony_ci######################################################################## 485e1051a39Sopenharmony_ci# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], 486e1051a39Sopenharmony_ci# const BN_ULONG ebp[8]); 487e1051a39Sopenharmony_ci&function_begin("ecp_nistz256_mul_mont"); 488e1051a39Sopenharmony_ci &mov ("esi",&wparam(1)); 489e1051a39Sopenharmony_ci &mov ("ebp",&wparam(2)); 490e1051a39Sopenharmony_ci if ($sse2) { 491e1051a39Sopenharmony_ci &call ("_picup_eax"); 492e1051a39Sopenharmony_ci &set_label("pic"); 493e1051a39Sopenharmony_ci &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 494e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"eax")); } 495e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 496e1051a39Sopenharmony_ci &call 
("_ecp_nistz256_mul_mont"); 497e1051a39Sopenharmony_ci&function_end("ecp_nistz256_mul_mont"); 498e1051a39Sopenharmony_ci 499e1051a39Sopenharmony_ci######################################################################## 500e1051a39Sopenharmony_ci# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 501e1051a39Sopenharmony_ci&function_begin("ecp_nistz256_sqr_mont"); 502e1051a39Sopenharmony_ci &mov ("esi",&wparam(1)); 503e1051a39Sopenharmony_ci if ($sse2) { 504e1051a39Sopenharmony_ci &call ("_picup_eax"); 505e1051a39Sopenharmony_ci &set_label("pic"); 506e1051a39Sopenharmony_ci &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 507e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"eax")); } 508e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 509e1051a39Sopenharmony_ci &mov ("ebp","esi"); 510e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); 511e1051a39Sopenharmony_ci&function_end("ecp_nistz256_sqr_mont"); 512e1051a39Sopenharmony_ci 513e1051a39Sopenharmony_ci&function_begin_B("_ecp_nistz256_mul_mont"); 514e1051a39Sopenharmony_ci if ($sse2) { 515e1051a39Sopenharmony_ci &and ("eax",1<<24|1<<26); 516e1051a39Sopenharmony_ci &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on 517e1051a39Sopenharmony_ci &jne (&label("mul_mont_ialu")); 518e1051a39Sopenharmony_ci 519e1051a39Sopenharmony_ci ######################################## 520e1051a39Sopenharmony_ci # SSE2 code path featuring 32x16-bit 521e1051a39Sopenharmony_ci # multiplications is ~2x faster than 522e1051a39Sopenharmony_ci # IALU counterpart (except on Atom)... 523e1051a39Sopenharmony_ci ######################################## 524e1051a39Sopenharmony_ci # stack layout: 525e1051a39Sopenharmony_ci # +------------------------------------+< %esp 526e1051a39Sopenharmony_ci # | 7 16-byte temporary XMM words, | 527e1051a39Sopenharmony_ci # | "sliding" toward lower address | 528e1051a39Sopenharmony_ci # . . 
529e1051a39Sopenharmony_ci # +------------------------------------+ 530e1051a39Sopenharmony_ci # | unused XMM word | 531e1051a39Sopenharmony_ci # +------------------------------------+< +128,%ebx 532e1051a39Sopenharmony_ci # | 8 16-byte XMM words holding copies | 533e1051a39Sopenharmony_ci # | of a[i]<<64|a[i] | 534e1051a39Sopenharmony_ci # . . 535e1051a39Sopenharmony_ci # . . 536e1051a39Sopenharmony_ci # +------------------------------------+< +256 537e1051a39Sopenharmony_ci &mov ("edx","esp"); 538e1051a39Sopenharmony_ci &sub ("esp",0x100); 539e1051a39Sopenharmony_ci 540e1051a39Sopenharmony_ci &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy 541e1051a39Sopenharmony_ci &lea ("ebp",&DWP(4,"ebp")); 542e1051a39Sopenharmony_ci &pcmpeqd("xmm6","xmm6"); 543e1051a39Sopenharmony_ci &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff 544e1051a39Sopenharmony_ci 545e1051a39Sopenharmony_ci &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 546e1051a39Sopenharmony_ci &and ("esp",-64); 547e1051a39Sopenharmony_ci &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 548e1051a39Sopenharmony_ci &lea ("ebx",&DWP(0x80,"esp")); 549e1051a39Sopenharmony_ci 550e1051a39Sopenharmony_ci &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy 551e1051a39Sopenharmony_ci &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy 552e1051a39Sopenharmony_ci &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... 
553e1051a39Sopenharmony_ci &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] 554e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[0]*b[0] 555e1051a39Sopenharmony_ci 556e1051a39Sopenharmony_ci &movd ("xmm2",&DWP(4*2,"esi")); 557e1051a39Sopenharmony_ci &pshufd ("xmm1","xmm1",0b11001100); 558e1051a39Sopenharmony_ci &movdqa (&QWP(0x10,"ebx"),"xmm1"); 559e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[1]*b[0] 560e1051a39Sopenharmony_ci 561e1051a39Sopenharmony_ci &movq ("xmm4","xmm0"); # clear upper 64 bits 562e1051a39Sopenharmony_ci &pslldq("xmm4",6); 563e1051a39Sopenharmony_ci &paddq ("xmm4","xmm0"); 564e1051a39Sopenharmony_ci &movdqa("xmm5","xmm4"); 565e1051a39Sopenharmony_ci &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] 566e1051a39Sopenharmony_ci &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] 567e1051a39Sopenharmony_ci 568e1051a39Sopenharmony_ci # Upper half of a[0]*b[i] is carried into next multiplication 569e1051a39Sopenharmony_ci # iteration, while lower one "participates" in actual reduction. 570e1051a39Sopenharmony_ci # Normally latter is done by accumulating result of multiplication 571e1051a39Sopenharmony_ci # of modulus by "magic" digit, but thanks to special form of modulus 572e1051a39Sopenharmony_ci # and "magic" digit it can be performed only with additions and 573e1051a39Sopenharmony_ci # subtractions (see note in IALU section below). Note that we are 574e1051a39Sopenharmony_ci # not bothered with carry bits, they are accumulated in "flatten" 575e1051a39Sopenharmony_ci # phase after all multiplications and reductions. 
576e1051a39Sopenharmony_ci 577e1051a39Sopenharmony_ci &movd ("xmm3",&DWP(4*3,"esi")); 578e1051a39Sopenharmony_ci &pshufd ("xmm2","xmm2",0b11001100); 579e1051a39Sopenharmony_ci &movdqa (&QWP(0x20,"ebx"),"xmm2"); 580e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[2]*b[0] 581e1051a39Sopenharmony_ci &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry 582e1051a39Sopenharmony_ci &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] 583e1051a39Sopenharmony_ci 584e1051a39Sopenharmony_ci &movd ("xmm0",&DWP(4*4,"esi")); 585e1051a39Sopenharmony_ci &pshufd ("xmm3","xmm3",0b11001100); 586e1051a39Sopenharmony_ci &movdqa (&QWP(0x30,"ebx"),"xmm3"); 587e1051a39Sopenharmony_ci &pmuludq("xmm3","xmm7"); # a[3]*b[0] 588e1051a39Sopenharmony_ci &movdqa (&QWP(0x10,"esp"),"xmm2"); 589e1051a39Sopenharmony_ci 590e1051a39Sopenharmony_ci &movd ("xmm1",&DWP(4*5,"esi")); 591e1051a39Sopenharmony_ci &pshufd ("xmm0","xmm0",0b11001100); 592e1051a39Sopenharmony_ci &movdqa (&QWP(0x40,"ebx"),"xmm0"); 593e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[4]*b[0] 594e1051a39Sopenharmony_ci &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step 595e1051a39Sopenharmony_ci &movdqa (&QWP(0x20,"esp"),"xmm3"); 596e1051a39Sopenharmony_ci 597e1051a39Sopenharmony_ci &movd ("xmm2",&DWP(4*6,"esi")); 598e1051a39Sopenharmony_ci &pshufd ("xmm1","xmm1",0b11001100); 599e1051a39Sopenharmony_ci &movdqa (&QWP(0x50,"ebx"),"xmm1"); 600e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[5]*b[0] 601e1051a39Sopenharmony_ci &movdqa (&QWP(0x30,"esp"),"xmm0"); 602e1051a39Sopenharmony_ci &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 603e1051a39Sopenharmony_ci 604e1051a39Sopenharmony_ci &movd ("xmm3",&DWP(4*7,"esi")); 605e1051a39Sopenharmony_ci &pshufd ("xmm2","xmm2",0b11001100); 606e1051a39Sopenharmony_ci &movdqa (&QWP(0x60,"ebx"),"xmm2"); 607e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[6]*b[0] 608e1051a39Sopenharmony_ci &movdqa (&QWP(0x40,"esp"),"xmm1"); 609e1051a39Sopenharmony_ci &psubq 
("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 610e1051a39Sopenharmony_ci 611e1051a39Sopenharmony_ci &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy 612e1051a39Sopenharmony_ci &pshufd ("xmm3","xmm3",0b11001100); 613e1051a39Sopenharmony_ci &movdqa (&QWP(0x70,"ebx"),"xmm3"); 614e1051a39Sopenharmony_ci &pmuludq("xmm3","xmm7"); # a[7]*b[0] 615e1051a39Sopenharmony_ci 616e1051a39Sopenharmony_ci &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y 617e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 618e1051a39Sopenharmony_ci &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 619e1051a39Sopenharmony_ci 620e1051a39Sopenharmony_ci &mov ("ecx",6); 621e1051a39Sopenharmony_ci &lea ("ebp",&DWP(4,"ebp")); 622e1051a39Sopenharmony_ci &jmp (&label("madd_sse2")); 623e1051a39Sopenharmony_ci 624e1051a39Sopenharmony_ci&set_label("madd_sse2",16); 625e1051a39Sopenharmony_ci &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] 626e1051a39Sopenharmony_ci &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] 627e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x10,"ebx")); 628e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[0]*b[i] 629e1051a39Sopenharmony_ci &movdqa(&QWP(0x50,"esp"),"xmm2"); 630e1051a39Sopenharmony_ci 631e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x20,"ebx")); 632e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[1]*b[i] 633e1051a39Sopenharmony_ci &movdqa(&QWP(0x60,"esp"),"xmm3"); 634e1051a39Sopenharmony_ci &paddq ("xmm0",&QWP(0x00,"esp")); 635e1051a39Sopenharmony_ci 636e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x30,"ebx")); 637e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[2]*b[i] 638e1051a39Sopenharmony_ci &movq ("xmm4","xmm0"); # clear upper 64 bits 639e1051a39Sopenharmony_ci &pslldq("xmm4",6); 640e1051a39Sopenharmony_ci &paddq ("xmm1",&QWP(0x10,"esp")); 641e1051a39Sopenharmony_ci &paddq ("xmm4","xmm0"); 
642e1051a39Sopenharmony_ci &movdqa("xmm5","xmm4"); 643e1051a39Sopenharmony_ci &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 644e1051a39Sopenharmony_ci 645e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(0x40,"ebx")); 646e1051a39Sopenharmony_ci &pmuludq("xmm3","xmm7"); # a[3]*b[i] 647e1051a39Sopenharmony_ci &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry 648e1051a39Sopenharmony_ci &paddq ("xmm2",&QWP(0x20,"esp")); 649e1051a39Sopenharmony_ci &movdqa (&QWP(0x00,"esp"),"xmm1"); 650e1051a39Sopenharmony_ci 651e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x50,"ebx")); 652e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[4]*b[i] 653e1051a39Sopenharmony_ci &paddq ("xmm3",&QWP(0x30,"esp")); 654e1051a39Sopenharmony_ci &movdqa (&QWP(0x10,"esp"),"xmm2"); 655e1051a39Sopenharmony_ci &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 656e1051a39Sopenharmony_ci 657e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x60,"ebx")); 658e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[5]*b[i] 659e1051a39Sopenharmony_ci &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step 660e1051a39Sopenharmony_ci &paddq ("xmm0",&QWP(0x40,"esp")); 661e1051a39Sopenharmony_ci &movdqa (&QWP(0x20,"esp"),"xmm3"); 662e1051a39Sopenharmony_ci &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 663e1051a39Sopenharmony_ci 664e1051a39Sopenharmony_ci &movdqa ("xmm3","xmm7"); 665e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[6]*b[i] 666e1051a39Sopenharmony_ci &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy 667e1051a39Sopenharmony_ci &lea ("ebp",&DWP(4,"ebp")); 668e1051a39Sopenharmony_ci &paddq ("xmm1",&QWP(0x50,"esp")); 669e1051a39Sopenharmony_ci &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 670e1051a39Sopenharmony_ci &movdqa (&QWP(0x30,"esp"),"xmm0"); 671e1051a39Sopenharmony_ci &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 672e1051a39Sopenharmony_ci 673e1051a39Sopenharmony_ci &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] 
674e1051a39Sopenharmony_ci &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 675e1051a39Sopenharmony_ci &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 676e1051a39Sopenharmony_ci &movdqa (&QWP(0x40,"esp"),"xmm1"); 677e1051a39Sopenharmony_ci &paddq ("xmm2",&QWP(0x60,"esp")); 678e1051a39Sopenharmony_ci 679e1051a39Sopenharmony_ci &dec ("ecx"); 680e1051a39Sopenharmony_ci &jnz (&label("madd_sse2")); 681e1051a39Sopenharmony_ci 682e1051a39Sopenharmony_ci &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] 683e1051a39Sopenharmony_ci &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] 684e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x10,"ebx")); 685e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[0]*b[7] 686e1051a39Sopenharmony_ci &movdqa(&QWP(0x50,"esp"),"xmm2"); 687e1051a39Sopenharmony_ci 688e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x20,"ebx")); 689e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[1]*b[7] 690e1051a39Sopenharmony_ci &movdqa(&QWP(0x60,"esp"),"xmm3"); 691e1051a39Sopenharmony_ci &paddq ("xmm0",&QWP(0x00,"esp")); 692e1051a39Sopenharmony_ci 693e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x30,"ebx")); 694e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[2]*b[7] 695e1051a39Sopenharmony_ci &movq ("xmm4","xmm0"); # clear upper 64 bits 696e1051a39Sopenharmony_ci &pslldq("xmm4",6); 697e1051a39Sopenharmony_ci &paddq ("xmm1",&QWP(0x10,"esp")); 698e1051a39Sopenharmony_ci &paddq ("xmm4","xmm0"); 699e1051a39Sopenharmony_ci &movdqa("xmm5","xmm4"); 700e1051a39Sopenharmony_ci &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 701e1051a39Sopenharmony_ci 702e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(0x40,"ebx")); 703e1051a39Sopenharmony_ci &pmuludq("xmm3","xmm7"); # a[3]*b[7] 704e1051a39Sopenharmony_ci &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry 705e1051a39Sopenharmony_ci &paddq ("xmm2",&QWP(0x20,"esp")); 706e1051a39Sopenharmony_ci &movdqa 
(&QWP(0x00,"esp"),"xmm1"); 707e1051a39Sopenharmony_ci 708e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x50,"ebx")); 709e1051a39Sopenharmony_ci &pmuludq("xmm0","xmm7"); # a[4]*b[7] 710e1051a39Sopenharmony_ci &paddq ("xmm3",&QWP(0x30,"esp")); 711e1051a39Sopenharmony_ci &movdqa (&QWP(0x10,"esp"),"xmm2"); 712e1051a39Sopenharmony_ci &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 713e1051a39Sopenharmony_ci 714e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x60,"ebx")); 715e1051a39Sopenharmony_ci &pmuludq("xmm1","xmm7"); # a[5]*b[7] 716e1051a39Sopenharmony_ci &paddq ("xmm3","xmm5"); # reduction step 717e1051a39Sopenharmony_ci &paddq ("xmm0",&QWP(0x40,"esp")); 718e1051a39Sopenharmony_ci &movdqa (&QWP(0x20,"esp"),"xmm3"); 719e1051a39Sopenharmony_ci &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 720e1051a39Sopenharmony_ci 721e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x70,"ebx")); 722e1051a39Sopenharmony_ci &pmuludq("xmm2","xmm7"); # a[6]*b[7] 723e1051a39Sopenharmony_ci &paddq ("xmm1",&QWP(0x50,"esp")); 724e1051a39Sopenharmony_ci &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 725e1051a39Sopenharmony_ci &movdqa (&QWP(0x30,"esp"),"xmm0"); 726e1051a39Sopenharmony_ci 727e1051a39Sopenharmony_ci &pmuludq("xmm3","xmm7"); # a[7]*b[7] 728e1051a39Sopenharmony_ci &pcmpeqd("xmm7","xmm7"); 729e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(0x00,"esp")); 730e1051a39Sopenharmony_ci &pslldq ("xmm7",8); 731e1051a39Sopenharmony_ci &movdqa (&QWP(0x40,"esp"),"xmm1"); 732e1051a39Sopenharmony_ci &paddq ("xmm2",&QWP(0x60,"esp")); 733e1051a39Sopenharmony_ci 734e1051a39Sopenharmony_ci &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step 735e1051a39Sopenharmony_ci &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step 736e1051a39Sopenharmony_ci &movdqa(&QWP(0x50,"esp"),"xmm2"); 737e1051a39Sopenharmony_ci &movdqa(&QWP(0x60,"esp"),"xmm3"); 738e1051a39Sopenharmony_ci 739e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x10,"esp")); 
740e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x20,"esp")); 741e1051a39Sopenharmony_ci &movdqa ("xmm3",&QWP(0x30,"esp")); 742e1051a39Sopenharmony_ci 743e1051a39Sopenharmony_ci &movq ("xmm4","xmm0"); # "flatten" 744e1051a39Sopenharmony_ci &pand ("xmm0","xmm7"); 745e1051a39Sopenharmony_ci &xor ("ebp","ebp"); 746e1051a39Sopenharmony_ci &pslldq ("xmm4",6); 747e1051a39Sopenharmony_ci &movq ("xmm5","xmm1"); 748e1051a39Sopenharmony_ci &paddq ("xmm0","xmm4"); 749e1051a39Sopenharmony_ci &pand ("xmm1","xmm7"); 750e1051a39Sopenharmony_ci &psrldq ("xmm0",6); 751e1051a39Sopenharmony_ci &movd ("eax","xmm0"); 752e1051a39Sopenharmony_ci &psrldq ("xmm0",4); 753e1051a39Sopenharmony_ci 754e1051a39Sopenharmony_ci &paddq ("xmm5","xmm0"); 755e1051a39Sopenharmony_ci &movdqa ("xmm0",&QWP(0x40,"esp")); 756e1051a39Sopenharmony_ci &sub ("eax",-1); # start subtracting modulus, 757e1051a39Sopenharmony_ci # this is used to determine 758e1051a39Sopenharmony_ci # if result is larger/smaller 759e1051a39Sopenharmony_ci # than modulus (see below) 760e1051a39Sopenharmony_ci &pslldq ("xmm5",6); 761e1051a39Sopenharmony_ci &movq ("xmm4","xmm2"); 762e1051a39Sopenharmony_ci &paddq ("xmm1","xmm5"); 763e1051a39Sopenharmony_ci &pand ("xmm2","xmm7"); 764e1051a39Sopenharmony_ci &psrldq ("xmm1",6); 765e1051a39Sopenharmony_ci &mov (&DWP(4*0,"edi"),"eax"); 766e1051a39Sopenharmony_ci &movd ("eax","xmm1"); 767e1051a39Sopenharmony_ci &psrldq ("xmm1",4); 768e1051a39Sopenharmony_ci 769e1051a39Sopenharmony_ci &paddq ("xmm4","xmm1"); 770e1051a39Sopenharmony_ci &movdqa ("xmm1",&QWP(0x50,"esp")); 771e1051a39Sopenharmony_ci &sbb ("eax",-1); 772e1051a39Sopenharmony_ci &pslldq ("xmm4",6); 773e1051a39Sopenharmony_ci &movq ("xmm5","xmm3"); 774e1051a39Sopenharmony_ci &paddq ("xmm2","xmm4"); 775e1051a39Sopenharmony_ci &pand ("xmm3","xmm7"); 776e1051a39Sopenharmony_ci &psrldq ("xmm2",6); 777e1051a39Sopenharmony_ci &mov (&DWP(4*1,"edi"),"eax"); 778e1051a39Sopenharmony_ci &movd ("eax","xmm2"); 779e1051a39Sopenharmony_ci &psrldq 
("xmm2",4); 780e1051a39Sopenharmony_ci 781e1051a39Sopenharmony_ci &paddq ("xmm5","xmm2"); 782e1051a39Sopenharmony_ci &movdqa ("xmm2",&QWP(0x60,"esp")); 783e1051a39Sopenharmony_ci &sbb ("eax",-1); 784e1051a39Sopenharmony_ci &pslldq ("xmm5",6); 785e1051a39Sopenharmony_ci &movq ("xmm4","xmm0"); 786e1051a39Sopenharmony_ci &paddq ("xmm3","xmm5"); 787e1051a39Sopenharmony_ci &pand ("xmm0","xmm7"); 788e1051a39Sopenharmony_ci &psrldq ("xmm3",6); 789e1051a39Sopenharmony_ci &mov (&DWP(4*2,"edi"),"eax"); 790e1051a39Sopenharmony_ci &movd ("eax","xmm3"); 791e1051a39Sopenharmony_ci &psrldq ("xmm3",4); 792e1051a39Sopenharmony_ci 793e1051a39Sopenharmony_ci &paddq ("xmm4","xmm3"); 794e1051a39Sopenharmony_ci &sbb ("eax",0); 795e1051a39Sopenharmony_ci &pslldq ("xmm4",6); 796e1051a39Sopenharmony_ci &movq ("xmm5","xmm1"); 797e1051a39Sopenharmony_ci &paddq ("xmm0","xmm4"); 798e1051a39Sopenharmony_ci &pand ("xmm1","xmm7"); 799e1051a39Sopenharmony_ci &psrldq ("xmm0",6); 800e1051a39Sopenharmony_ci &mov (&DWP(4*3,"edi"),"eax"); 801e1051a39Sopenharmony_ci &movd ("eax","xmm0"); 802e1051a39Sopenharmony_ci &psrldq ("xmm0",4); 803e1051a39Sopenharmony_ci 804e1051a39Sopenharmony_ci &paddq ("xmm5","xmm0"); 805e1051a39Sopenharmony_ci &sbb ("eax",0); 806e1051a39Sopenharmony_ci &pslldq ("xmm5",6); 807e1051a39Sopenharmony_ci &movq ("xmm4","xmm2"); 808e1051a39Sopenharmony_ci &paddq ("xmm1","xmm5"); 809e1051a39Sopenharmony_ci &pand ("xmm2","xmm7"); 810e1051a39Sopenharmony_ci &psrldq ("xmm1",6); 811e1051a39Sopenharmony_ci &movd ("ebx","xmm1"); 812e1051a39Sopenharmony_ci &psrldq ("xmm1",4); 813e1051a39Sopenharmony_ci &mov ("esp","edx"); 814e1051a39Sopenharmony_ci 815e1051a39Sopenharmony_ci &paddq ("xmm4","xmm1"); 816e1051a39Sopenharmony_ci &pslldq ("xmm4",6); 817e1051a39Sopenharmony_ci &paddq ("xmm2","xmm4"); 818e1051a39Sopenharmony_ci &psrldq ("xmm2",6); 819e1051a39Sopenharmony_ci &movd ("ecx","xmm2"); 820e1051a39Sopenharmony_ci &psrldq ("xmm2",4); 821e1051a39Sopenharmony_ci &sbb ("ebx",0); 
822e1051a39Sopenharmony_ci &movd ("edx","xmm2"); 823e1051a39Sopenharmony_ci &pextrw ("esi","xmm2",2); # top-most overflow bit 824e1051a39Sopenharmony_ci &sbb ("ecx",1); 825e1051a39Sopenharmony_ci &sbb ("edx",-1); 826e1051a39Sopenharmony_ci &sbb ("esi",0); # borrow from subtraction 827e1051a39Sopenharmony_ci 828e1051a39Sopenharmony_ci # Final step is "if result > mod, subtract mod", and at this point 829e1051a39Sopenharmony_ci # we have result - mod written to output buffer, as well as borrow 830e1051a39Sopenharmony_ci # bit from this subtraction, and if borrow bit is set, we add 831e1051a39Sopenharmony_ci # modulus back. 832e1051a39Sopenharmony_ci # 833e1051a39Sopenharmony_ci # Note that because mod has special form, i.e. consists of 834e1051a39Sopenharmony_ci # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 835e1051a39Sopenharmony_ci # assigning borrow bit to one register, %ebp, and its negative 836e1051a39Sopenharmony_ci # to another, %esi. But we started by calculating %esi... 
837e1051a39Sopenharmony_ci 838e1051a39Sopenharmony_ci &sub ("ebp","esi"); 839e1051a39Sopenharmony_ci &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero 840e1051a39Sopenharmony_ci &adc (&DWP(4*1,"edi"),"esi"); 841e1051a39Sopenharmony_ci &adc (&DWP(4*2,"edi"),"esi"); 842e1051a39Sopenharmony_ci &adc (&DWP(4*3,"edi"),0); 843e1051a39Sopenharmony_ci &adc ("eax",0); 844e1051a39Sopenharmony_ci &adc ("ebx",0); 845e1051a39Sopenharmony_ci &mov (&DWP(4*4,"edi"),"eax"); 846e1051a39Sopenharmony_ci &adc ("ecx","ebp"); 847e1051a39Sopenharmony_ci &mov (&DWP(4*5,"edi"),"ebx"); 848e1051a39Sopenharmony_ci &adc ("edx","esi"); 849e1051a39Sopenharmony_ci &mov (&DWP(4*6,"edi"),"ecx"); 850e1051a39Sopenharmony_ci &mov (&DWP(4*7,"edi"),"edx"); 851e1051a39Sopenharmony_ci 852e1051a39Sopenharmony_ci &ret (); 853e1051a39Sopenharmony_ci 854e1051a39Sopenharmony_ci&set_label("mul_mont_ialu",16); } 855e1051a39Sopenharmony_ci 856e1051a39Sopenharmony_ci ######################################## 857e1051a39Sopenharmony_ci # IALU code path suitable for all CPUs. 858e1051a39Sopenharmony_ci ######################################## 859e1051a39Sopenharmony_ci # stack layout: 860e1051a39Sopenharmony_ci # +------------------------------------+< %esp 861e1051a39Sopenharmony_ci # | 8 32-bit temporary words, accessed | 862e1051a39Sopenharmony_ci # | as circular buffer | 863e1051a39Sopenharmony_ci # . . 864e1051a39Sopenharmony_ci # . . 
865e1051a39Sopenharmony_ci # +------------------------------------+< +32 866e1051a39Sopenharmony_ci # | offloaded destination pointer | 867e1051a39Sopenharmony_ci # +------------------------------------+ 868e1051a39Sopenharmony_ci # | unused | 869e1051a39Sopenharmony_ci # +------------------------------------+< +40 870e1051a39Sopenharmony_ci &sub ("esp",10*4); 871e1051a39Sopenharmony_ci 872e1051a39Sopenharmony_ci &mov ("eax",&DWP(0*4,"esi")); # a[0] 873e1051a39Sopenharmony_ci &mov ("ebx",&DWP(0*4,"ebp")); # b[0] 874e1051a39Sopenharmony_ci &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr 875e1051a39Sopenharmony_ci 876e1051a39Sopenharmony_ci &mul ("ebx"); # a[0]*b[0] 877e1051a39Sopenharmony_ci &mov (&DWP(0*4,"esp"),"eax"); # t[0] 878e1051a39Sopenharmony_ci &mov ("eax",&DWP(1*4,"esi")); 879e1051a39Sopenharmony_ci &mov ("ecx","edx") 880e1051a39Sopenharmony_ci 881e1051a39Sopenharmony_ci &mul ("ebx"); # a[1]*b[0] 882e1051a39Sopenharmony_ci &add ("ecx","eax"); 883e1051a39Sopenharmony_ci &mov ("eax",&DWP(2*4,"esi")); 884e1051a39Sopenharmony_ci &adc ("edx",0); 885e1051a39Sopenharmony_ci &mov (&DWP(1*4,"esp"),"ecx"); # t[1] 886e1051a39Sopenharmony_ci &mov ("ecx","edx"); 887e1051a39Sopenharmony_ci 888e1051a39Sopenharmony_ci &mul ("ebx"); # a[2]*b[0] 889e1051a39Sopenharmony_ci &add ("ecx","eax"); 890e1051a39Sopenharmony_ci &mov ("eax",&DWP(3*4,"esi")); 891e1051a39Sopenharmony_ci &adc ("edx",0); 892e1051a39Sopenharmony_ci &mov (&DWP(2*4,"esp"),"ecx"); # t[2] 893e1051a39Sopenharmony_ci &mov ("ecx","edx"); 894e1051a39Sopenharmony_ci 895e1051a39Sopenharmony_ci &mul ("ebx"); # a[3]*b[0] 896e1051a39Sopenharmony_ci &add ("ecx","eax"); 897e1051a39Sopenharmony_ci &mov ("eax",&DWP(4*4,"esi")); 898e1051a39Sopenharmony_ci &adc ("edx",0); 899e1051a39Sopenharmony_ci &mov (&DWP(3*4,"esp"),"ecx"); # t[3] 900e1051a39Sopenharmony_ci &mov ("ecx","edx"); 901e1051a39Sopenharmony_ci 902e1051a39Sopenharmony_ci &mul ("ebx"); # a[4]*b[0] 903e1051a39Sopenharmony_ci &add ("ecx","eax"); 
904e1051a39Sopenharmony_ci &mov ("eax",&DWP(5*4,"esi")); 905e1051a39Sopenharmony_ci &adc ("edx",0); 906e1051a39Sopenharmony_ci &mov (&DWP(4*4,"esp"),"ecx"); # t[4] 907e1051a39Sopenharmony_ci &mov ("ecx","edx"); 908e1051a39Sopenharmony_ci 909e1051a39Sopenharmony_ci &mul ("ebx"); # a[5]*b[0] 910e1051a39Sopenharmony_ci &add ("ecx","eax"); 911e1051a39Sopenharmony_ci &mov ("eax",&DWP(6*4,"esi")); 912e1051a39Sopenharmony_ci &adc ("edx",0); 913e1051a39Sopenharmony_ci &mov (&DWP(5*4,"esp"),"ecx"); # t[5] 914e1051a39Sopenharmony_ci &mov ("ecx","edx"); 915e1051a39Sopenharmony_ci 916e1051a39Sopenharmony_ci &mul ("ebx"); # a[6]*b[0] 917e1051a39Sopenharmony_ci &add ("ecx","eax"); 918e1051a39Sopenharmony_ci &mov ("eax",&DWP(7*4,"esi")); 919e1051a39Sopenharmony_ci &adc ("edx",0); 920e1051a39Sopenharmony_ci &mov (&DWP(6*4,"esp"),"ecx"); # t[6] 921e1051a39Sopenharmony_ci &mov ("ecx","edx"); 922e1051a39Sopenharmony_ci 923e1051a39Sopenharmony_ci &xor ("edi","edi"); # initial top-most carry 924e1051a39Sopenharmony_ci &mul ("ebx"); # a[7]*b[0] 925e1051a39Sopenharmony_ci &add ("ecx","eax"); # t[7] 926e1051a39Sopenharmony_ci &mov ("eax",&DWP(0*4,"esp")); # t[0] 927e1051a39Sopenharmony_ci &adc ("edx",0); # t[8] 928e1051a39Sopenharmony_ci 929e1051a39Sopenharmony_cifor ($i=0;$i<7;$i++) { 930e1051a39Sopenharmony_ci my $j=$i+1; 931e1051a39Sopenharmony_ci 932e1051a39Sopenharmony_ci # Reduction iteration is normally performed by accumulating 933e1051a39Sopenharmony_ci # result of multiplication of modulus by "magic" digit [and 934e1051a39Sopenharmony_ci # omitting least significant word, which is guaranteed to 935e1051a39Sopenharmony_ci # be 0], but thanks to special form of modulus and "magic" 936e1051a39Sopenharmony_ci # digit being equal to least significant word, it can be 937e1051a39Sopenharmony_ci # performed with additions and subtractions alone. 
Indeed: 938e1051a39Sopenharmony_ci # 939e1051a39Sopenharmony_ci # ffff.0001.0000.0000.0000.ffff.ffff.ffff 940e1051a39Sopenharmony_ci # * abcd 941e1051a39Sopenharmony_ci # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 942e1051a39Sopenharmony_ci # 943e1051a39Sopenharmony_ci # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 944e1051a39Sopenharmony_ci # rewrite above as: 945e1051a39Sopenharmony_ci # 946e1051a39Sopenharmony_ci # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 947e1051a39Sopenharmony_ci # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 948e1051a39Sopenharmony_ci # - abcd.0000.0000.0000.0000.0000.0000.abcd 949e1051a39Sopenharmony_ci # 950e1051a39Sopenharmony_ci # or marking redundant operations: 951e1051a39Sopenharmony_ci # 952e1051a39Sopenharmony_ci # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 953e1051a39Sopenharmony_ci # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 954e1051a39Sopenharmony_ci # - abcd.----.----.----.----.----.----.---- 955e1051a39Sopenharmony_ci 956e1051a39Sopenharmony_ci &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 957e1051a39Sopenharmony_ci &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 958e1051a39Sopenharmony_ci &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 959e1051a39Sopenharmony_ci &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 960e1051a39Sopenharmony_ci &adc ("ecx",0); # t[7]+=0 961e1051a39Sopenharmony_ci &adc ("edx","eax"); # t[8]+=t[0] 962e1051a39Sopenharmony_ci &adc ("edi",0); # top-most carry 963e1051a39Sopenharmony_ci &mov ("ebx",&DWP($j*4,"ebp")); # b[i] 964e1051a39Sopenharmony_ci &sub ("ecx","eax"); # t[7]-=t[0] 965e1051a39Sopenharmony_ci &mov ("eax",&DWP(0*4,"esi")); # a[0] 966e1051a39Sopenharmony_ci &sbb ("edx",0); # t[8]-=0 967e1051a39Sopenharmony_ci &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 968e1051a39Sopenharmony_ci &sbb ("edi",0); # top-most carry, 969e1051a39Sopenharmony_ci # keep in mind that 970e1051a39Sopenharmony_ci # netto result is 971e1051a39Sopenharmony_ci # *addition* of value 
972e1051a39Sopenharmony_ci # with (abcd<<32)-abcd 973e1051a39Sopenharmony_ci # on top, so that 974e1051a39Sopenharmony_ci # underflow is 975e1051a39Sopenharmony_ci # impossible, because 976e1051a39Sopenharmony_ci # (abcd<<32)-abcd 977e1051a39Sopenharmony_ci # doesn't underflow 978e1051a39Sopenharmony_ci &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 979e1051a39Sopenharmony_ci 980e1051a39Sopenharmony_ci &mul ("ebx"); # a[0]*b[i] 981e1051a39Sopenharmony_ci &add ("eax",&DWP((($j+0)%8)*4,"esp")); 982e1051a39Sopenharmony_ci &adc ("edx",0); 983e1051a39Sopenharmony_ci &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); 984e1051a39Sopenharmony_ci &mov ("eax",&DWP(1*4,"esi")); 985e1051a39Sopenharmony_ci &mov ("ecx","edx") 986e1051a39Sopenharmony_ci 987e1051a39Sopenharmony_ci &mul ("ebx"); # a[1]*b[i] 988e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+1)%8)*4,"esp")); 989e1051a39Sopenharmony_ci &adc ("edx",0); 990e1051a39Sopenharmony_ci &add ("ecx","eax"); 991e1051a39Sopenharmony_ci &adc ("edx",0); 992e1051a39Sopenharmony_ci &mov ("eax",&DWP(2*4,"esi")); 993e1051a39Sopenharmony_ci &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); 994e1051a39Sopenharmony_ci &mov ("ecx","edx"); 995e1051a39Sopenharmony_ci 996e1051a39Sopenharmony_ci &mul ("ebx"); # a[2]*b[i] 997e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+2)%8)*4,"esp")); 998e1051a39Sopenharmony_ci &adc ("edx",0); 999e1051a39Sopenharmony_ci &add ("ecx","eax"); 1000e1051a39Sopenharmony_ci &adc ("edx",0); 1001e1051a39Sopenharmony_ci &mov ("eax",&DWP(3*4,"esi")); 1002e1051a39Sopenharmony_ci &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); 1003e1051a39Sopenharmony_ci &mov ("ecx","edx"); 1004e1051a39Sopenharmony_ci 1005e1051a39Sopenharmony_ci &mul ("ebx"); # a[3]*b[i] 1006e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+3)%8)*4,"esp")); 1007e1051a39Sopenharmony_ci &adc ("edx",0); 1008e1051a39Sopenharmony_ci &add ("ecx","eax"); 1009e1051a39Sopenharmony_ci &adc ("edx",0); 1010e1051a39Sopenharmony_ci &mov ("eax",&DWP(4*4,"esi")); 1011e1051a39Sopenharmony_ci &mov 
(&DWP((($j+3)%8)*4,"esp"),"ecx"); 1012e1051a39Sopenharmony_ci &mov ("ecx","edx"); 1013e1051a39Sopenharmony_ci 1014e1051a39Sopenharmony_ci &mul ("ebx"); # a[4]*b[i] 1015e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+4)%8)*4,"esp")); 1016e1051a39Sopenharmony_ci &adc ("edx",0); 1017e1051a39Sopenharmony_ci &add ("ecx","eax"); 1018e1051a39Sopenharmony_ci &adc ("edx",0); 1019e1051a39Sopenharmony_ci &mov ("eax",&DWP(5*4,"esi")); 1020e1051a39Sopenharmony_ci &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); 1021e1051a39Sopenharmony_ci &mov ("ecx","edx"); 1022e1051a39Sopenharmony_ci 1023e1051a39Sopenharmony_ci &mul ("ebx"); # a[5]*b[i] 1024e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+5)%8)*4,"esp")); 1025e1051a39Sopenharmony_ci &adc ("edx",0); 1026e1051a39Sopenharmony_ci &add ("ecx","eax"); 1027e1051a39Sopenharmony_ci &adc ("edx",0); 1028e1051a39Sopenharmony_ci &mov ("eax",&DWP(6*4,"esi")); 1029e1051a39Sopenharmony_ci &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); 1030e1051a39Sopenharmony_ci &mov ("ecx","edx"); 1031e1051a39Sopenharmony_ci 1032e1051a39Sopenharmony_ci &mul ("ebx"); # a[6]*b[i] 1033e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+6)%8)*4,"esp")); 1034e1051a39Sopenharmony_ci &adc ("edx",0); 1035e1051a39Sopenharmony_ci &add ("ecx","eax"); 1036e1051a39Sopenharmony_ci &adc ("edx",0); 1037e1051a39Sopenharmony_ci &mov ("eax",&DWP(7*4,"esi")); 1038e1051a39Sopenharmony_ci &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); 1039e1051a39Sopenharmony_ci &mov ("ecx","edx"); 1040e1051a39Sopenharmony_ci 1041e1051a39Sopenharmony_ci &mul ("ebx"); # a[7]*b[i] 1042e1051a39Sopenharmony_ci &add ("ecx",&DWP((($j+7)%8)*4,"esp")); 1043e1051a39Sopenharmony_ci &adc ("edx",0); 1044e1051a39Sopenharmony_ci &add ("ecx","eax"); # t[7] 1045e1051a39Sopenharmony_ci &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] 1046e1051a39Sopenharmony_ci &adc ("edx","edi"); # t[8] 1047e1051a39Sopenharmony_ci &mov ("edi",0); 1048e1051a39Sopenharmony_ci &adc ("edi",0); # top-most carry 1049e1051a39Sopenharmony_ci} 1050e1051a39Sopenharmony_ci &mov 
("ebp",&DWP(8*4,"esp")); # restore dst ptr 1051e1051a39Sopenharmony_ci &xor ("esi","esi"); 1052e1051a39Sopenharmony_ci my $j=$i+1; 1053e1051a39Sopenharmony_ci 1054e1051a39Sopenharmony_ci # last multiplication-less reduction 1055e1051a39Sopenharmony_ci &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 1056e1051a39Sopenharmony_ci &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 1057e1051a39Sopenharmony_ci &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 1058e1051a39Sopenharmony_ci &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 1059e1051a39Sopenharmony_ci &adc ("ecx",0); # t[7]+=0 1060e1051a39Sopenharmony_ci &adc ("edx","eax"); # t[8]+=t[0] 1061e1051a39Sopenharmony_ci &adc ("edi",0); # top-most carry 1062e1051a39Sopenharmony_ci &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); 1063e1051a39Sopenharmony_ci &sub ("ecx","eax"); # t[7]-=t[0] 1064e1051a39Sopenharmony_ci &mov ("eax",&DWP((($j+0)%8)*4,"esp")); 1065e1051a39Sopenharmony_ci &sbb ("edx",0); # t[8]-=0 1066e1051a39Sopenharmony_ci &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 1067e1051a39Sopenharmony_ci &sbb ("edi",0); # top-most carry 1068e1051a39Sopenharmony_ci &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 1069e1051a39Sopenharmony_ci 1070e1051a39Sopenharmony_ci # Final step is "if result > mod, subtract mod", but we do it 1071e1051a39Sopenharmony_ci # "other way around", namely write result - mod to output buffer 1072e1051a39Sopenharmony_ci # and if subtraction borrowed, add modulus back. 
1073e1051a39Sopenharmony_ci 1074e1051a39Sopenharmony_ci &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); 1075e1051a39Sopenharmony_ci &sub ("eax",-1); 1076e1051a39Sopenharmony_ci &mov ("edx",&DWP((($j+3)%8)*4,"esp")); 1077e1051a39Sopenharmony_ci &sbb ("ebx",-1); 1078e1051a39Sopenharmony_ci &mov (&DWP(0*4,"ebp"),"eax"); 1079e1051a39Sopenharmony_ci &sbb ("ecx",-1); 1080e1051a39Sopenharmony_ci &mov (&DWP(1*4,"ebp"),"ebx"); 1081e1051a39Sopenharmony_ci &sbb ("edx",0); 1082e1051a39Sopenharmony_ci &mov (&DWP(2*4,"ebp"),"ecx"); 1083e1051a39Sopenharmony_ci &mov (&DWP(3*4,"ebp"),"edx"); 1084e1051a39Sopenharmony_ci 1085e1051a39Sopenharmony_ci &mov ("eax",&DWP((($j+4)%8)*4,"esp")); 1086e1051a39Sopenharmony_ci &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); 1087e1051a39Sopenharmony_ci &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); 1088e1051a39Sopenharmony_ci &sbb ("eax",0); 1089e1051a39Sopenharmony_ci &mov ("edx",&DWP((($j+7)%8)*4,"esp")); 1090e1051a39Sopenharmony_ci &sbb ("ebx",0); 1091e1051a39Sopenharmony_ci &sbb ("ecx",1); 1092e1051a39Sopenharmony_ci &sbb ("edx",-1); 1093e1051a39Sopenharmony_ci &sbb ("edi",0); 1094e1051a39Sopenharmony_ci 1095e1051a39Sopenharmony_ci # Note that because mod has special form, i.e. consists of 1096e1051a39Sopenharmony_ci # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1097e1051a39Sopenharmony_ci # assigning borrow bit to one register, %ebp, and its negative 1098e1051a39Sopenharmony_ci # to another, %esi. But we started by calculating %esi... 
# IALU path, final step: %edi is expected to be 0 or all-ones after the
# trial subtraction, %esi = -%edi is then 0 or 1, and the pair is used to
# add either the modulus or zero back to the stored result.
	&sub	("esi","edi");
	&add	(&DWP(0*4,"ebp"),"edi");	# add modulus or zero
	&adc	(&DWP(1*4,"ebp"),"edi");
	&adc	(&DWP(2*4,"ebp"),"edi");
	&adc	(&DWP(3*4,"ebp"),0);
	&adc	("eax",0);
	&adc	("ebx",0);
	&mov	(&DWP(4*4,"ebp"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(5*4,"ebp"),"ebx");
	&adc	("edx","edi");
	&mov	(&DWP(6*4,"ebp"),"ecx");
	&mov	("edi","ebp");			# fulfill contract
	&mov	(&DWP(7*4,"ebp"),"edx");

	&add	("esp",10*4);
	&ret	();
&function_end_B("_ecp_nistz256_mul_mont");

########################################################################
# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi,
#					int ebp);
#
# Copies the 96 bytes at esi into the table at edi as 24 dwords: dword k
# is written to edi + 4*(ebp-1) + 64*k, i.e. the index is treated as
# 1-based and consecutive dwords of the point land 64 bytes apart.
&function_begin("ecp_nistz256_scatter_w5");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	# edi = table + 4*(index-1), biased by +128; the stores below
	# compensate with displacements 64*k-128 (presumably so that all
	# four displacements fit in a signed byte).
	&lea	("edi",&DWP(128-4,"edi","ebp",4));
	&mov	("ebp",96/16);			# 6 iterations, 16 bytes each
&set_label("scatter_w5_loop");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&lea	("esi",&DWP(16,"esi"));
	&mov	(&DWP(64*0-128,"edi"),"eax");
	&mov	(&DWP(64*1-128,"edi"),"ebx");
	&mov	(&DWP(64*2-128,"edi"),"ecx");
	&mov	(&DWP(64*3-128,"edi"),"edx");
	&lea	("edi",&DWP(64*4,"edi"));	# advance by four 64-byte rows
	&dec	("ebp");
	&jnz	(&label("scatter_w5_loop"));
&function_end("ecp_nistz256_scatter_w5");

########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi,
#					int ebp);
#
# Inverse of ecp_nistz256_scatter_w5: reads 24 dwords, dword k from
# esi + 4*(ebp-1) + 64*k, and stores them contiguously at edi.  For
# ebp == 0 the mask computed below is zero, so the output is all zeros
# (the load address is then esi + 64*k, not one entry before the table).
&function_begin("ecp_nistz256_gather_w5");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&lea	("esi",&DWP(0,"esi","ebp",4));	# esi += 4*index
	&neg	("ebp");
	&sar	("ebp",31);			# ebp = index>0 ? -1 : 0
	&mov	("edi",&wparam(0));
	&lea	("esi",&DWP(0,"esi","ebp",4));	# esi -= 4 if index is non-zero

	# The whole loop is unrolled at generation time: 6 groups of
	# 4 loads, 4 masks and 4 stores.
	for($i=0;$i<24;$i+=4) {
	&mov	("eax",&DWP(64*($i+0),"esi"));
	&mov	("ebx",&DWP(64*($i+1),"esi"));
	&mov	("ecx",&DWP(64*($i+2),"esi"));
	&mov	("edx",&DWP(64*($i+3),"esi"));
	&and	("eax","ebp");
	&and	("ebx","ebp");
	&and	("ecx","ebp");
	&and	("edx","ebp");
	&mov	(&DWP(4*($i+0),"edi"),"eax");
	&mov	(&DWP(4*($i+1),"edi"),"ebx");
	&mov	(&DWP(4*($i+2),"edi"),"ecx");
	&mov	(&DWP(4*($i+3),"edi"),"edx");
	}
&function_end("ecp_nistz256_gather_w5");

########################################################################
# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi,
#					int ebp);
#
# Copies the 64 bytes at esi into the table at edi byte by byte: byte k
# is written to edi + ebp + 64*k.  Unlike scatter_w5, no -1 adjustment
# is applied to the index here.
&function_begin("ecp_nistz256_scatter_w7");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&lea	("edi",&DWP(0,"edi","ebp"));	# edi = table + index
	&mov	("ebp",64/4);			# 16 iterations, 4 bytes each
&set_label("scatter_w7_loop");
	&mov	("eax",&DWP(0,"esi"));
	&lea	("esi",&DWP(4,"esi"));
	&mov	(&BP(64*0,"edi"),"al");
	&mov	(&BP(64*1,"edi"),"ah");
	&shr	("eax",16);			# expose the upper two bytes
	&mov	(&BP(64*2,"edi"),"al");
	&mov	(&BP(64*3,"edi"),"ah");
	&lea	("edi",&DWP(64*4,"edi"));	# advance by four 64-byte rows
	&dec	("ebp");
	&jnz	(&label("scatter_w7_loop"));
&function_end("ecp_nistz256_scatter_w7");

########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi,
#					int ebp);
#
# Reads 64 bytes, byte k from esi + (ebp-1) + 64*k, and stores them
# contiguously at edi.  For ebp == 0 the mask computed below is zero, so
# the output is all zeros.
&function_begin("ecp_nistz256_gather_w7");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));

	&add	("esi","ebp");			# esi += index
	&neg	("ebp");
	&sar	("ebp",31);			# ebp = index>0 ? -1 : 0
	&mov	("edi",&wparam(0));
	&lea	("esi",&DWP(0,"esi","ebp"));	# esi -= 1 if index is non-zero

	# Unrolled at generation time; loads, masks and stores are
	# interleaved, so the statement order below is kept exactly.
	for($i=0;$i<64;$i+=4) {
	&movz	("eax",&BP(64*($i+0),"esi"));
	&movz	("ebx",&BP(64*($i+1),"esi"));
	&movz	("ecx",&BP(64*($i+2),"esi"));
	&and	("eax","ebp");
	&movz	("edx",&BP(64*($i+3),"esi"));
	&and	("ebx","ebp");
	&mov	(&BP($i+0,"edi"),"al");
	&and	("ecx","ebp");
	&mov	(&BP($i+1,"edi"),"bl");
	&and	("edx","ebp");
	&mov	(&BP($i+2,"edi"),"cl");
	&mov	(&BP($i+3,"edi"),"dl");
	}
&function_end("ecp_nistz256_gather_w7");

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
&static_label("point_double_shortcut");
&function_begin("ecp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

	&mov	("esi",&wparam(1));

	# above map() describes stack layout with 5 temporary
	# 256-bit vectors on top, then we take extra word for
1240e1051a39Sopenharmony_ci # OPENSSL_ia32cap_P copy. 1241e1051a39Sopenharmony_ci &stack_push(8*5+1); 1242e1051a39Sopenharmony_ci if ($sse2) { 1243e1051a39Sopenharmony_ci &call ("_picup_eax"); 1244e1051a39Sopenharmony_ci &set_label("pic"); 1245e1051a39Sopenharmony_ci &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1246e1051a39Sopenharmony_ci &mov ("ebp",&DWP(0,"edx")); } 1247e1051a39Sopenharmony_ci 1248e1051a39Sopenharmony_ci&set_label("point_double_shortcut"); 1249e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"esi")); # copy in_x 1250e1051a39Sopenharmony_ci &mov ("ebx",&DWP(4,"esi")); 1251e1051a39Sopenharmony_ci &mov ("ecx",&DWP(8,"esi")); 1252e1051a39Sopenharmony_ci &mov ("edx",&DWP(12,"esi")); 1253e1051a39Sopenharmony_ci &mov (&DWP($in_x+0,"esp"),"eax"); 1254e1051a39Sopenharmony_ci &mov (&DWP($in_x+4,"esp"),"ebx"); 1255e1051a39Sopenharmony_ci &mov (&DWP($in_x+8,"esp"),"ecx"); 1256e1051a39Sopenharmony_ci &mov (&DWP($in_x+12,"esp"),"edx"); 1257e1051a39Sopenharmony_ci &mov ("eax",&DWP(16,"esi")); 1258e1051a39Sopenharmony_ci &mov ("ebx",&DWP(20,"esi")); 1259e1051a39Sopenharmony_ci &mov ("ecx",&DWP(24,"esi")); 1260e1051a39Sopenharmony_ci &mov ("edx",&DWP(28,"esi")); 1261e1051a39Sopenharmony_ci &mov (&DWP($in_x+16,"esp"),"eax"); 1262e1051a39Sopenharmony_ci &mov (&DWP($in_x+20,"esp"),"ebx"); 1263e1051a39Sopenharmony_ci &mov (&DWP($in_x+24,"esp"),"ecx"); 1264e1051a39Sopenharmony_ci &mov (&DWP($in_x+28,"esp"),"edx"); 1265e1051a39Sopenharmony_ci &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy 1266e1051a39Sopenharmony_ci 1267e1051a39Sopenharmony_ci &lea ("ebp",&DWP(32,"esi")); 1268e1051a39Sopenharmony_ci &lea ("esi",&DWP(32,"esi")); 1269e1051a39Sopenharmony_ci &lea ("edi",&DWP($S,"esp")); 1270e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); 1271e1051a39Sopenharmony_ci 1272e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1273e1051a39Sopenharmony_ci &mov ("esi",64); 1274e1051a39Sopenharmony_ci &add 
("esi",&wparam(1)); 1275e1051a39Sopenharmony_ci &lea ("edi",&DWP($Zsqr,"esp")); 1276e1051a39Sopenharmony_ci &mov ("ebp","esi"); 1277e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); 1278e1051a39Sopenharmony_ci 1279e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1280e1051a39Sopenharmony_ci &lea ("esi",&DWP($S,"esp")); 1281e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S,"esp")); 1282e1051a39Sopenharmony_ci &lea ("edi",&DWP($S,"esp")); 1283e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); 1284e1051a39Sopenharmony_ci 1285e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1286e1051a39Sopenharmony_ci &mov ("ebp",&wparam(1)); 1287e1051a39Sopenharmony_ci &lea ("esi",&DWP(32,"ebp")); 1288e1051a39Sopenharmony_ci &lea ("ebp",&DWP(64,"ebp")); 1289e1051a39Sopenharmony_ci &lea ("edi",&DWP($tmp0,"esp")); 1290e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); 1291e1051a39Sopenharmony_ci 1292e1051a39Sopenharmony_ci &lea ("esi",&DWP($in_x,"esp")); 1293e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Zsqr,"esp")); 1294e1051a39Sopenharmony_ci &lea ("edi",&DWP($M,"esp")); 1295e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); 1296e1051a39Sopenharmony_ci 1297e1051a39Sopenharmony_ci &mov ("edi",64); 1298e1051a39Sopenharmony_ci &lea ("esi",&DWP($tmp0,"esp")); 1299e1051a39Sopenharmony_ci &lea ("ebp",&DWP($tmp0,"esp")); 1300e1051a39Sopenharmony_ci &add ("edi",&wparam(0)); 1301e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); 1302e1051a39Sopenharmony_ci 1303e1051a39Sopenharmony_ci &lea ("esi",&DWP($in_x,"esp")); 1304e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Zsqr,"esp")); 1305e1051a39Sopenharmony_ci &lea ("edi",&DWP($Zsqr,"esp")); 1306e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); 1307e1051a39Sopenharmony_ci 1308e1051a39Sopenharmony_ci 
&mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1309e1051a39Sopenharmony_ci &lea ("esi",&DWP($S,"esp")); 1310e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S,"esp")); 1311e1051a39Sopenharmony_ci &lea ("edi",&DWP($tmp0,"esp")); 1312e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); 1313e1051a39Sopenharmony_ci 1314e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1315e1051a39Sopenharmony_ci &lea ("esi",&DWP($M,"esp")); 1316e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Zsqr,"esp")); 1317e1051a39Sopenharmony_ci &lea ("edi",&DWP($M,"esp")); 1318e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); 1319e1051a39Sopenharmony_ci 1320e1051a39Sopenharmony_ci &mov ("edi",32); 1321e1051a39Sopenharmony_ci &lea ("esi",&DWP($tmp0,"esp")); 1322e1051a39Sopenharmony_ci &add ("edi",&wparam(0)); 1323e1051a39Sopenharmony_ci &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); 1324e1051a39Sopenharmony_ci 1325e1051a39Sopenharmony_ci &lea ("esi",&DWP($M,"esp")); 1326e1051a39Sopenharmony_ci &lea ("ebp",&DWP($M,"esp")); 1327e1051a39Sopenharmony_ci &lea ("edi",&DWP($tmp0,"esp")); 1328e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); 1329e1051a39Sopenharmony_ci 1330e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1331e1051a39Sopenharmony_ci &lea ("esi",&DWP($in_x,"esp")); 1332e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S,"esp")); 1333e1051a39Sopenharmony_ci &lea ("edi",&DWP($S,"esp")); 1334e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); 1335e1051a39Sopenharmony_ci 1336e1051a39Sopenharmony_ci &lea ("esi",&DWP($tmp0,"esp")); 1337e1051a39Sopenharmony_ci &lea ("ebp",&DWP($M,"esp")); 1338e1051a39Sopenharmony_ci &lea ("edi",&DWP($M,"esp")); 1339e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); 1340e1051a39Sopenharmony_ci 1341e1051a39Sopenharmony_ci &lea 
("esi",&DWP($S,"esp")); 1342e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S,"esp")); 1343e1051a39Sopenharmony_ci &lea ("edi",&DWP($tmp0,"esp")); 1344e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); 1345e1051a39Sopenharmony_ci 1346e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1347e1051a39Sopenharmony_ci &lea ("esi",&DWP($M,"esp")); 1348e1051a39Sopenharmony_ci &lea ("ebp",&DWP($M,"esp")); 1349e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 1350e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); 1351e1051a39Sopenharmony_ci 1352e1051a39Sopenharmony_ci &mov ("esi","edi"); # %edi is still res_x here 1353e1051a39Sopenharmony_ci &lea ("ebp",&DWP($tmp0,"esp")); 1354e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); 1355e1051a39Sopenharmony_ci 1356e1051a39Sopenharmony_ci &lea ("esi",&DWP($S,"esp")); 1357e1051a39Sopenharmony_ci &mov ("ebp","edi"); # %edi is still res_x 1358e1051a39Sopenharmony_ci &lea ("edi",&DWP($S,"esp")); 1359e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); 1360e1051a39Sopenharmony_ci 1361e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1362e1051a39Sopenharmony_ci &mov ("esi","edi"); # %edi is still &S 1363e1051a39Sopenharmony_ci &lea ("ebp",&DWP($M,"esp")); 1364e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); 1365e1051a39Sopenharmony_ci 1366e1051a39Sopenharmony_ci &mov ("ebp",32); 1367e1051a39Sopenharmony_ci &lea ("esi",&DWP($S,"esp")); 1368e1051a39Sopenharmony_ci &add ("ebp",&wparam(0)); 1369e1051a39Sopenharmony_ci &mov ("edi","ebp"); 1370e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); 1371e1051a39Sopenharmony_ci 1372e1051a39Sopenharmony_ci &stack_pop(8*5+1); 1373e1051a39Sopenharmony_ci} &function_end("ecp_nistz256_point_double"); 1374e1051a39Sopenharmony_ci 
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#					const P256_POINT *in2);
#
# Both inputs are first copied to the stack frame; while copying, the
# eight words of each Z coordinate are OR-ed together to derive the
# ~in1infty / ~in2infty masks (-1 when Z is non-zero, 0 otherwise).
# When H and R are both zero and neither input is at infinity, control
# is handed to point_double_shortcut inside ecp_nistz256_point_double.
# Otherwise the sum is computed and the output is selected word by word
# with the masks: the computed result when both inputs are finite, in2
# when only in1 is at infinity, in1 when in2 is at infinity.
&function_begin("ecp_nistz256_point_add");
{
    my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,$in2_z,
	$H,$Hsqr,$R,$Rsqr,$Hcub,
	$U1,$U2,$S1,$S2)=map(32*$_,(0..17));
    # Z1sqr/Z2sqr share storage with Hsqr/Rsqr.
    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

	&mov	("esi",&wparam(2));

	# above map() describes stack layout with 18 temporary
	# 256-bit vectors on top, then we take extra words for
	# ~in1infty, ~in2infty, result of check for zero and
	# OPENSSL_ia32cap_P copy. [one unused word for padding]
	&stack_push(8*18+5);
	if ($sse2) {
	    &call	("_picup_eax");
	  &set_label("pic");
	    &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
	    &mov	("ebp",&DWP(0,"edx"));
	}

	&lea	("edi",&DWP($in2_x,"esp"));
	for($i=0;$i<96;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in2
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	# park the capability word before ebp is reused as accumulator
	&mov	(&DWP(32*18+12,"esp"),"ebp")	if ($i==0);
	&mov	("ebp","eax")			if ($i==64);
	&or	("ebp","eax")			if ($i>64);
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx")			if ($i>=64);
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx")			if ($i>=64);
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx")			if ($i>=64);
	}
	&xor	("eax","eax");
	&mov	("esi",&wparam(1));
	&sub	("eax","ebp");
	&or	("ebp","eax");
	&sar	("ebp",31);			# -1 if any in2_z word is non-zero
	&mov	(&DWP(32*18+4,"esp"),"ebp");	# ~in2infty

	&lea	("edi",&DWP($in1_x,"esp"));
	for($i=0;$i<96;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in1
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	&mov	("ebp","eax")			if ($i==64);
	&or	("ebp","eax")			if ($i>64);
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx")			if ($i>=64);
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx")			if ($i>=64);
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx")			if ($i>=64);
	}
	&xor	("eax","eax");
	&sub	("eax","ebp");
	&or	("ebp","eax");
	&sar	("ebp",31);			# -1 if any in1_z word is non-zero
	&mov	(&DWP(32*18+0,"esp"),"ebp");	# ~in1infty

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_z,"esp"));
	&lea	("ebp",&DWP($in2_z,"esp"));
	&lea	("edi",&DWP($Z2sqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z2sqr, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in1_z,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($Z1sqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z1sqr, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Z2sqr,"esp"));
	&lea	("ebp",&DWP($in2_z,"esp"));
	&lea	("edi",&DWP($S1,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S1, Z2sqr, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Z1sqr,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Z1sqr, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in1_y,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($S1,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S1, S1, in1_y);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S2, in2_y);

	&lea	("esi",&DWP($S2,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($R,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(R, S2, S1);

	# NOTE(review): the zero test combines eax/ebx/ecx/edx with the
	# first four words stored at edi, so _ecp_nistz256_sub presumably
	# leaves the remaining result words in those registers and edi
	# intact -- confirm against the helper.
	&or	("ebx","eax");			# see if result is zero
	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&or	("ebx","ecx");
	&or	("ebx","edx");
	&or	("ebx",&DWP(0,"edi"));
	&or	("ebx",&DWP(4,"edi"));
	 &lea	("esi",&DWP($in1_x,"esp"));
	&or	("ebx",&DWP(8,"edi"));
	 &lea	("ebp",&DWP($Z2sqr,"esp"));
	&or	("ebx",&DWP(12,"edi"));
	 &lea	("edi",&DWP($U1,"esp"));
	&mov	(&DWP(32*18+8,"esp"),"ebx");

	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U1, in1_x, Z2sqr);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_x,"esp"));
	&lea	("ebp",&DWP($Z1sqr,"esp"));
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, in2_x, Z1sqr);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($U1,"esp"));
	&lea	("edi",&DWP($H,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(H, U2, U1);

	&or	("eax","ebx");			# see if result is zero
	&or	("eax","ecx");
	&or	("eax","edx");
	&or	("eax",&DWP(0,"edi"));
	&or	("eax",&DWP(4,"edi"));
	&or	("eax",&DWP(8,"edi"));
	&or	("eax",&DWP(12,"edi"));		# ~is_equal(U1,U2)

	&mov	("ebx",&DWP(32*18+0,"esp"));	# ~in1infty
	&not	("ebx");			# -1/0 -> 0/-1
	&or	("eax","ebx");
	&mov	("ebx",&DWP(32*18+4,"esp"));	# ~in2infty
	&not	("ebx");			# -1/0 -> 0/-1
	&or	("eax","ebx");
	&or	("eax",&DWP(32*18+8,"esp"));	# ~is_equal(S1,S2)

	# if (~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
	&data_byte(0x3e);			# predict taken
	&jnz	(&label("add_proceed"));

&set_label("add_double",16);
	# Set up the registers point_double_shortcut expects and shrink
	# the frame to the size ecp_nistz256_point_double allocates.
	&mov	("esi",&wparam(1));
	&mov	("ebp",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&add	("esp",4*((8*18+5)-(8*5+1)));	# difference in frame sizes
	&jmp	(&label("point_double_shortcut"));

&set_label("add_proceed",16);
	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($R,"esp"));
	&lea	("edi",&DWP($Rsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Rsqr, R);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($in1_z,"esp"));
	&lea	("edi",&DWP($res_z,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, H, in1_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($H,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Hsqr, H);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($in2_z,"esp"));
	&lea	("ebp",&DWP($res_z,"esp"));
	&lea	("edi",&DWP($res_z,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, res_z, in2_z);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Hsqr,"esp"));
	&lea	("ebp",&DWP($U1,"esp"));
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, U1, Hsqr);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($Hcub,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(Hcub, Hsqr, H);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($U2,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(Hsqr, U2);

	&lea	("esi",&DWP($Rsqr,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, Rsqr, Hsqr);

	&lea	("esi",&DWP($res_x,"esp"));
	&lea	("ebp",&DWP($Hcub,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, Hcub);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($res_x,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, U2, res_x);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($Hcub,"esp"));
	&lea	("ebp",&DWP($S1,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S1, Hcub);

	&mov	("eax",&DWP(32*18+12,"esp"));	# OPENSSL_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($res_y,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_y, R, res_y);

	&lea	("esi",&DWP($res_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, res_y, S2);

	&mov	("ebp",&DWP(32*18+0,"esp"));	# ~in1infty
	&mov	("esi",&DWP(32*18+4,"esp"));	# ~in2infty
	&mov	("edi",&wparam(0));
	&mov	("edx","ebp");
	&not	("ebp");
	&and	("edx","esi");			# ~in1infty & ~in2infty
	&and	("ebp","esi");			# in1infty & ~in2infty
	&not	("esi");			# in2infty

	########################################
	# conditional moves
	# Z words (offsets 64..95) are emitted first, then X and Y.
	for($i=64;$i<96;$i+=4) {
	&mov	("eax","edx");			# ~in1infty & ~in2infty
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp");			# in1infty & ~in2infty
	&and	("ebx",&DWP($in2_x+$i,"esp"));
	&mov	("ecx","esi");			# in2infty
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax","ebx");
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
	}
	for($i=0;$i<64;$i+=4) {
	&mov	("eax","edx");			# ~in1infty & ~in2infty
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp");			# in1infty & ~in2infty
	&and	("ebx",&DWP($in2_x+$i,"esp"));
	&mov	("ecx","esi");			# in2infty
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax","ebx");
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
	}
    &set_label("add_done");
	&stack_pop(8*18+5);
}
&function_end("ecp_nistz256_point_add");

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,
#					const P256_POINT *in1,
#					const P256_POINT_AFFINE *in2);
&function_begin("ecp_nistz256_point_add_affine");
{
    my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,
$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1659e1051a39Sopenharmony_ci my $Z1sqr = $S2; 1660e1051a39Sopenharmony_ci my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1661e1051a39Sopenharmony_ci 1662e1051a39Sopenharmony_ci &mov ("esi",&wparam(1)); 1663e1051a39Sopenharmony_ci 1664e1051a39Sopenharmony_ci # above map() describes stack layout with 15 temporary 1665e1051a39Sopenharmony_ci # 256-bit vectors on top, then we take extra words for 1666e1051a39Sopenharmony_ci # ~in1infty, ~in2infty, and OPENSSL_ia32cap_P copy. 1667e1051a39Sopenharmony_ci &stack_push(8*15+3); 1668e1051a39Sopenharmony_ci if ($sse2) { 1669e1051a39Sopenharmony_ci &call ("_picup_eax"); 1670e1051a39Sopenharmony_ci &set_label("pic"); 1671e1051a39Sopenharmony_ci &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1672e1051a39Sopenharmony_ci &mov ("ebp",&DWP(0,"edx")); } 1673e1051a39Sopenharmony_ci 1674e1051a39Sopenharmony_ci &lea ("edi",&DWP($in1_x,"esp")); 1675e1051a39Sopenharmony_ci for($i=0;$i<96;$i+=16) { 1676e1051a39Sopenharmony_ci &mov ("eax",&DWP($i+0,"esi")); # copy in1 1677e1051a39Sopenharmony_ci &mov ("ebx",&DWP($i+4,"esi")); 1678e1051a39Sopenharmony_ci &mov ("ecx",&DWP($i+8,"esi")); 1679e1051a39Sopenharmony_ci &mov ("edx",&DWP($i+12,"esi")); 1680e1051a39Sopenharmony_ci &mov (&DWP($i+0,"edi"),"eax"); 1681e1051a39Sopenharmony_ci &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); 1682e1051a39Sopenharmony_ci &mov ("ebp","eax") if ($i==64); 1683e1051a39Sopenharmony_ci &or ("ebp","eax") if ($i>64); 1684e1051a39Sopenharmony_ci &mov (&DWP($i+4,"edi"),"ebx"); 1685e1051a39Sopenharmony_ci &or ("ebp","ebx") if ($i>=64); 1686e1051a39Sopenharmony_ci &mov (&DWP($i+8,"edi"),"ecx"); 1687e1051a39Sopenharmony_ci &or ("ebp","ecx") if ($i>=64); 1688e1051a39Sopenharmony_ci &mov (&DWP($i+12,"edi"),"edx"); 1689e1051a39Sopenharmony_ci &or ("ebp","edx") if ($i>=64); 1690e1051a39Sopenharmony_ci } 1691e1051a39Sopenharmony_ci &xor ("eax","eax"); 1692e1051a39Sopenharmony_ci &mov ("esi",&wparam(2)); 
1693e1051a39Sopenharmony_ci &sub ("eax","ebp"); 1694e1051a39Sopenharmony_ci &or ("ebp","eax"); 1695e1051a39Sopenharmony_ci &sar ("ebp",31); 1696e1051a39Sopenharmony_ci &mov (&DWP(32*15+0,"esp"),"ebp"); # ~in1infty 1697e1051a39Sopenharmony_ci 1698e1051a39Sopenharmony_ci &lea ("edi",&DWP($in2_x,"esp")); 1699e1051a39Sopenharmony_ci for($i=0;$i<64;$i+=16) { 1700e1051a39Sopenharmony_ci &mov ("eax",&DWP($i+0,"esi")); # copy in2 1701e1051a39Sopenharmony_ci &mov ("ebx",&DWP($i+4,"esi")); 1702e1051a39Sopenharmony_ci &mov ("ecx",&DWP($i+8,"esi")); 1703e1051a39Sopenharmony_ci &mov ("edx",&DWP($i+12,"esi")); 1704e1051a39Sopenharmony_ci &mov (&DWP($i+0,"edi"),"eax"); 1705e1051a39Sopenharmony_ci &mov ("ebp","eax") if ($i==0); 1706e1051a39Sopenharmony_ci &or ("ebp","eax") if ($i!=0); 1707e1051a39Sopenharmony_ci &mov (&DWP($i+4,"edi"),"ebx"); 1708e1051a39Sopenharmony_ci &or ("ebp","ebx"); 1709e1051a39Sopenharmony_ci &mov (&DWP($i+8,"edi"),"ecx"); 1710e1051a39Sopenharmony_ci &or ("ebp","ecx"); 1711e1051a39Sopenharmony_ci &mov (&DWP($i+12,"edi"),"edx"); 1712e1051a39Sopenharmony_ci &or ("ebp","edx"); 1713e1051a39Sopenharmony_ci } 1714e1051a39Sopenharmony_ci &xor ("ebx","ebx"); 1715e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1716e1051a39Sopenharmony_ci &sub ("ebx","ebp"); 1717e1051a39Sopenharmony_ci &lea ("esi",&DWP($in1_z,"esp")); 1718e1051a39Sopenharmony_ci &or ("ebx","ebp"); 1719e1051a39Sopenharmony_ci &lea ("ebp",&DWP($in1_z,"esp")); 1720e1051a39Sopenharmony_ci &sar ("ebx",31); 1721e1051a39Sopenharmony_ci &lea ("edi",&DWP($Z1sqr,"esp")); 1722e1051a39Sopenharmony_ci &mov (&DWP(32*15+4,"esp"),"ebx"); # ~in2infty 1723e1051a39Sopenharmony_ci 1724e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 1725e1051a39Sopenharmony_ci 1726e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1727e1051a39Sopenharmony_ci &lea ("esi",&DWP($in2_x,"esp")); 1728e1051a39Sopenharmony_ci &mov 
("ebp","edi"); # %esi is stull &Z1sqr 1729e1051a39Sopenharmony_ci &lea ("edi",&DWP($U2,"esp")); 1730e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); 1731e1051a39Sopenharmony_ci 1732e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1733e1051a39Sopenharmony_ci &lea ("esi",&DWP($in1_z,"esp")); 1734e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Z1sqr,"esp")); 1735e1051a39Sopenharmony_ci &lea ("edi",&DWP($S2,"esp")); 1736e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 1737e1051a39Sopenharmony_ci 1738e1051a39Sopenharmony_ci &lea ("esi",&DWP($U2,"esp")); 1739e1051a39Sopenharmony_ci &lea ("ebp",&DWP($in1_x,"esp")); 1740e1051a39Sopenharmony_ci &lea ("edi",&DWP($H,"esp")); 1741e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); 1742e1051a39Sopenharmony_ci 1743e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1744e1051a39Sopenharmony_ci &lea ("esi",&DWP($in2_y,"esp")); 1745e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S2,"esp")); 1746e1051a39Sopenharmony_ci &lea ("edi",&DWP($S2,"esp")); 1747e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1748e1051a39Sopenharmony_ci 1749e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1750e1051a39Sopenharmony_ci &lea ("esi",&DWP($in1_z,"esp")); 1751e1051a39Sopenharmony_ci &lea ("ebp",&DWP($H,"esp")); 1752e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_z,"esp")); 1753e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1754e1051a39Sopenharmony_ci 1755e1051a39Sopenharmony_ci &lea ("esi",&DWP($S2,"esp")); 1756e1051a39Sopenharmony_ci &lea ("ebp",&DWP($in1_y,"esp")); 1757e1051a39Sopenharmony_ci &lea ("edi",&DWP($R,"esp")); 1758e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); 1759e1051a39Sopenharmony_ci 1760e1051a39Sopenharmony_ci &mov 
("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1761e1051a39Sopenharmony_ci &lea ("esi",&DWP($H,"esp")); 1762e1051a39Sopenharmony_ci &lea ("ebp",&DWP($H,"esp")); 1763e1051a39Sopenharmony_ci &lea ("edi",&DWP($Hsqr,"esp")); 1764e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1765e1051a39Sopenharmony_ci 1766e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1767e1051a39Sopenharmony_ci &lea ("esi",&DWP($R,"esp")); 1768e1051a39Sopenharmony_ci &lea ("ebp",&DWP($R,"esp")); 1769e1051a39Sopenharmony_ci &lea ("edi",&DWP($Rsqr,"esp")); 1770e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1771e1051a39Sopenharmony_ci 1772e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1773e1051a39Sopenharmony_ci &lea ("esi",&DWP($in1_x,"esp")); 1774e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Hsqr,"esp")); 1775e1051a39Sopenharmony_ci &lea ("edi",&DWP($U2,"esp")); 1776e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); 1777e1051a39Sopenharmony_ci 1778e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1779e1051a39Sopenharmony_ci &lea ("esi",&DWP($H,"esp")); 1780e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Hsqr,"esp")); 1781e1051a39Sopenharmony_ci &lea ("edi",&DWP($Hcub,"esp")); 1782e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1783e1051a39Sopenharmony_ci 1784e1051a39Sopenharmony_ci &lea ("esi",&DWP($U2,"esp")); 1785e1051a39Sopenharmony_ci &lea ("ebp",&DWP($U2,"esp")); 1786e1051a39Sopenharmony_ci &lea ("edi",&DWP($Hsqr,"esp")); 1787e1051a39Sopenharmony_ci &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1788e1051a39Sopenharmony_ci 1789e1051a39Sopenharmony_ci &lea ("esi",&DWP($Rsqr,"esp")); 1790e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Hsqr,"esp")); 1791e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_x,"esp")); 1792e1051a39Sopenharmony_ci &call 
("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1793e1051a39Sopenharmony_ci 1794e1051a39Sopenharmony_ci &lea ("esi",&DWP($res_x,"esp")); 1795e1051a39Sopenharmony_ci &lea ("ebp",&DWP($Hcub,"esp")); 1796e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_x,"esp")); 1797e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1798e1051a39Sopenharmony_ci 1799e1051a39Sopenharmony_ci &lea ("esi",&DWP($U2,"esp")); 1800e1051a39Sopenharmony_ci &lea ("ebp",&DWP($res_x,"esp")); 1801e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_y,"esp")); 1802e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1803e1051a39Sopenharmony_ci 1804e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1805e1051a39Sopenharmony_ci &lea ("esi",&DWP($Hcub,"esp")); 1806e1051a39Sopenharmony_ci &lea ("ebp",&DWP($in1_y,"esp")); 1807e1051a39Sopenharmony_ci &lea ("edi",&DWP($S2,"esp")); 1808e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); 1809e1051a39Sopenharmony_ci 1810e1051a39Sopenharmony_ci &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1811e1051a39Sopenharmony_ci &lea ("esi",&DWP($R,"esp")); 1812e1051a39Sopenharmony_ci &lea ("ebp",&DWP($res_y,"esp")); 1813e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_y,"esp")); 1814e1051a39Sopenharmony_ci &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); 1815e1051a39Sopenharmony_ci 1816e1051a39Sopenharmony_ci &lea ("esi",&DWP($res_y,"esp")); 1817e1051a39Sopenharmony_ci &lea ("ebp",&DWP($S2,"esp")); 1818e1051a39Sopenharmony_ci &lea ("edi",&DWP($res_y,"esp")); 1819e1051a39Sopenharmony_ci &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); 1820e1051a39Sopenharmony_ci 1821e1051a39Sopenharmony_ci &mov ("ebp",&DWP(32*15+0,"esp")); # ~in1infty 1822e1051a39Sopenharmony_ci &mov ("esi",&DWP(32*15+4,"esp")); # ~in2infty 1823e1051a39Sopenharmony_ci &mov ("edi",&wparam(0)); 1824e1051a39Sopenharmony_ci &mov ("edx","ebp"); 
1825e1051a39Sopenharmony_ci ¬ ("ebp"); 1826e1051a39Sopenharmony_ci &and ("edx","esi"); # ~in1infty & ~in2infty 1827e1051a39Sopenharmony_ci &and ("ebp","esi"); # in1infty & ~in2infty 1828e1051a39Sopenharmony_ci ¬ ("esi"); # in2infty 1829e1051a39Sopenharmony_ci 1830e1051a39Sopenharmony_ci ######################################## 1831e1051a39Sopenharmony_ci # conditional moves 1832e1051a39Sopenharmony_ci for($i=64;$i<96;$i+=4) { 1833e1051a39Sopenharmony_ci my $one=@ONE_mont[($i-64)/4]; 1834e1051a39Sopenharmony_ci 1835e1051a39Sopenharmony_ci &mov ("eax","edx"); 1836e1051a39Sopenharmony_ci &and ("eax",&DWP($res_x+$i,"esp")); 1837e1051a39Sopenharmony_ci &mov ("ebx","ebp") if ($one && $one!=-1); 1838e1051a39Sopenharmony_ci &and ("ebx",$one) if ($one && $one!=-1); 1839e1051a39Sopenharmony_ci &mov ("ecx","esi"); 1840e1051a39Sopenharmony_ci &and ("ecx",&DWP($in1_x+$i,"esp")); 1841e1051a39Sopenharmony_ci &or ("eax",$one==-1?"ebp":"ebx") if ($one); 1842e1051a39Sopenharmony_ci &or ("eax","ecx"); 1843e1051a39Sopenharmony_ci &mov (&DWP($i,"edi"),"eax"); 1844e1051a39Sopenharmony_ci } 1845e1051a39Sopenharmony_ci for($i=0;$i<64;$i+=4) { 1846e1051a39Sopenharmony_ci &mov ("eax","edx"); # ~in1infty & ~in2infty 1847e1051a39Sopenharmony_ci &and ("eax",&DWP($res_x+$i,"esp")); 1848e1051a39Sopenharmony_ci &mov ("ebx","ebp"); # in1infty & ~in2infty 1849e1051a39Sopenharmony_ci &and ("ebx",&DWP($in2_x+$i,"esp")); 1850e1051a39Sopenharmony_ci &mov ("ecx","esi"); # in2infty 1851e1051a39Sopenharmony_ci &and ("ecx",&DWP($in1_x+$i,"esp")); 1852e1051a39Sopenharmony_ci &or ("eax","ebx"); 1853e1051a39Sopenharmony_ci &or ("eax","ecx"); 1854e1051a39Sopenharmony_ci &mov (&DWP($i,"edi"),"eax"); 1855e1051a39Sopenharmony_ci } 1856e1051a39Sopenharmony_ci &stack_pop(8*15+3); 1857e1051a39Sopenharmony_ci} &function_end("ecp_nistz256_point_add_affine"); 1858e1051a39Sopenharmony_ci 1859e1051a39Sopenharmony_ci&asm_finish(); 1860e1051a39Sopenharmony_ci 1861e1051a39Sopenharmony_ciclose STDOUT or die "error closing 
STDOUT: $!"; 1862