#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, a server-side
# operation. Keep in mind that +400% means a 5x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.rodata
___
########################################################################
# Convert ecp_nistz256_table.c to the layout expected by
# ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# The comparison is against 64*16*37-1 because $#arr returns the last
# valid index of @arr, not the number of elements.
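# (Illustrative note added here, not in the original: each TOBN(hi,lo)
# entry in ecp_nistz256_table.c encodes one 64-bit word, which the
# substitution above splits into two 32-bit halves, low half first. A
# P256_POINT_AFFINE is 2x256 bits, i.e. 16 such 32-bit words, and the
# table holds 37 sub-tables of 64 points each, hence 64*16*37 elements.)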
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion smashes P256_POINT_AFFINE by individual bytes at a
# 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.text

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	adrp	$bi,.LRR
	ldr	$bi,[$bi,:lo12:.LRR]	// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	adrp	$bp,.LRR		// &bp[0]
	add	$bp,$bp,:lo12:.LRR

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	adrp	$bp,.Lone		// &bp[0]
	add	$bp,$bp,:lo12:.Lone

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# A reduction iteration is normally performed by accumulating
	# the result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and to
	# the "magic" digit being equal to the least significant word,
	# it can be performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we can
	# rewrite the above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	#          - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or, marking redundant operations:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	#          - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below marks carrying into the high part of
	//  a multiplication result, which can't overflow, because it
	//  can never be all ones.
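	//
	// (Illustrative note, not from the original commentary: squaring
	// forms the six cross products a[i]*a[j], i<j, doubles them with
	// the adds/adcs chain further below, and only then adds in the
	// diagonal terms a[i]*a[i], since (a[0]+a[1]*2^64+...)^2 contains
	// every cross product exactly twice.)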

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# The above map() describes the stack layout, with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
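	// (Illustrative note, not from the original: the 96 bytes above
	// hold x29/x30 plus the callee-saved x19-x22, and "sub sp,sp,#32*4"
	// below reserves the four 256-bit temporaries S, M, Zsqr and tmp0
	// laid out by the map() above.)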
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	mov	$t0,$acc0
	ldr	$poly3,[$poly3,#24]
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# The above map() describes the stack layout, with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

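	// (Illustrative note, not from the original: csetm set
	// $in1infty/$in2infty to all-ones for finite inputs and to zero
	// for the point at infinity, while $temp0 and $acc0 are non-zero
	// when S1!=S2 and U1!=U2 respectively; the mvn/orr sequence below
	// folds all four conditions into one word, so cbnz takes the
	// generic-addition path and the fall-through handles doubling.)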
	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# The above map() describes the stack layout, with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	adrp	$poly3,.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
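	// (Illustrative note, not from the original: with an affine second
	// input, in2_z is implicitly 1 in Montgomery form, so U1 = in1_x
	// and S1 = in1_y require no Z2 multiplications; this is also why
	// the conditional-move epilogue below substitutes .Lone_mont for
	// the absent in2_z limbs.)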

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adrp	$bp_real,.Lone_mont-64
	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we can
	# rewrite the above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#          - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	#          + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below marks carrying into the high part of
	//  a multiplication result, which can't overflow, because it
	//  can never be all ones.
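	//
	// (Illustrative note, not from the original: reduction modulo the
	// group order has no special-form shortcut like the .Lpoly case,
	// so each step derives a Montgomery digit t4 = acc0*.LordK, where
	// .LordK is understood to be -1/ord mod 2^64, and folds ord*t4 in
	// with the mul/umulh/adds chains below.)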

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
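	// (Illustrative note, not from the original: a w5 table slot is one
	// 32-bit column, with the 24 words of a point strided 64 bytes
	// apart; the #64*k-4 offsets below compensate for the 1-based
	// index, matching the index-1 step in ecp_nistz256_gather_w5.)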
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
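	// (Illustrative note, not from the original: a w7 table stores each
	// 64-byte affine point byte-sliced at a 64-byte interval, the same
	// layout generated for ecp_nistz256_precomputed at the top of this
	// file, so ecp_nistz256_gather_w7 can reassemble 64-bit words one
	// byte per row.)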
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush