1#! /usr/bin/env perl 2# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for ARMv8 AES instructions. The 18# module is endian-agnostic in sense that it supports both big- and 19# little-endian cases. As does it support both 32- and 64-bit modes 20# of operation. Latter is achieved by limiting amount of utilized 21# registers to 16, which implies additional NEON load and integer 22# instructions. This has no effect on mighty Apple A7, where results 23# are literally equal to the theoretical estimates based on AES 24# instruction latencies and issue rates. On Cortex-A53, an in-order 25# execution core, this costs up to 10-15%, which is partially 26# compensated by implementing dedicated code path for 128-bit 27# CBC encrypt case. On Cortex-A57 parallelizable mode performance 28# seems to be limited by sheer amount of NEON instructions... 29# 30# April 2019 31# 32# Key to performance of parallelize-able modes is round instruction 33# interleaving. But which factor to use? There is optimal one for 34# each combination of instruction latency and issue rate, beyond 35# which increasing interleave factor doesn't pay off. While on cons 36# side we have code size increase and resource waste on platforms for 37# which interleave factor is too high. In other words you want it to 38# be just right. So far interleave factor of 3x was serving well all 39# platforms. But for ThunderX2 optimal interleave factor was measured 40# to be 5x... 41# 42# Performance in cycles per byte processed with 128-bit key: 43# 44# CBC enc CBC dec CTR 45# Apple A7 2.39 1.20 1.20 46# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 47# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 48# Cortex-A72 1.33 0.85/0.88 0.92/0.96 49# Denver 1.96 0.65/0.86 0.76/0.80 50# Mongoose 1.33 1.23/1.20 1.30/1.20 51# Kryo 1.26 0.87/0.94 1.00/1.00 52# ThunderX2 5.95 1.25 1.30 53# 54# (*) original 3.64/1.34/1.32 results were for r0p0 revision 55# and are still same even for updated module; 56# (**) numbers after slash are for 32-bit code, which is 3x- 57# interleaved; 58 59# $output is the last argument if it looks like a file (it has an extension) 60# $flavour is the first argument if it doesn't look like a file 61$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 62$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 63 64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 65( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 66( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 67die "can't locate arm-xlate.pl"; 68 69open OUT,"| \"$^X\" $xlate $flavour \"$output\"" 70 or die "can't call $xlate: $!"; 71*STDOUT=*OUT; 72 73$prefix="aes_v8"; 74 75$_byte = ($flavour =~ /win/ ? "DCB" : ".byte"); 76 77$code=<<___; 78#include "arm_arch.h" 79 80#if __ARM_MAX_ARCH__>=7 81___ 82$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 83$code.=<<___ if ($flavour !~ /64/); 84.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) 85.fpu neon 86#ifdef __thumb2__ 87.syntax unified 88.thumb 89# define INST(a,b,c,d) $_byte c,d|0xc,a,b 90#else 91.code 32 92# define INST(a,b,c,d) $_byte a,b,c,d 93#endif 94 95.text 96___ 97 98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 100# maintain both 32- and 64-bit codes within single module and 101# transliterate common code to either flavour with regex vodoo. 102# 103{{{ 104my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 105my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 107 108 109# 110# This file generates .s file for 64-bit and 32-bit CPUs. 111# We don't implement .rodata on 32-bit CPUs yet. 112# 113$code.=".rodata\n" if ($flavour =~ /64/); 114$code.=<<___; 115.align 5 116.Lrcon: 117.long 0x01,0x01,0x01,0x01 118.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 119.long 0x1b,0x1b,0x1b,0x1b 120___ 121$code.=".previous\n" if ($flavour =~ /64/); 122 123$code.=<<___; 124.globl ${prefix}_set_encrypt_key 125.type ${prefix}_set_encrypt_key,%function 126.align 5 127${prefix}_set_encrypt_key: 128.Lenc_key: 129___ 130$code.=<<___ if ($flavour =~ /64/); 131 stp x29,x30,[sp,#-16]! 132 add x29,sp,#0 133___ 134$code.=<<___; 135 mov $ptr,#-1 136 cmp $inp,#0 137 b.eq .Lenc_key_abort 138 cmp $out,#0 139 b.eq .Lenc_key_abort 140 mov $ptr,#-2 141 cmp $bits,#128 142 b.lt .Lenc_key_abort 143 cmp $bits,#256 144 b.gt .Lenc_key_abort 145 tst $bits,#0x3f 146 b.ne .Lenc_key_abort 147 148___ 149$code.=<<___ if ($flavour =~ /64/); 150 adrp $ptr,.Lrcon 151 add $ptr,$ptr,:lo12:.Lrcon 152___ 153$code.=<<___ if ($flavour =~ /32/); 154 adr $ptr,.Lrcon 155___ 156$code.=<<___; 157 cmp $bits,#192 158 159 veor $zero,$zero,$zero 160 vld1.8 {$in0},[$inp],#16 161 mov $bits,#8 // reuse $bits 162 vld1.32 {$rcon,$mask},[$ptr],#32 163 164 b.lt .Loop128 165 b.eq .L192 166 b .L256 167 168.align 4 169.Loop128: 170 vtbl.8 $key,{$in0},$mask 171 vext.8 $tmp,$zero,$in0,#12 172 vst1.32 {$in0},[$out],#16 173 aese $key,$zero 174 subs $bits,$bits,#1 175 176 veor $in0,$in0,$tmp 177 vext.8 $tmp,$zero,$tmp,#12 178 veor $in0,$in0,$tmp 179 vext.8 $tmp,$zero,$tmp,#12 180 veor $key,$key,$rcon 181 veor $in0,$in0,$tmp 182 vshl.u8 $rcon,$rcon,#1 183 veor $in0,$in0,$key 184 b.ne .Loop128 185 186 vld1.32 {$rcon},[$ptr] 187 188 vtbl.8 $key,{$in0},$mask 189 vext.8 $tmp,$zero,$in0,#12 190 vst1.32 {$in0},[$out],#16 191 aese $key,$zero 192 193 veor $in0,$in0,$tmp 194 vext.8 $tmp,$zero,$tmp,#12 195 veor $in0,$in0,$tmp 196 vext.8 $tmp,$zero,$tmp,#12 197 veor $key,$key,$rcon 198 veor $in0,$in0,$tmp 199 vshl.u8 $rcon,$rcon,#1 200 veor $in0,$in0,$key 201 202 vtbl.8 $key,{$in0},$mask 203 vext.8 $tmp,$zero,$in0,#12 204 vst1.32 {$in0},[$out],#16 205 aese $key,$zero 206 207 veor $in0,$in0,$tmp 208 vext.8 $tmp,$zero,$tmp,#12 209 veor $in0,$in0,$tmp 210 vext.8 $tmp,$zero,$tmp,#12 211 veor $key,$key,$rcon 212 veor $in0,$in0,$tmp 213 veor $in0,$in0,$key 214 vst1.32 {$in0},[$out] 215 add $out,$out,#0x50 216 217 mov $rounds,#10 218 b .Ldone 219 220.align 4 221.L192: 222 vld1.8 {$in1},[$inp],#8 223 vmov.i8 $key,#8 // borrow $key 224 vst1.32 {$in0},[$out],#16 225 vsub.i8 $mask,$mask,$key // adjust the mask 226 227.Loop192: 228 vtbl.8 $key,{$in1},$mask 229 vext.8 $tmp,$zero,$in0,#12 230#ifdef __ARMEB__ 231 vst1.32 {$in1},[$out],#16 232 sub $out,$out,#8 233#else 234 vst1.32 {$in1},[$out],#8 235#endif 236 aese $key,$zero 237 subs $bits,$bits,#1 238 239 veor $in0,$in0,$tmp 240 vext.8 $tmp,$zero,$tmp,#12 241 veor $in0,$in0,$tmp 242 vext.8 $tmp,$zero,$tmp,#12 243 veor $in0,$in0,$tmp 244 245 vdup.32 $tmp,${in0}[3] 246 veor $tmp,$tmp,$in1 247 veor $key,$key,$rcon 248 vext.8 $in1,$zero,$in1,#12 249 vshl.u8 $rcon,$rcon,#1 250 veor $in1,$in1,$tmp 251 veor $in0,$in0,$key 252 veor $in1,$in1,$key 253 vst1.32 {$in0},[$out],#16 254 b.ne .Loop192 255 256 mov $rounds,#12 257 add $out,$out,#0x20 258 b .Ldone 259 260.align 4 261.L256: 262 vld1.8 {$in1},[$inp] 263 mov $bits,#7 264 mov $rounds,#14 265 vst1.32 {$in0},[$out],#16 266 267.Loop256: 268 vtbl.8 $key,{$in1},$mask 269 vext.8 $tmp,$zero,$in0,#12 270 vst1.32 {$in1},[$out],#16 271 aese $key,$zero 272 subs $bits,$bits,#1 273 274 veor $in0,$in0,$tmp 275 vext.8 $tmp,$zero,$tmp,#12 276 veor $in0,$in0,$tmp 277 vext.8 $tmp,$zero,$tmp,#12 278 veor $key,$key,$rcon 279 veor $in0,$in0,$tmp 280 vshl.u8 $rcon,$rcon,#1 281 veor $in0,$in0,$key 282 vst1.32 {$in0},[$out],#16 283 b.eq .Ldone 284 285 vdup.32 $key,${in0}[3] // just splat 286 vext.8 $tmp,$zero,$in1,#12 287 aese $key,$zero 288 289 veor $in1,$in1,$tmp 290 vext.8 $tmp,$zero,$tmp,#12 291 veor $in1,$in1,$tmp 292 vext.8 $tmp,$zero,$tmp,#12 293 veor $in1,$in1,$tmp 294 295 veor $in1,$in1,$key 296 b .Loop256 297 298.Ldone: 299 str $rounds,[$out] 300 mov $ptr,#0 301 302.Lenc_key_abort: 303 mov x0,$ptr // return value 304 `"ldr x29,[sp],#16" if ($flavour =~ /64/)` 305 ret 306.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key 307 308.globl ${prefix}_set_decrypt_key 309.type ${prefix}_set_decrypt_key,%function 310.align 5 311${prefix}_set_decrypt_key: 312___ 313$code.=<<___ if ($flavour =~ /64/); 314 .inst 0xd503233f // paciasp 315 stp x29,x30,[sp,#-16]! 316 add x29,sp,#0 317___ 318$code.=<<___ if ($flavour !~ /64/); 319 stmdb sp!,{r4,lr} 320___ 321$code.=<<___; 322 bl .Lenc_key 323 324 cmp x0,#0 325 b.ne .Ldec_key_abort 326 327 sub $out,$out,#240 // restore original $out 328 mov x4,#-16 329 add $inp,$out,x12,lsl#4 // end of key schedule 330 331 vld1.32 {v0.16b},[$out] 332 vld1.32 {v1.16b},[$inp] 333 vst1.32 {v0.16b},[$inp],x4 334 vst1.32 {v1.16b},[$out],#16 335 336.Loop_imc: 337 vld1.32 {v0.16b},[$out] 338 vld1.32 {v1.16b},[$inp] 339 aesimc v0.16b,v0.16b 340 aesimc v1.16b,v1.16b 341 vst1.32 {v0.16b},[$inp],x4 342 vst1.32 {v1.16b},[$out],#16 343 cmp $inp,$out 344 b.hi .Loop_imc 345 346 vld1.32 {v0.16b},[$out] 347 aesimc v0.16b,v0.16b 348 vst1.32 {v0.16b},[$inp] 349 350 eor x0,x0,x0 // return value 351.Ldec_key_abort: 352___ 353$code.=<<___ if ($flavour !~ /64/); 354 ldmia sp!,{r4,pc} 355___ 356$code.=<<___ if ($flavour =~ /64/); 357 ldp x29,x30,[sp],#16 358 .inst 0xd50323bf // autiasp 359 ret 360___ 361$code.=<<___; 362.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key 363___ 364}}} 365{{{ 366sub gen_block () { 367my $dir = shift; 368my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); 369my ($inp,$out,$key)=map("x$_",(0..2)); 370my $rounds="w3"; 371my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 372 373$code.=<<___; 374.globl ${prefix}_${dir}crypt 375.type ${prefix}_${dir}crypt,%function 376.align 5 377${prefix}_${dir}crypt: 378 ldr $rounds,[$key,#240] 379 vld1.32 {$rndkey0},[$key],#16 380 vld1.8 {$inout},[$inp] 381 sub $rounds,$rounds,#2 382 vld1.32 {$rndkey1},[$key],#16 383 384.Loop_${dir}c: 385 aes$e $inout,$rndkey0 386 aes$mc $inout,$inout 387 vld1.32 {$rndkey0},[$key],#16 388 subs $rounds,$rounds,#2 389 aes$e $inout,$rndkey1 390 aes$mc $inout,$inout 391 vld1.32 {$rndkey1},[$key],#16 392 b.gt .Loop_${dir}c 393 394 aes$e $inout,$rndkey0 395 aes$mc $inout,$inout 396 vld1.32 {$rndkey0},[$key] 397 aes$e $inout,$rndkey1 398 veor $inout,$inout,$rndkey0 399 400 vst1.8 {$inout},[$out] 401 ret 402.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 403___ 404} 405&gen_block("en"); 406&gen_block("de"); 407}}} 408 409# Performance in cycles per byte. 410# Processed with AES-ECB different key size. 411# It shows the value before and after optimization as below: 412# (before/after): 413# 414# AES-128-ECB AES-192-ECB AES-256-ECB 415# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 416# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 417 418# Optimization is implemented by loop unrolling and interleaving. 419# Commonly, we choose the unrolling factor as 5, if the input 420# data size smaller than 5 blocks, but not smaller than 3 blocks, 421# choose 3 as the unrolling factor. 422# If the input data size dsize >= 5*16 bytes, then take 5 blocks 423# as one iteration, every loop the left size lsize -= 5*16. 424# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, 425# every loop lsize -=3*16. 426# If lsize < 3*16 bytes, treat them as the tail, interleave the 427# two blocks AES instructions. 428# There is one special case, if the original input data size dsize 429# = 16 bytes, we will treat it seperately to improve the 430# performance: one independent code block without LR, FP load and 431# store, just looks like what the original ECB implementation does. 432 433{{{ 434my ($inp,$out,$len,$key)=map("x$_",(0..3)); 435my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); 436my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 437 438my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 439 440### q7 last round key 441### q10-q15 q7 Last 7 round keys 442### q8-q9 preloaded round keys except last 7 keys for big size 443### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte 444 445{ 446my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 447 448my ($dat3,$in3,$tmp3); # used only in 64-bit mode 449my ($dat4,$in4,$tmp4); 450if ($flavour =~ /64/) { 451 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 452} 453 454$code.=<<___; 455.globl ${prefix}_ecb_encrypt 456.type ${prefix}_ecb_encrypt,%function 457.align 5 458${prefix}_ecb_encrypt: 459___ 460$code.=<<___ if ($flavour =~ /64/); 461 subs $len,$len,#16 462 // Original input data size bigger than 16, jump to big size processing. 463 b.ne .Lecb_big_size 464 vld1.8 {$dat0},[$inp] 465 cmp $enc,#0 // en- or decrypting? 466 ldr $rounds,[$key,#240] 467 vld1.32 {q5-q6},[$key],#32 // load key schedule... 468 469 b.eq .Lecb_small_dec 470 aese $dat0,q5 471 aesmc $dat0,$dat0 472 vld1.32 {q8-q9},[$key],#32 // load key schedule... 473 aese $dat0,q6 474 aesmc $dat0,$dat0 475 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing 476 b.eq .Lecb_128_enc 477.Lecb_round_loop: 478 aese $dat0,q8 479 aesmc $dat0,$dat0 480 vld1.32 {q8},[$key],#16 // load key schedule... 481 aese $dat0,q9 482 aesmc $dat0,$dat0 483 vld1.32 {q9},[$key],#16 // load key schedule... 484 subs $rounds,$rounds,#2 // bias 485 b.gt .Lecb_round_loop 486.Lecb_128_enc: 487 vld1.32 {q10-q11},[$key],#32 // load key schedule... 488 aese $dat0,q8 489 aesmc $dat0,$dat0 490 aese $dat0,q9 491 aesmc $dat0,$dat0 492 vld1.32 {q12-q13},[$key],#32 // load key schedule... 493 aese $dat0,q10 494 aesmc $dat0,$dat0 495 aese $dat0,q11 496 aesmc $dat0,$dat0 497 vld1.32 {q14-q15},[$key],#32 // load key schedule... 498 aese $dat0,q12 499 aesmc $dat0,$dat0 500 aese $dat0,q13 501 aesmc $dat0,$dat0 502 vld1.32 {$rndlast},[$key] 503 aese $dat0,q14 504 aesmc $dat0,$dat0 505 aese $dat0,q15 506 veor $dat0,$dat0,$rndlast 507 vst1.8 {$dat0},[$out] 508 b .Lecb_Final_abort 509.Lecb_small_dec: 510 aesd $dat0,q5 511 aesimc $dat0,$dat0 512 vld1.32 {q8-q9},[$key],#32 // load key schedule... 513 aesd $dat0,q6 514 aesimc $dat0,$dat0 515 subs $rounds,$rounds,#10 // bias 516 b.eq .Lecb_128_dec 517.Lecb_dec_round_loop: 518 aesd $dat0,q8 519 aesimc $dat0,$dat0 520 vld1.32 {q8},[$key],#16 // load key schedule... 521 aesd $dat0,q9 522 aesimc $dat0,$dat0 523 vld1.32 {q9},[$key],#16 // load key schedule... 524 subs $rounds,$rounds,#2 // bias 525 b.gt .Lecb_dec_round_loop 526.Lecb_128_dec: 527 vld1.32 {q10-q11},[$key],#32 // load key schedule... 528 aesd $dat0,q8 529 aesimc $dat0,$dat0 530 aesd $dat0,q9 531 aesimc $dat0,$dat0 532 vld1.32 {q12-q13},[$key],#32 // load key schedule... 533 aesd $dat0,q10 534 aesimc $dat0,$dat0 535 aesd $dat0,q11 536 aesimc $dat0,$dat0 537 vld1.32 {q14-q15},[$key],#32 // load key schedule... 538 aesd $dat0,q12 539 aesimc $dat0,$dat0 540 aesd $dat0,q13 541 aesimc $dat0,$dat0 542 vld1.32 {$rndlast},[$key] 543 aesd $dat0,q14 544 aesimc $dat0,$dat0 545 aesd $dat0,q15 546 veor $dat0,$dat0,$rndlast 547 vst1.8 {$dat0},[$out] 548 b .Lecb_Final_abort 549.Lecb_big_size: 550___ 551$code.=<<___ if ($flavour =~ /64/); 552 stp x29,x30,[sp,#-16]! 553 add x29,sp,#0 554___ 555$code.=<<___ if ($flavour !~ /64/); 556 mov ip,sp 557 stmdb sp!,{r4-r8,lr} 558 vstmdb sp!,{d8-d15} @ ABI specification says so 559 ldmia ip,{r4-r5} @ load remaining args 560 subs $len,$len,#16 561___ 562$code.=<<___; 563 mov $step,#16 564 b.lo .Lecb_done 565 cclr $step,eq 566 567 cmp $enc,#0 // en- or decrypting? 568 ldr $rounds,[$key,#240] 569 and $len,$len,#-16 570 vld1.8 {$dat},[$inp],$step 571 572 vld1.32 {q8-q9},[$key] // load key schedule... 573 sub $rounds,$rounds,#6 574 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 575 sub $rounds,$rounds,#2 576 vld1.32 {q10-q11},[$key_],#32 577 vld1.32 {q12-q13},[$key_],#32 578 vld1.32 {q14-q15},[$key_],#32 579 vld1.32 {$rndlast},[$key_] 580 581 add $key_,$key,#32 582 mov $cnt,$rounds 583 b.eq .Lecb_dec 584 585 vld1.8 {$dat1},[$inp],#16 586 subs $len,$len,#32 // bias 587 add $cnt,$rounds,#2 588 vorr $in1,$dat1,$dat1 589 vorr $dat2,$dat1,$dat1 590 vorr $dat1,$dat,$dat 591 b.lo .Lecb_enc_tail 592 593 vorr $dat1,$in1,$in1 594 vld1.8 {$dat2},[$inp],#16 595___ 596$code.=<<___ if ($flavour =~ /64/); 597 cmp $len,#32 598 b.lo .Loop3x_ecb_enc 599 600 vld1.8 {$dat3},[$inp],#16 601 vld1.8 {$dat4},[$inp],#16 602 sub $len,$len,#32 // bias 603 mov $cnt,$rounds 604 605.Loop5x_ecb_enc: 606 aese $dat0,q8 607 aesmc $dat0,$dat0 608 aese $dat1,q8 609 aesmc $dat1,$dat1 610 aese $dat2,q8 611 aesmc $dat2,$dat2 612 aese $dat3,q8 613 aesmc $dat3,$dat3 614 aese $dat4,q8 615 aesmc $dat4,$dat4 616 vld1.32 {q8},[$key_],#16 617 subs $cnt,$cnt,#2 618 aese $dat0,q9 619 aesmc $dat0,$dat0 620 aese $dat1,q9 621 aesmc $dat1,$dat1 622 aese $dat2,q9 623 aesmc $dat2,$dat2 624 aese $dat3,q9 625 aesmc $dat3,$dat3 626 aese $dat4,q9 627 aesmc $dat4,$dat4 628 vld1.32 {q9},[$key_],#16 629 b.gt .Loop5x_ecb_enc 630 631 aese $dat0,q8 632 aesmc $dat0,$dat0 633 aese $dat1,q8 634 aesmc $dat1,$dat1 635 aese $dat2,q8 636 aesmc $dat2,$dat2 637 aese $dat3,q8 638 aesmc $dat3,$dat3 639 aese $dat4,q8 640 aesmc $dat4,$dat4 641 cmp $len,#0x40 // because .Lecb_enc_tail4x 642 sub $len,$len,#0x50 643 644 aese $dat0,q9 645 aesmc $dat0,$dat0 646 aese $dat1,q9 647 aesmc $dat1,$dat1 648 aese $dat2,q9 649 aesmc $dat2,$dat2 650 aese $dat3,q9 651 aesmc $dat3,$dat3 652 aese $dat4,q9 653 aesmc $dat4,$dat4 654 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo 655 mov $key_,$key 656 657 aese $dat0,q10 658 aesmc $dat0,$dat0 659 aese $dat1,q10 660 aesmc $dat1,$dat1 661 aese $dat2,q10 662 aesmc $dat2,$dat2 663 aese $dat3,q10 664 aesmc $dat3,$dat3 665 aese $dat4,q10 666 aesmc $dat4,$dat4 667 add $inp,$inp,x6 // $inp is adjusted in such way that 668 // at exit from the loop $dat1-$dat4 669 // are loaded with last "words" 670 add x6,$len,#0x60 // because .Lecb_enc_tail4x 671 672 aese $dat0,q11 673 aesmc $dat0,$dat0 674 aese $dat1,q11 675 aesmc $dat1,$dat1 676 aese $dat2,q11 677 aesmc $dat2,$dat2 678 aese $dat3,q11 679 aesmc $dat3,$dat3 680 aese $dat4,q11 681 aesmc $dat4,$dat4 682 683 aese $dat0,q12 684 aesmc $dat0,$dat0 685 aese $dat1,q12 686 aesmc $dat1,$dat1 687 aese $dat2,q12 688 aesmc $dat2,$dat2 689 aese $dat3,q12 690 aesmc $dat3,$dat3 691 aese $dat4,q12 692 aesmc $dat4,$dat4 693 694 aese $dat0,q13 695 aesmc $dat0,$dat0 696 aese $dat1,q13 697 aesmc $dat1,$dat1 698 aese $dat2,q13 699 aesmc $dat2,$dat2 700 aese $dat3,q13 701 aesmc $dat3,$dat3 702 aese $dat4,q13 703 aesmc $dat4,$dat4 704 705 aese $dat0,q14 706 aesmc $dat0,$dat0 707 aese $dat1,q14 708 aesmc $dat1,$dat1 709 aese $dat2,q14 710 aesmc $dat2,$dat2 711 aese $dat3,q14 712 aesmc $dat3,$dat3 713 aese $dat4,q14 714 aesmc $dat4,$dat4 715 716 aese $dat0,q15 717 vld1.8 {$in0},[$inp],#16 718 aese $dat1,q15 719 vld1.8 {$in1},[$inp],#16 720 aese $dat2,q15 721 vld1.8 {$in2},[$inp],#16 722 aese $dat3,q15 723 vld1.8 {$in3},[$inp],#16 724 aese $dat4,q15 725 vld1.8 {$in4},[$inp],#16 726 cbz x6,.Lecb_enc_tail4x 727 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 728 veor $tmp0,$rndlast,$dat0 729 vorr $dat0,$in0,$in0 730 veor $tmp1,$rndlast,$dat1 731 vorr $dat1,$in1,$in1 732 veor $tmp2,$rndlast,$dat2 733 vorr $dat2,$in2,$in2 734 veor $tmp3,$rndlast,$dat3 735 vorr $dat3,$in3,$in3 736 veor $tmp4,$rndlast,$dat4 737 vst1.8 {$tmp0},[$out],#16 738 vorr $dat4,$in4,$in4 739 vst1.8 {$tmp1},[$out],#16 740 mov $cnt,$rounds 741 vst1.8 {$tmp2},[$out],#16 742 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 743 vst1.8 {$tmp3},[$out],#16 744 vst1.8 {$tmp4},[$out],#16 745 b.hs .Loop5x_ecb_enc 746 747 add $len,$len,#0x50 748 cbz $len,.Lecb_done 749 750 add $cnt,$rounds,#2 751 subs $len,$len,#0x30 752 vorr $dat0,$in2,$in2 753 vorr $dat1,$in3,$in3 754 vorr $dat2,$in4,$in4 755 b.lo .Lecb_enc_tail 756 757 b .Loop3x_ecb_enc 758 759.align 4 760.Lecb_enc_tail4x: 761 veor $tmp1,$rndlast,$dat1 762 veor $tmp2,$rndlast,$dat2 763 veor $tmp3,$rndlast,$dat3 764 veor $tmp4,$rndlast,$dat4 765 vst1.8 {$tmp1},[$out],#16 766 vst1.8 {$tmp2},[$out],#16 767 vst1.8 {$tmp3},[$out],#16 768 vst1.8 {$tmp4},[$out],#16 769 770 b .Lecb_done 771.align 4 772___ 773$code.=<<___; 774.Loop3x_ecb_enc: 775 aese $dat0,q8 776 aesmc $dat0,$dat0 777 aese $dat1,q8 778 aesmc $dat1,$dat1 779 aese $dat2,q8 780 aesmc $dat2,$dat2 781 vld1.32 {q8},[$key_],#16 782 subs $cnt,$cnt,#2 783 aese $dat0,q9 784 aesmc $dat0,$dat0 785 aese $dat1,q9 786 aesmc $dat1,$dat1 787 aese $dat2,q9 788 aesmc $dat2,$dat2 789 vld1.32 {q9},[$key_],#16 790 b.gt .Loop3x_ecb_enc 791 792 aese $dat0,q8 793 aesmc $dat0,$dat0 794 aese $dat1,q8 795 aesmc $dat1,$dat1 796 aese $dat2,q8 797 aesmc $dat2,$dat2 798 subs $len,$len,#0x30 799 mov.lo x6,$len // x6, $cnt, is zero at this point 800 aese $dat0,q9 801 aesmc $dat0,$dat0 802 aese $dat1,q9 803 aesmc $dat1,$dat1 804 aese $dat2,q9 805 aesmc $dat2,$dat2 806 add $inp,$inp,x6 // $inp is adjusted in such way that 807 // at exit from the loop $dat1-$dat2 808 // are loaded with last "words" 809 mov $key_,$key 810 aese $dat0,q12 811 aesmc $dat0,$dat0 812 aese $dat1,q12 813 aesmc $dat1,$dat1 814 aese $dat2,q12 815 aesmc $dat2,$dat2 816 vld1.8 {$in0},[$inp],#16 817 aese $dat0,q13 818 aesmc $dat0,$dat0 819 aese $dat1,q13 820 aesmc $dat1,$dat1 821 aese $dat2,q13 822 aesmc $dat2,$dat2 823 vld1.8 {$in1},[$inp],#16 824 aese $dat0,q14 825 aesmc $dat0,$dat0 826 aese $dat1,q14 827 aesmc $dat1,$dat1 828 aese $dat2,q14 829 aesmc $dat2,$dat2 830 vld1.8 {$in2},[$inp],#16 831 aese $dat0,q15 832 aese $dat1,q15 833 aese $dat2,q15 834 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 835 add $cnt,$rounds,#2 836 veor $tmp0,$rndlast,$dat0 837 veor $tmp1,$rndlast,$dat1 838 veor $dat2,$dat2,$rndlast 839 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 840 vst1.8 {$tmp0},[$out],#16 841 vorr $dat0,$in0,$in0 842 vst1.8 {$tmp1},[$out],#16 843 vorr $dat1,$in1,$in1 844 vst1.8 {$dat2},[$out],#16 845 vorr $dat2,$in2,$in2 846 b.hs .Loop3x_ecb_enc 847 848 cmn $len,#0x30 849 b.eq .Lecb_done 850 nop 851 852.Lecb_enc_tail: 853 aese $dat1,q8 854 aesmc $dat1,$dat1 855 aese $dat2,q8 856 aesmc $dat2,$dat2 857 vld1.32 {q8},[$key_],#16 858 subs $cnt,$cnt,#2 859 aese $dat1,q9 860 aesmc $dat1,$dat1 861 aese $dat2,q9 862 aesmc $dat2,$dat2 863 vld1.32 {q9},[$key_],#16 864 b.gt .Lecb_enc_tail 865 866 aese $dat1,q8 867 aesmc $dat1,$dat1 868 aese $dat2,q8 869 aesmc $dat2,$dat2 870 aese $dat1,q9 871 aesmc $dat1,$dat1 872 aese $dat2,q9 873 aesmc $dat2,$dat2 874 aese $dat1,q12 875 aesmc $dat1,$dat1 876 aese $dat2,q12 877 aesmc $dat2,$dat2 878 cmn $len,#0x20 879 aese $dat1,q13 880 aesmc $dat1,$dat1 881 aese $dat2,q13 882 aesmc $dat2,$dat2 883 aese $dat1,q14 884 aesmc $dat1,$dat1 885 aese $dat2,q14 886 aesmc $dat2,$dat2 887 aese $dat1,q15 888 aese $dat2,q15 889 b.eq .Lecb_enc_one 890 veor $tmp1,$rndlast,$dat1 891 veor $tmp2,$rndlast,$dat2 892 vst1.8 {$tmp1},[$out],#16 893 vst1.8 {$tmp2},[$out],#16 894 b .Lecb_done 895 896.Lecb_enc_one: 897 veor $tmp1,$rndlast,$dat2 898 vst1.8 {$tmp1},[$out],#16 899 b .Lecb_done 900___ 901 902$code.=<<___; 903.align 5 904.Lecb_dec: 905 vld1.8 {$dat1},[$inp],#16 906 subs $len,$len,#32 // bias 907 add $cnt,$rounds,#2 908 vorr $in1,$dat1,$dat1 909 vorr $dat2,$dat1,$dat1 910 vorr $dat1,$dat,$dat 911 b.lo .Lecb_dec_tail 912 913 vorr $dat1,$in1,$in1 914 vld1.8 {$dat2},[$inp],#16 915___ 916$code.=<<___ if ($flavour =~ /64/); 917 cmp $len,#32 918 b.lo .Loop3x_ecb_dec 919 920 vld1.8 {$dat3},[$inp],#16 921 vld1.8 {$dat4},[$inp],#16 922 sub $len,$len,#32 // bias 923 mov $cnt,$rounds 924 925.Loop5x_ecb_dec: 926 aesd $dat0,q8 927 aesimc $dat0,$dat0 928 aesd $dat1,q8 929 aesimc $dat1,$dat1 930 aesd $dat2,q8 931 aesimc $dat2,$dat2 932 aesd $dat3,q8 933 aesimc $dat3,$dat3 934 aesd $dat4,q8 935 aesimc $dat4,$dat4 936 vld1.32 {q8},[$key_],#16 937 subs $cnt,$cnt,#2 938 aesd $dat0,q9 939 aesimc $dat0,$dat0 940 aesd $dat1,q9 941 aesimc $dat1,$dat1 942 aesd $dat2,q9 943 aesimc $dat2,$dat2 944 aesd $dat3,q9 945 aesimc $dat3,$dat3 946 aesd $dat4,q9 947 aesimc $dat4,$dat4 948 vld1.32 {q9},[$key_],#16 949 b.gt .Loop5x_ecb_dec 950 951 aesd $dat0,q8 952 aesimc $dat0,$dat0 953 aesd $dat1,q8 954 aesimc $dat1,$dat1 955 aesd $dat2,q8 956 aesimc $dat2,$dat2 957 aesd $dat3,q8 958 aesimc $dat3,$dat3 959 aesd $dat4,q8 960 aesimc $dat4,$dat4 961 cmp $len,#0x40 // because .Lecb_tail4x 962 sub $len,$len,#0x50 963 964 aesd $dat0,q9 965 aesimc $dat0,$dat0 966 aesd $dat1,q9 967 aesimc $dat1,$dat1 968 aesd $dat2,q9 969 aesimc $dat2,$dat2 970 aesd $dat3,q9 971 aesimc $dat3,$dat3 972 aesd $dat4,q9 973 aesimc $dat4,$dat4 974 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo 975 mov $key_,$key 976 977 aesd $dat0,q10 978 aesimc $dat0,$dat0 979 aesd $dat1,q10 980 aesimc $dat1,$dat1 981 aesd $dat2,q10 982 aesimc $dat2,$dat2 983 aesd $dat3,q10 984 aesimc $dat3,$dat3 985 aesd $dat4,q10 986 aesimc $dat4,$dat4 987 add $inp,$inp,x6 // $inp is adjusted in such way that 988 // at exit from the loop $dat1-$dat4 989 // are loaded with last "words" 990 add x6,$len,#0x60 // because .Lecb_tail4x 991 992 aesd $dat0,q11 993 aesimc $dat0,$dat0 994 aesd $dat1,q11 995 aesimc $dat1,$dat1 996 aesd $dat2,q11 997 aesimc $dat2,$dat2 998 aesd $dat3,q11 999 aesimc $dat3,$dat3 1000 aesd $dat4,q11 1001 aesimc $dat4,$dat4 1002 1003 aesd $dat0,q12 1004 aesimc $dat0,$dat0 1005 aesd $dat1,q12 1006 aesimc $dat1,$dat1 1007 aesd $dat2,q12 1008 aesimc $dat2,$dat2 1009 aesd $dat3,q12 1010 aesimc $dat3,$dat3 1011 aesd $dat4,q12 1012 aesimc $dat4,$dat4 1013 1014 aesd $dat0,q13 1015 aesimc $dat0,$dat0 1016 aesd $dat1,q13 1017 aesimc $dat1,$dat1 1018 aesd $dat2,q13 1019 aesimc $dat2,$dat2 1020 aesd $dat3,q13 1021 aesimc $dat3,$dat3 1022 aesd $dat4,q13 1023 aesimc $dat4,$dat4 1024 1025 aesd $dat0,q14 1026 aesimc $dat0,$dat0 1027 aesd $dat1,q14 1028 aesimc $dat1,$dat1 1029 aesd $dat2,q14 1030 aesimc $dat2,$dat2 1031 aesd $dat3,q14 1032 aesimc $dat3,$dat3 1033 aesd $dat4,q14 1034 aesimc $dat4,$dat4 1035 1036 aesd $dat0,q15 1037 vld1.8 {$in0},[$inp],#16 1038 aesd $dat1,q15 1039 vld1.8 {$in1},[$inp],#16 1040 aesd $dat2,q15 1041 vld1.8 {$in2},[$inp],#16 1042 aesd $dat3,q15 1043 vld1.8 {$in3},[$inp],#16 1044 aesd $dat4,q15 1045 vld1.8 {$in4},[$inp],#16 1046 cbz x6,.Lecb_tail4x 1047 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1048 veor $tmp0,$rndlast,$dat0 1049 vorr $dat0,$in0,$in0 1050 veor $tmp1,$rndlast,$dat1 1051 vorr $dat1,$in1,$in1 1052 veor $tmp2,$rndlast,$dat2 1053 vorr $dat2,$in2,$in2 1054 veor $tmp3,$rndlast,$dat3 1055 vorr $dat3,$in3,$in3 1056 veor $tmp4,$rndlast,$dat4 1057 vst1.8 {$tmp0},[$out],#16 1058 vorr $dat4,$in4,$in4 1059 vst1.8 {$tmp1},[$out],#16 1060 mov $cnt,$rounds 1061 vst1.8 {$tmp2},[$out],#16 1062 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1063 vst1.8 {$tmp3},[$out],#16 1064 vst1.8 {$tmp4},[$out],#16 1065 b.hs .Loop5x_ecb_dec 1066 1067 add $len,$len,#0x50 1068 cbz $len,.Lecb_done 1069 1070 add $cnt,$rounds,#2 1071 subs $len,$len,#0x30 1072 vorr $dat0,$in2,$in2 1073 vorr $dat1,$in3,$in3 1074 vorr $dat2,$in4,$in4 1075 b.lo .Lecb_dec_tail 1076 1077 b .Loop3x_ecb_dec 1078 1079.align 4 1080.Lecb_tail4x: 1081 veor $tmp1,$rndlast,$dat1 1082 veor $tmp2,$rndlast,$dat2 1083 veor $tmp3,$rndlast,$dat3 1084 veor $tmp4,$rndlast,$dat4 1085 vst1.8 {$tmp1},[$out],#16 1086 vst1.8 {$tmp2},[$out],#16 1087 vst1.8 {$tmp3},[$out],#16 1088 vst1.8 {$tmp4},[$out],#16 1089 1090 b .Lecb_done 1091.align 4 1092___ 1093$code.=<<___; 1094.Loop3x_ecb_dec: 1095 aesd $dat0,q8 1096 aesimc $dat0,$dat0 1097 aesd $dat1,q8 1098 aesimc $dat1,$dat1 1099 aesd $dat2,q8 1100 aesimc $dat2,$dat2 1101 vld1.32 {q8},[$key_],#16 1102 subs $cnt,$cnt,#2 1103 aesd $dat0,q9 1104 aesimc $dat0,$dat0 1105 aesd $dat1,q9 1106 aesimc $dat1,$dat1 1107 aesd $dat2,q9 1108 aesimc $dat2,$dat2 1109 vld1.32 {q9},[$key_],#16 1110 b.gt .Loop3x_ecb_dec 1111 1112 aesd $dat0,q8 1113 aesimc $dat0,$dat0 1114 aesd $dat1,q8 1115 aesimc $dat1,$dat1 1116 aesd $dat2,q8 1117 aesimc $dat2,$dat2 1118 subs $len,$len,#0x30 1119 mov.lo x6,$len // x6, $cnt, is zero at this point 1120 aesd $dat0,q9 1121 aesimc $dat0,$dat0 1122 aesd $dat1,q9 1123 aesimc $dat1,$dat1 1124 aesd $dat2,q9 1125 aesimc $dat2,$dat2 1126 add $inp,$inp,x6 // $inp is adjusted in such way that 1127 // at exit from the loop $dat1-$dat2 1128 // are loaded with last "words" 1129 mov $key_,$key 1130 aesd $dat0,q12 1131 aesimc $dat0,$dat0 1132 aesd $dat1,q12 1133 aesimc $dat1,$dat1 1134 aesd $dat2,q12 1135 aesimc $dat2,$dat2 1136 vld1.8 {$in0},[$inp],#16 1137 aesd $dat0,q13 1138 aesimc $dat0,$dat0 1139 aesd $dat1,q13 1140 aesimc $dat1,$dat1 1141 aesd $dat2,q13 1142 aesimc $dat2,$dat2 1143 vld1.8 {$in1},[$inp],#16 1144 aesd $dat0,q14 1145 aesimc $dat0,$dat0 1146 aesd $dat1,q14 1147 aesimc $dat1,$dat1 1148 aesd $dat2,q14 1149 aesimc $dat2,$dat2 1150 vld1.8 {$in2},[$inp],#16 1151 aesd $dat0,q15 1152 aesd $dat1,q15 1153 aesd $dat2,q15 1154 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1155 add $cnt,$rounds,#2 1156 veor $tmp0,$rndlast,$dat0 1157 veor $tmp1,$rndlast,$dat1 1158 veor $dat2,$dat2,$rndlast 1159 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1160 vst1.8 {$tmp0},[$out],#16 1161 vorr $dat0,$in0,$in0 1162 vst1.8 {$tmp1},[$out],#16 1163 vorr $dat1,$in1,$in1 1164 vst1.8 {$dat2},[$out],#16 1165 vorr $dat2,$in2,$in2 1166 b.hs .Loop3x_ecb_dec 1167 1168 cmn $len,#0x30 1169 b.eq .Lecb_done 1170 nop 1171 1172.Lecb_dec_tail: 1173 aesd $dat1,q8 1174 aesimc $dat1,$dat1 1175 aesd $dat2,q8 1176 aesimc $dat2,$dat2 1177 vld1.32 {q8},[$key_],#16 1178 subs $cnt,$cnt,#2 1179 aesd $dat1,q9 1180 aesimc $dat1,$dat1 1181 aesd $dat2,q9 1182 aesimc $dat2,$dat2 1183 vld1.32 {q9},[$key_],#16 1184 b.gt .Lecb_dec_tail 1185 1186 aesd $dat1,q8 1187 aesimc $dat1,$dat1 1188 aesd $dat2,q8 1189 aesimc $dat2,$dat2 1190 aesd $dat1,q9 1191 aesimc $dat1,$dat1 1192 aesd $dat2,q9 1193 aesimc $dat2,$dat2 1194 aesd $dat1,q12 1195 aesimc $dat1,$dat1 1196 aesd $dat2,q12 1197 aesimc $dat2,$dat2 1198 cmn $len,#0x20 1199 aesd $dat1,q13 1200 aesimc $dat1,$dat1 1201 aesd $dat2,q13 1202 aesimc $dat2,$dat2 1203 aesd $dat1,q14 1204 aesimc $dat1,$dat1 1205 aesd $dat2,q14 1206 aesimc $dat2,$dat2 1207 aesd $dat1,q15 1208 aesd $dat2,q15 1209 b.eq .Lecb_dec_one 1210 veor $tmp1,$rndlast,$dat1 1211 veor $tmp2,$rndlast,$dat2 1212 vst1.8 {$tmp1},[$out],#16 1213 vst1.8 {$tmp2},[$out],#16 1214 b .Lecb_done 1215 1216.Lecb_dec_one: 1217 veor $tmp1,$rndlast,$dat2 1218 vst1.8 {$tmp1},[$out],#16 1219 1220.Lecb_done: 1221___ 1222} 1223$code.=<<___ if ($flavour !~ /64/); 1224 vldmia sp!,{d8-d15} 1225 ldmia sp!,{r4-r8,pc} 1226___ 1227$code.=<<___ if ($flavour =~ /64/); 1228 ldr x29,[sp],#16 1229___ 1230$code.=<<___ if ($flavour =~ /64/); 1231.Lecb_Final_abort: 1232 ret 1233___ 1234$code.=<<___; 1235.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt 1236___ 1237}}} 1238{{{ 1239my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; 1240my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); 1241my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 1242 1243my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 1244my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); 1245 1246### q8-q15 preloaded key schedule 1247 1248$code.=<<___; 1249.globl ${prefix}_cbc_encrypt 1250.type ${prefix}_cbc_encrypt,%function 1251.align 5 1252${prefix}_cbc_encrypt: 1253___ 1254$code.=<<___ if ($flavour =~ /64/); 1255 stp x29,x30,[sp,#-16]! 1256 add x29,sp,#0 1257___ 1258$code.=<<___ if ($flavour !~ /64/); 1259 mov ip,sp 1260 stmdb sp!,{r4-r8,lr} 1261 vstmdb sp!,{d8-d15} @ ABI specification says so 1262 ldmia ip,{r4-r5} @ load remaining args 1263___ 1264$code.=<<___; 1265 subs $len,$len,#16 1266 mov $step,#16 1267 b.lo .Lcbc_abort 1268 cclr $step,eq 1269 1270 cmp $enc,#0 // en- or decrypting? 1271 ldr $rounds,[$key,#240] 1272 and $len,$len,#-16 1273 vld1.8 {$ivec},[$ivp] 1274 vld1.8 {$dat},[$inp],$step 1275 1276 vld1.32 {q8-q9},[$key] // load key schedule... 1277 sub $rounds,$rounds,#6 1278 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 1279 sub $rounds,$rounds,#2 1280 vld1.32 {q10-q11},[$key_],#32 1281 vld1.32 {q12-q13},[$key_],#32 1282 vld1.32 {q14-q15},[$key_],#32 1283 vld1.32 {$rndlast},[$key_] 1284 1285 add $key_,$key,#32 1286 mov $cnt,$rounds 1287 b.eq .Lcbc_dec 1288 1289 cmp $rounds,#2 1290 veor $dat,$dat,$ivec 1291 veor $rndzero_n_last,q8,$rndlast 1292 b.eq .Lcbc_enc128 1293 1294 vld1.32 {$in0-$in1},[$key_] 1295 add $key_,$key,#16 1296 add $key4,$key,#16*4 1297 add $key5,$key,#16*5 1298 aese $dat,q8 1299 aesmc $dat,$dat 1300 add $key6,$key,#16*6 1301 add $key7,$key,#16*7 1302 b .Lenter_cbc_enc 1303 1304.align 4 1305.Loop_cbc_enc: 1306 aese $dat,q8 1307 aesmc $dat,$dat 1308 vst1.8 {$ivec},[$out],#16 1309.Lenter_cbc_enc: 1310 aese $dat,q9 1311 aesmc $dat,$dat 1312 aese $dat,$in0 1313 aesmc $dat,$dat 1314 vld1.32 {q8},[$key4] 1315 cmp $rounds,#4 1316 aese $dat,$in1 1317 aesmc $dat,$dat 1318 vld1.32 {q9},[$key5] 1319 b.eq .Lcbc_enc192 1320 1321 aese $dat,q8 1322 aesmc $dat,$dat 1323 vld1.32 {q8},[$key6] 1324 aese $dat,q9 1325 aesmc $dat,$dat 1326 vld1.32 {q9},[$key7] 1327 nop 1328 1329.Lcbc_enc192: 1330 aese $dat,q8 1331 aesmc $dat,$dat 1332 subs $len,$len,#16 1333 aese $dat,q9 1334 aesmc $dat,$dat 1335 cclr $step,eq 1336 aese $dat,q10 1337 aesmc $dat,$dat 1338 aese $dat,q11 1339 aesmc $dat,$dat 1340 vld1.8 {q8},[$inp],$step 1341 aese $dat,q12 1342 aesmc $dat,$dat 1343 veor q8,q8,$rndzero_n_last 1344 aese $dat,q13 1345 aesmc $dat,$dat 1346 vld1.32 {q9},[$key_] // re-pre-load rndkey[1] 1347 aese $dat,q14 1348 aesmc $dat,$dat 1349 aese $dat,q15 1350 veor $ivec,$dat,$rndlast 1351 b.hs .Loop_cbc_enc 1352 1353 vst1.8 {$ivec},[$out],#16 1354 b .Lcbc_done 1355 1356.align 5 1357.Lcbc_enc128: 1358 vld1.32 {$in0-$in1},[$key_] 1359 aese $dat,q8 1360 aesmc $dat,$dat 1361 b .Lenter_cbc_enc128 1362.Loop_cbc_enc128: 1363 aese $dat,q8 1364 aesmc $dat,$dat 1365 vst1.8 {$ivec},[$out],#16 1366.Lenter_cbc_enc128: 1367 aese $dat,q9 1368 aesmc $dat,$dat 1369 subs $len,$len,#16 1370 aese $dat,$in0 1371 aesmc $dat,$dat 1372 cclr $step,eq 1373 aese $dat,$in1 1374 aesmc $dat,$dat 1375 aese $dat,q10 1376 aesmc $dat,$dat 1377 aese $dat,q11 1378 aesmc $dat,$dat 1379 vld1.8 {q8},[$inp],$step 1380 aese $dat,q12 1381 aesmc $dat,$dat 1382 aese $dat,q13 1383 aesmc $dat,$dat 1384 aese $dat,q14 1385 aesmc $dat,$dat 1386 veor q8,q8,$rndzero_n_last 1387 aese $dat,q15 1388 veor $ivec,$dat,$rndlast 1389 b.hs .Loop_cbc_enc128 1390 1391 vst1.8 {$ivec},[$out],#16 1392 b .Lcbc_done 1393___ 1394{ 1395my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 1396 1397my ($dat3,$in3,$tmp3); # used only in 64-bit mode 1398my ($dat4,$in4,$tmp4); 1399if ($flavour =~ /64/) { 1400 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 1401} 1402 1403$code.=<<___; 1404.align 5 1405.Lcbc_dec: 1406 vld1.8 {$dat2},[$inp],#16 1407 subs $len,$len,#32 // bias 1408 add $cnt,$rounds,#2 1409 vorr $in1,$dat,$dat 1410 vorr $dat1,$dat,$dat 1411 vorr $in2,$dat2,$dat2 1412 b.lo .Lcbc_dec_tail 1413 1414 vorr $dat1,$dat2,$dat2 1415 vld1.8 {$dat2},[$inp],#16 1416 vorr $in0,$dat,$dat 1417 vorr $in1,$dat1,$dat1 1418 vorr $in2,$dat2,$dat2 1419___ 1420$code.=<<___ if ($flavour =~ /64/); 1421 cmp $len,#32 1422 b.lo .Loop3x_cbc_dec 1423 1424 vld1.8 {$dat3},[$inp],#16 1425 vld1.8 {$dat4},[$inp],#16 1426 sub $len,$len,#32 // bias 1427 mov $cnt,$rounds 1428 vorr $in3,$dat3,$dat3 1429 vorr $in4,$dat4,$dat4 1430 1431.Loop5x_cbc_dec: 1432 aesd $dat0,q8 1433 aesimc $dat0,$dat0 1434 aesd $dat1,q8 1435 aesimc $dat1,$dat1 1436 aesd $dat2,q8 1437 aesimc $dat2,$dat2 1438 aesd $dat3,q8 1439 aesimc $dat3,$dat3 1440 aesd $dat4,q8 1441 aesimc $dat4,$dat4 1442 vld1.32 {q8},[$key_],#16 1443 subs $cnt,$cnt,#2 1444 aesd $dat0,q9 1445 aesimc $dat0,$dat0 1446 aesd $dat1,q9 1447 aesimc $dat1,$dat1 1448 aesd $dat2,q9 1449 aesimc $dat2,$dat2 1450 aesd $dat3,q9 1451 aesimc $dat3,$dat3 1452 aesd $dat4,q9 1453 aesimc $dat4,$dat4 1454 vld1.32 {q9},[$key_],#16 1455 b.gt .Loop5x_cbc_dec 1456 1457 aesd $dat0,q8 1458 aesimc $dat0,$dat0 1459 aesd $dat1,q8 1460 aesimc $dat1,$dat1 1461 aesd $dat2,q8 1462 aesimc $dat2,$dat2 1463 aesd $dat3,q8 1464 aesimc $dat3,$dat3 1465 aesd $dat4,q8 1466 aesimc $dat4,$dat4 1467 cmp $len,#0x40 // because .Lcbc_tail4x 1468 sub $len,$len,#0x50 1469 1470 aesd $dat0,q9 1471 aesimc $dat0,$dat0 1472 aesd $dat1,q9 1473 aesimc $dat1,$dat1 1474 aesd $dat2,q9 1475 aesimc $dat2,$dat2 1476 aesd $dat3,q9 1477 aesimc $dat3,$dat3 1478 aesd $dat4,q9 1479 aesimc $dat4,$dat4 1480 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo 1481 mov $key_,$key 1482 1483 aesd $dat0,q10 1484 aesimc $dat0,$dat0 1485 aesd $dat1,q10 1486 aesimc $dat1,$dat1 1487 aesd $dat2,q10 1488 aesimc $dat2,$dat2 1489 aesd $dat3,q10 1490 aesimc $dat3,$dat3 1491 aesd $dat4,q10 1492 aesimc $dat4,$dat4 1493 add $inp,$inp,x6 // $inp is adjusted in such way that 1494 // at exit from the loop $dat1-$dat4 1495 // are loaded with last "words" 1496 add x6,$len,#0x60 // because .Lcbc_tail4x 1497 1498 aesd $dat0,q11 1499 aesimc $dat0,$dat0 1500 aesd $dat1,q11 1501 aesimc $dat1,$dat1 1502 aesd $dat2,q11 1503 aesimc $dat2,$dat2 1504 aesd $dat3,q11 1505 aesimc $dat3,$dat3 1506 aesd $dat4,q11 1507 aesimc $dat4,$dat4 1508 1509 aesd $dat0,q12 1510 aesimc $dat0,$dat0 1511 aesd $dat1,q12 1512 aesimc $dat1,$dat1 1513 aesd $dat2,q12 1514 aesimc $dat2,$dat2 1515 aesd $dat3,q12 1516 aesimc $dat3,$dat3 1517 aesd $dat4,q12 1518 aesimc $dat4,$dat4 1519 1520 aesd $dat0,q13 1521 aesimc $dat0,$dat0 1522 aesd $dat1,q13 1523 aesimc $dat1,$dat1 1524 aesd $dat2,q13 1525 aesimc $dat2,$dat2 1526 aesd $dat3,q13 1527 aesimc $dat3,$dat3 1528 aesd $dat4,q13 1529 aesimc $dat4,$dat4 1530 1531 aesd $dat0,q14 1532 aesimc $dat0,$dat0 1533 aesd $dat1,q14 1534 aesimc $dat1,$dat1 1535 aesd $dat2,q14 1536 aesimc $dat2,$dat2 1537 aesd $dat3,q14 1538 aesimc $dat3,$dat3 1539 aesd $dat4,q14 1540 aesimc $dat4,$dat4 1541 1542 veor $tmp0,$ivec,$rndlast 1543 aesd $dat0,q15 1544 veor $tmp1,$in0,$rndlast 1545 vld1.8 {$in0},[$inp],#16 1546 aesd $dat1,q15 1547 veor $tmp2,$in1,$rndlast 1548 vld1.8 {$in1},[$inp],#16 1549 aesd $dat2,q15 1550 veor $tmp3,$in2,$rndlast 1551 vld1.8 {$in2},[$inp],#16 1552 aesd $dat3,q15 1553 veor $tmp4,$in3,$rndlast 1554 vld1.8 {$in3},[$inp],#16 1555 aesd $dat4,q15 1556 vorr $ivec,$in4,$in4 1557 vld1.8 {$in4},[$inp],#16 1558 cbz x6,.Lcbc_tail4x 1559 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1560 veor $tmp0,$tmp0,$dat0 1561 vorr $dat0,$in0,$in0 1562 veor $tmp1,$tmp1,$dat1 1563 vorr $dat1,$in1,$in1 1564 veor $tmp2,$tmp2,$dat2 1565 vorr $dat2,$in2,$in2 1566 veor $tmp3,$tmp3,$dat3 1567 vorr $dat3,$in3,$in3 1568 veor $tmp4,$tmp4,$dat4 1569 vst1.8 {$tmp0},[$out],#16 1570 vorr $dat4,$in4,$in4 1571 vst1.8 {$tmp1},[$out],#16 1572 mov $cnt,$rounds 1573 vst1.8 {$tmp2},[$out],#16 1574 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1575 vst1.8 {$tmp3},[$out],#16 1576 vst1.8 {$tmp4},[$out],#16 1577 b.hs .Loop5x_cbc_dec 1578 1579 add $len,$len,#0x50 1580 cbz $len,.Lcbc_done 1581 1582 add $cnt,$rounds,#2 1583 subs $len,$len,#0x30 1584 vorr $dat0,$in2,$in2 1585 vorr $in0,$in2,$in2 1586 vorr $dat1,$in3,$in3 1587 vorr $in1,$in3,$in3 1588 vorr $dat2,$in4,$in4 1589 vorr $in2,$in4,$in4 1590 b.lo .Lcbc_dec_tail 1591 1592 b .Loop3x_cbc_dec 1593 1594.align 4 1595.Lcbc_tail4x: 1596 veor $tmp1,$tmp0,$dat1 1597 veor $tmp2,$tmp2,$dat2 1598 veor $tmp3,$tmp3,$dat3 1599 veor $tmp4,$tmp4,$dat4 1600 vst1.8 {$tmp1},[$out],#16 1601 vst1.8 {$tmp2},[$out],#16 1602 vst1.8 {$tmp3},[$out],#16 1603 vst1.8 {$tmp4},[$out],#16 1604 1605 b .Lcbc_done 1606.align 4 1607___ 1608$code.=<<___; 1609.Loop3x_cbc_dec: 1610 aesd $dat0,q8 1611 aesimc $dat0,$dat0 1612 aesd $dat1,q8 1613 aesimc $dat1,$dat1 1614 aesd $dat2,q8 1615 aesimc $dat2,$dat2 1616 vld1.32 {q8},[$key_],#16 1617 subs $cnt,$cnt,#2 1618 aesd $dat0,q9 1619 aesimc $dat0,$dat0 1620 aesd $dat1,q9 1621 aesimc $dat1,$dat1 1622 aesd $dat2,q9 1623 aesimc $dat2,$dat2 1624 vld1.32 {q9},[$key_],#16 1625 b.gt .Loop3x_cbc_dec 1626 1627 aesd $dat0,q8 1628 aesimc $dat0,$dat0 1629 aesd $dat1,q8 1630 aesimc $dat1,$dat1 1631 aesd $dat2,q8 1632 aesimc $dat2,$dat2 1633 veor $tmp0,$ivec,$rndlast 1634 subs $len,$len,#0x30 1635 veor $tmp1,$in0,$rndlast 1636 mov.lo x6,$len // x6, $cnt, is zero at this point 1637 aesd $dat0,q9 1638 aesimc $dat0,$dat0 1639 aesd $dat1,q9 1640 aesimc $dat1,$dat1 1641 aesd $dat2,q9 1642 aesimc $dat2,$dat2 1643 veor $tmp2,$in1,$rndlast 1644 add $inp,$inp,x6 // $inp is adjusted in such way that 1645 // at exit from the loop $dat1-$dat2 1646 // are loaded with last "words" 1647 vorr $ivec,$in2,$in2 1648 mov $key_,$key 1649 aesd $dat0,q12 1650 aesimc $dat0,$dat0 1651 aesd $dat1,q12 1652 aesimc $dat1,$dat1 1653 aesd $dat2,q12 1654 aesimc $dat2,$dat2 1655 vld1.8 {$in0},[$inp],#16 1656 aesd $dat0,q13 1657 aesimc $dat0,$dat0 1658 aesd $dat1,q13 1659 aesimc $dat1,$dat1 1660 aesd $dat2,q13 1661 aesimc $dat2,$dat2 1662 vld1.8 {$in1},[$inp],#16 1663 aesd $dat0,q14 1664 aesimc $dat0,$dat0 1665 aesd $dat1,q14 1666 aesimc $dat1,$dat1 1667 aesd $dat2,q14 1668 aesimc $dat2,$dat2 1669 vld1.8 {$in2},[$inp],#16 1670 aesd $dat0,q15 1671 aesd $dat1,q15 1672 aesd $dat2,q15 1673 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1674 add $cnt,$rounds,#2 1675 veor $tmp0,$tmp0,$dat0 1676 veor $tmp1,$tmp1,$dat1 1677 veor $dat2,$dat2,$tmp2 1678 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1679 vst1.8 {$tmp0},[$out],#16 1680 vorr $dat0,$in0,$in0 1681 vst1.8 {$tmp1},[$out],#16 1682 vorr $dat1,$in1,$in1 1683 vst1.8 {$dat2},[$out],#16 1684 vorr $dat2,$in2,$in2 1685 b.hs .Loop3x_cbc_dec 1686 1687 cmn $len,#0x30 1688 b.eq .Lcbc_done 1689 nop 1690 1691.Lcbc_dec_tail: 1692 aesd $dat1,q8 1693 aesimc $dat1,$dat1 1694 aesd $dat2,q8 1695 aesimc $dat2,$dat2 1696 vld1.32 {q8},[$key_],#16 1697 subs $cnt,$cnt,#2 1698 aesd $dat1,q9 1699 aesimc $dat1,$dat1 1700 aesd $dat2,q9 1701 aesimc $dat2,$dat2 1702 vld1.32 {q9},[$key_],#16 1703 b.gt .Lcbc_dec_tail 1704 1705 aesd $dat1,q8 1706 aesimc $dat1,$dat1 1707 aesd $dat2,q8 1708 aesimc $dat2,$dat2 1709 aesd $dat1,q9 1710 aesimc $dat1,$dat1 1711 aesd $dat2,q9 1712 aesimc $dat2,$dat2 1713 aesd $dat1,q12 1714 aesimc $dat1,$dat1 1715 aesd $dat2,q12 1716 aesimc $dat2,$dat2 1717 cmn $len,#0x20 1718 aesd $dat1,q13 1719 aesimc $dat1,$dat1 1720 aesd $dat2,q13 1721 aesimc $dat2,$dat2 1722 veor $tmp1,$ivec,$rndlast 1723 aesd $dat1,q14 1724 aesimc $dat1,$dat1 1725 aesd $dat2,q14 1726 aesimc $dat2,$dat2 1727 veor $tmp2,$in1,$rndlast 1728 aesd $dat1,q15 1729 aesd $dat2,q15 1730 b.eq .Lcbc_dec_one 1731 veor $tmp1,$tmp1,$dat1 1732 veor $tmp2,$tmp2,$dat2 1733 vorr $ivec,$in2,$in2 1734 vst1.8 {$tmp1},[$out],#16 1735 vst1.8 {$tmp2},[$out],#16 1736 b .Lcbc_done 1737 1738.Lcbc_dec_one: 1739 veor $tmp1,$tmp1,$dat2 1740 vorr $ivec,$in2,$in2 1741 vst1.8 {$tmp1},[$out],#16 1742 1743.Lcbc_done: 1744 vst1.8 {$ivec},[$ivp] 1745.Lcbc_abort: 1746___ 1747} 1748$code.=<<___ if ($flavour !~ /64/); 1749 vldmia sp!,{d8-d15} 1750 ldmia sp!,{r4-r8,pc} 1751___ 1752$code.=<<___ if ($flavour =~ /64/); 1753 ldr x29,[sp],#16 1754 ret 1755___ 1756$code.=<<___; 1757.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 1758___ 1759}}} 1760{{{ 1761my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 1762my ($rounds,$cnt,$key_)=("w5","w6","x7"); 1763my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 1764my $step="x12"; # aliases with $tctr2 1765 1766my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 1767my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 1768 1769# used only in 64-bit mode... 1770my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); 1771 1772my ($dat,$tmp)=($dat0,$tmp0); 1773 1774### q8-q15 preloaded key schedule 1775 1776$code.=<<___; 1777.globl ${prefix}_ctr32_encrypt_blocks 1778.type ${prefix}_ctr32_encrypt_blocks,%function 1779.align 5 1780${prefix}_ctr32_encrypt_blocks: 1781___ 1782$code.=<<___ if ($flavour =~ /64/); 1783 stp x29,x30,[sp,#-16]! 1784 add x29,sp,#0 1785___ 1786$code.=<<___ if ($flavour !~ /64/); 1787 mov ip,sp 1788 stmdb sp!,{r4-r10,lr} 1789 vstmdb sp!,{d8-d15} @ ABI specification says so 1790 ldr r4, [ip] @ load remaining arg 1791___ 1792$code.=<<___; 1793 ldr $rounds,[$key,#240] 1794 1795 ldr $ctr, [$ivp, #12] 1796#ifdef __ARMEB__ 1797 vld1.8 {$dat0},[$ivp] 1798#else 1799 vld1.32 {$dat0},[$ivp] 1800#endif 1801 vld1.32 {q8-q9},[$key] // load key schedule... 1802 sub $rounds,$rounds,#4 1803 mov $step,#16 1804 cmp $len,#2 1805 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 1806 sub $rounds,$rounds,#2 1807 vld1.32 {q12-q13},[$key_],#32 1808 vld1.32 {q14-q15},[$key_],#32 1809 vld1.32 {$rndlast},[$key_] 1810 add $key_,$key,#32 1811 mov $cnt,$rounds 1812 cclr $step,lo 1813#ifndef __ARMEB__ 1814 rev $ctr, $ctr 1815#endif 1816___ 1817$code.=<<___ if ($flavour =~ /64/); 1818 vorr $dat1,$dat0,$dat0 1819 add $tctr1, $ctr, #1 1820 vorr $dat2,$dat0,$dat0 1821 add $ctr, $ctr, #2 1822 vorr $ivec,$dat0,$dat0 1823 rev $tctr1, $tctr1 1824 vmov.32 ${dat1}[3],$tctr1 1825 b.ls .Lctr32_tail 1826 rev $tctr2, $ctr 1827 sub $len,$len,#3 // bias 1828 vmov.32 ${dat2}[3],$tctr2 1829___ 1830$code.=<<___ if ($flavour !~ /64/); 1831 add $tctr1, $ctr, #1 1832 vorr $ivec,$dat0,$dat0 1833 rev $tctr1, $tctr1 1834 vmov.32 ${ivec}[3],$tctr1 1835 add $ctr, $ctr, #2 1836 vorr $dat1,$ivec,$ivec 1837 b.ls .Lctr32_tail 1838 rev $tctr2, $ctr 1839 vmov.32 ${ivec}[3],$tctr2 1840 sub $len,$len,#3 // bias 1841 vorr $dat2,$ivec,$ivec 1842___ 1843$code.=<<___ if ($flavour =~ /64/); 1844 cmp $len,#32 1845 b.lo .Loop3x_ctr32 1846 1847 add w13,$ctr,#1 1848 add w14,$ctr,#2 1849 vorr $dat3,$dat0,$dat0 1850 rev w13,w13 1851 vorr $dat4,$dat0,$dat0 1852 rev w14,w14 1853 vmov.32 ${dat3}[3],w13 1854 sub $len,$len,#2 // bias 1855 vmov.32 ${dat4}[3],w14 1856 add $ctr,$ctr,#2 1857 b .Loop5x_ctr32 1858 1859.align 4 1860.Loop5x_ctr32: 1861 aese $dat0,q8 1862 aesmc $dat0,$dat0 1863 aese $dat1,q8 1864 aesmc $dat1,$dat1 1865 aese $dat2,q8 1866 aesmc $dat2,$dat2 1867 aese $dat3,q8 1868 aesmc $dat3,$dat3 1869 aese $dat4,q8 1870 aesmc $dat4,$dat4 1871 vld1.32 {q8},[$key_],#16 1872 subs $cnt,$cnt,#2 1873 aese $dat0,q9 1874 aesmc $dat0,$dat0 1875 aese $dat1,q9 1876 aesmc $dat1,$dat1 1877 aese $dat2,q9 1878 aesmc $dat2,$dat2 1879 aese $dat3,q9 1880 aesmc $dat3,$dat3 1881 aese $dat4,q9 1882 aesmc $dat4,$dat4 1883 vld1.32 {q9},[$key_],#16 1884 b.gt .Loop5x_ctr32 1885 1886 mov $key_,$key 1887 aese $dat0,q8 1888 aesmc $dat0,$dat0 1889 aese $dat1,q8 1890 aesmc $dat1,$dat1 1891 aese $dat2,q8 1892 aesmc $dat2,$dat2 1893 aese $dat3,q8 1894 aesmc $dat3,$dat3 1895 aese $dat4,q8 1896 aesmc $dat4,$dat4 1897 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1898 1899 aese $dat0,q9 1900 aesmc $dat0,$dat0 1901 aese $dat1,q9 1902 aesmc $dat1,$dat1 1903 aese $dat2,q9 1904 aesmc $dat2,$dat2 1905 aese $dat3,q9 1906 aesmc $dat3,$dat3 1907 aese $dat4,q9 1908 aesmc $dat4,$dat4 1909 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1910 1911 aese $dat0,q12 1912 aesmc $dat0,$dat0 1913 add $tctr0,$ctr,#1 1914 add $tctr1,$ctr,#2 1915 aese $dat1,q12 1916 aesmc $dat1,$dat1 1917 add $tctr2,$ctr,#3 1918 add w13,$ctr,#4 1919 aese $dat2,q12 1920 aesmc $dat2,$dat2 1921 add w14,$ctr,#5 1922 rev $tctr0,$tctr0 1923 aese $dat3,q12 1924 aesmc $dat3,$dat3 1925 rev $tctr1,$tctr1 1926 rev $tctr2,$tctr2 1927 aese $dat4,q12 1928 aesmc $dat4,$dat4 1929 rev w13,w13 1930 rev w14,w14 1931 1932 aese $dat0,q13 1933 aesmc $dat0,$dat0 1934 aese $dat1,q13 1935 aesmc $dat1,$dat1 1936 aese $dat2,q13 1937 aesmc $dat2,$dat2 1938 aese $dat3,q13 1939 aesmc $dat3,$dat3 1940 aese $dat4,q13 1941 aesmc $dat4,$dat4 1942 1943 aese $dat0,q14 1944 aesmc $dat0,$dat0 1945 vld1.8 {$in0},[$inp],#16 1946 aese $dat1,q14 1947 aesmc $dat1,$dat1 1948 vld1.8 {$in1},[$inp],#16 1949 aese $dat2,q14 1950 aesmc $dat2,$dat2 1951 vld1.8 {$in2},[$inp],#16 1952 aese $dat3,q14 1953 aesmc $dat3,$dat3 1954 vld1.8 {$in3},[$inp],#16 1955 aese $dat4,q14 1956 aesmc $dat4,$dat4 1957 vld1.8 {$in4},[$inp],#16 1958 1959 aese $dat0,q15 1960 veor $in0,$in0,$rndlast 1961 aese $dat1,q15 1962 veor $in1,$in1,$rndlast 1963 aese $dat2,q15 1964 veor $in2,$in2,$rndlast 1965 aese $dat3,q15 1966 veor $in3,$in3,$rndlast 1967 aese $dat4,q15 1968 veor $in4,$in4,$rndlast 1969 1970 veor $in0,$in0,$dat0 1971 vorr $dat0,$ivec,$ivec 1972 veor $in1,$in1,$dat1 1973 vorr $dat1,$ivec,$ivec 1974 veor $in2,$in2,$dat2 1975 vorr $dat2,$ivec,$ivec 1976 veor $in3,$in3,$dat3 1977 vorr $dat3,$ivec,$ivec 1978 veor $in4,$in4,$dat4 1979 vorr $dat4,$ivec,$ivec 1980 1981 vst1.8 {$in0},[$out],#16 1982 vmov.32 ${dat0}[3],$tctr0 1983 vst1.8 {$in1},[$out],#16 1984 vmov.32 ${dat1}[3],$tctr1 1985 vst1.8 {$in2},[$out],#16 1986 vmov.32 ${dat2}[3],$tctr2 1987 vst1.8 {$in3},[$out],#16 1988 vmov.32 ${dat3}[3],w13 1989 vst1.8 {$in4},[$out],#16 1990 vmov.32 ${dat4}[3],w14 1991 1992 mov $cnt,$rounds 1993 cbz $len,.Lctr32_done 1994 1995 add $ctr,$ctr,#5 1996 subs $len,$len,#5 1997 b.hs .Loop5x_ctr32 1998 1999 add $len,$len,#5 2000 sub $ctr,$ctr,#5 2001 2002 cmp $len,#2 2003 mov $step,#16 2004 cclr $step,lo 2005 b.ls .Lctr32_tail 2006 2007 sub $len,$len,#3 // bias 2008 add $ctr,$ctr,#3 2009___ 2010$code.=<<___; 2011 b .Loop3x_ctr32 2012 2013.align 4 2014.Loop3x_ctr32: 2015 aese $dat0,q8 2016 aesmc $dat0,$dat0 2017 aese $dat1,q8 2018 aesmc $dat1,$dat1 2019 aese $dat2,q8 2020 aesmc $dat2,$dat2 2021 vld1.32 {q8},[$key_],#16 2022 subs $cnt,$cnt,#2 2023 aese $dat0,q9 2024 aesmc $dat0,$dat0 2025 aese $dat1,q9 2026 aesmc $dat1,$dat1 2027 aese $dat2,q9 2028 aesmc $dat2,$dat2 2029 vld1.32 {q9},[$key_],#16 2030 b.gt .Loop3x_ctr32 2031 2032 aese $dat0,q8 2033 aesmc $tmp0,$dat0 2034 aese $dat1,q8 2035 aesmc $tmp1,$dat1 2036 vld1.8 {$in0},[$inp],#16 2037___ 2038$code.=<<___ if ($flavour =~ /64/); 2039 vorr $dat0,$ivec,$ivec 2040___ 2041$code.=<<___ if ($flavour !~ /64/); 2042 add $tctr0,$ctr,#1 2043___ 2044$code.=<<___; 2045 aese $dat2,q8 2046 aesmc $dat2,$dat2 2047 vld1.8 {$in1},[$inp],#16 2048___ 2049$code.=<<___ if ($flavour =~ /64/); 2050 vorr $dat1,$ivec,$ivec 2051___ 2052$code.=<<___ if ($flavour !~ /64/); 2053 rev $tctr0,$tctr0 2054___ 2055$code.=<<___; 2056 aese $tmp0,q9 2057 aesmc $tmp0,$tmp0 2058 aese $tmp1,q9 2059 aesmc $tmp1,$tmp1 2060 vld1.8 {$in2},[$inp],#16 2061 mov $key_,$key 2062 aese $dat2,q9 2063 aesmc $tmp2,$dat2 2064___ 2065$code.=<<___ if ($flavour =~ /64/); 2066 vorr $dat2,$ivec,$ivec 2067 add $tctr0,$ctr,#1 2068___ 2069$code.=<<___; 2070 aese $tmp0,q12 2071 aesmc $tmp0,$tmp0 2072 aese $tmp1,q12 2073 aesmc $tmp1,$tmp1 2074 veor $in0,$in0,$rndlast 2075 add $tctr1,$ctr,#2 2076 aese $tmp2,q12 2077 aesmc $tmp2,$tmp2 2078 veor $in1,$in1,$rndlast 2079 add $ctr,$ctr,#3 2080 aese $tmp0,q13 2081 aesmc $tmp0,$tmp0 2082 aese $tmp1,q13 2083 aesmc $tmp1,$tmp1 2084 veor $in2,$in2,$rndlast 2085___ 2086$code.=<<___ if ($flavour =~ /64/); 2087 rev $tctr0,$tctr0 2088 aese $tmp2,q13 2089 aesmc $tmp2,$tmp2 2090 vmov.32 ${dat0}[3], $tctr0 2091___ 2092$code.=<<___ if ($flavour !~ /64/); 2093 vmov.32 ${ivec}[3], $tctr0 2094 aese $tmp2,q13 2095 aesmc $tmp2,$tmp2 2096 vorr $dat0,$ivec,$ivec 2097___ 2098$code.=<<___; 2099 rev $tctr1,$tctr1 2100 aese $tmp0,q14 2101 aesmc $tmp0,$tmp0 2102___ 2103$code.=<<___ if ($flavour !~ /64/); 2104 vmov.32 ${ivec}[3], $tctr1 2105 rev $tctr2,$ctr 2106___ 2107$code.=<<___; 2108 aese $tmp1,q14 2109 aesmc $tmp1,$tmp1 2110___ 2111$code.=<<___ if ($flavour =~ /64/); 2112 vmov.32 ${dat1}[3], $tctr1 2113 rev $tctr2,$ctr 2114 aese $tmp2,q14 2115 aesmc $tmp2,$tmp2 2116 vmov.32 ${dat2}[3], $tctr2 2117___ 2118$code.=<<___ if ($flavour !~ /64/); 2119 vorr $dat1,$ivec,$ivec 2120 vmov.32 ${ivec}[3], $tctr2 2121 aese $tmp2,q14 2122 aesmc $tmp2,$tmp2 2123 vorr $dat2,$ivec,$ivec 2124___ 2125$code.=<<___; 2126 subs $len,$len,#3 2127 aese $tmp0,q15 2128 aese $tmp1,q15 2129 aese $tmp2,q15 2130 2131 veor $in0,$in0,$tmp0 2132 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2133 vst1.8 {$in0},[$out],#16 2134 veor $in1,$in1,$tmp1 2135 mov $cnt,$rounds 2136 vst1.8 {$in1},[$out],#16 2137 veor $in2,$in2,$tmp2 2138 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2139 vst1.8 {$in2},[$out],#16 2140 b.hs .Loop3x_ctr32 2141 2142 adds $len,$len,#3 2143 b.eq .Lctr32_done 2144 cmp $len,#1 2145 mov $step,#16 2146 cclr $step,eq 2147 2148.Lctr32_tail: 2149 aese $dat0,q8 2150 aesmc $dat0,$dat0 2151 aese $dat1,q8 2152 aesmc $dat1,$dat1 2153 vld1.32 {q8},[$key_],#16 2154 subs $cnt,$cnt,#2 2155 aese $dat0,q9 2156 aesmc $dat0,$dat0 2157 aese $dat1,q9 2158 aesmc $dat1,$dat1 2159 vld1.32 {q9},[$key_],#16 2160 b.gt .Lctr32_tail 2161 2162 aese $dat0,q8 2163 aesmc $dat0,$dat0 2164 aese $dat1,q8 2165 aesmc $dat1,$dat1 2166 aese $dat0,q9 2167 aesmc $dat0,$dat0 2168 aese $dat1,q9 2169 aesmc $dat1,$dat1 2170 vld1.8 {$in0},[$inp],$step 2171 aese $dat0,q12 2172 aesmc $dat0,$dat0 2173 aese $dat1,q12 2174 aesmc $dat1,$dat1 2175 vld1.8 {$in1},[$inp] 2176 aese $dat0,q13 2177 aesmc $dat0,$dat0 2178 aese $dat1,q13 2179 aesmc $dat1,$dat1 2180 veor $in0,$in0,$rndlast 2181 aese $dat0,q14 2182 aesmc $dat0,$dat0 2183 aese $dat1,q14 2184 aesmc $dat1,$dat1 2185 veor $in1,$in1,$rndlast 2186 aese $dat0,q15 2187 aese $dat1,q15 2188 2189 cmp $len,#1 2190 veor $in0,$in0,$dat0 2191 veor $in1,$in1,$dat1 2192 vst1.8 {$in0},[$out],#16 2193 b.eq .Lctr32_done 2194 vst1.8 {$in1},[$out] 2195 2196.Lctr32_done: 2197___ 2198$code.=<<___ if ($flavour !~ /64/); 2199 vldmia sp!,{d8-d15} 2200 ldmia sp!,{r4-r10,pc} 2201___ 2202$code.=<<___ if ($flavour =~ /64/); 2203 ldr x29,[sp],#16 2204 ret 2205___ 2206$code.=<<___; 2207.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 2208___ 2209}}} 2210# Performance in cycles per byte. 2211# Processed with AES-XTS different key size. 2212# It shows the value before and after optimization as below: 2213# (before/after): 2214# 2215# AES-128-XTS AES-256-XTS 2216# Cortex-A57 3.36/1.09 4.02/1.37 2217# Cortex-A72 3.03/1.02 3.28/1.33 2218 2219# Optimization is implemented by loop unrolling and interleaving. 2220# Commonly, we choose the unrolling factor as 5, if the input 2221# data size smaller than 5 blocks, but not smaller than 3 blocks, 2222# choose 3 as the unrolling factor. 2223# If the input data size dsize >= 5*16 bytes, then take 5 blocks 2224# as one iteration, every loop the left size lsize -= 5*16. 2225# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes 2226# will be processed specially, which be integrated into the 5*16 bytes 2227# loop to improve the efficiency. 2228# There is one special case, if the original input data size dsize 2229# = 16 bytes, we will treat it seperately to improve the 2230# performance: one independent code block without LR, FP load and 2231# store. 2232# Encryption will process the (length -tailcnt) bytes as mentioned 2233# previously, then encrypt the composite block as last second 2234# cipher block. 2235# Decryption will process the (length -tailcnt -1) bytes as mentioned 2236# previously, then decrypt the last second cipher block to get the 2237# last plain block(tail), decrypt the composite block as last second 2238# plain text block. 2239 2240{{{ 2241my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); 2242my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); 2243my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); 2244my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); 2245my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); 2246my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 2247my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); 2248my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); 2249my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); 2250 2251my ($tmpin)=("v26.16b"); 2252my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 2253 2254# q7 last round key 2255# q10-q15, q7 Last 7 round keys 2256# q8-q9 preloaded round keys except last 7 keys for big size 2257# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte 2258 2259 2260my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 2261 2262my ($dat3,$in3,$tmp3); # used only in 64-bit mode 2263my ($dat4,$in4,$tmp4); 2264if ($flavour =~ /64/) { 2265 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 2266} 2267 2268$code.=<<___ if ($flavour =~ /64/); 2269.globl ${prefix}_xts_encrypt 2270.type ${prefix}_xts_encrypt,%function 2271.align 5 2272${prefix}_xts_encrypt: 2273___ 2274$code.=<<___ if ($flavour =~ /64/); 2275 cmp $len,#16 2276 // Original input data size bigger than 16, jump to big size processing. 2277 b.ne .Lxts_enc_big_size 2278 // Encrypt the iv with key2, as the first XEX iv. 2279 ldr $rounds,[$key2,#240] 2280 vld1.32 {$dat},[$key2],#16 2281 vld1.8 {$iv0},[$ivp] 2282 sub $rounds,$rounds,#2 2283 vld1.32 {$dat1},[$key2],#16 2284 2285.Loop_enc_iv_enc: 2286 aese $iv0,$dat 2287 aesmc $iv0,$iv0 2288 vld1.32 {$dat},[$key2],#16 2289 subs $rounds,$rounds,#2 2290 aese $iv0,$dat1 2291 aesmc $iv0,$iv0 2292 vld1.32 {$dat1},[$key2],#16 2293 b.gt .Loop_enc_iv_enc 2294 2295 aese $iv0,$dat 2296 aesmc $iv0,$iv0 2297 vld1.32 {$dat},[$key2] 2298 aese $iv0,$dat1 2299 veor $iv0,$iv0,$dat 2300 2301 vld1.8 {$dat0},[$inp] 2302 veor $dat0,$iv0,$dat0 2303 2304 ldr $rounds,[$key1,#240] 2305 vld1.32 {q20-q21},[$key1],#32 // load key schedule... 2306 2307 aese $dat0,q20 2308 aesmc $dat0,$dat0 2309 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 2310 aese $dat0,q21 2311 aesmc $dat0,$dat0 2312 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing 2313 b.eq .Lxts_128_enc 2314.Lxts_enc_round_loop: 2315 aese $dat0,q8 2316 aesmc $dat0,$dat0 2317 vld1.32 {q8},[$key1],#16 // load key schedule... 2318 aese $dat0,q9 2319 aesmc $dat0,$dat0 2320 vld1.32 {q9},[$key1],#16 // load key schedule... 2321 subs $rounds,$rounds,#2 // bias 2322 b.gt .Lxts_enc_round_loop 2323.Lxts_128_enc: 2324 vld1.32 {q10-q11},[$key1],#32 // load key schedule... 2325 aese $dat0,q8 2326 aesmc $dat0,$dat0 2327 aese $dat0,q9 2328 aesmc $dat0,$dat0 2329 vld1.32 {q12-q13},[$key1],#32 // load key schedule... 2330 aese $dat0,q10 2331 aesmc $dat0,$dat0 2332 aese $dat0,q11 2333 aesmc $dat0,$dat0 2334 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 2335 aese $dat0,q12 2336 aesmc $dat0,$dat0 2337 aese $dat0,q13 2338 aesmc $dat0,$dat0 2339 vld1.32 {$rndlast},[$key1] 2340 aese $dat0,q14 2341 aesmc $dat0,$dat0 2342 aese $dat0,q15 2343 veor $dat0,$dat0,$rndlast 2344 veor $dat0,$dat0,$iv0 2345 vst1.8 {$dat0},[$out] 2346 b .Lxts_enc_final_abort 2347 2348.align 4 2349.Lxts_enc_big_size: 2350___ 2351$code.=<<___ if ($flavour =~ /64/); 2352 stp $constnumx,$tmpinp,[sp,#-64]! 2353 stp $tailcnt,$midnumx,[sp,#48] 2354 stp $ivd10,$ivd20,[sp,#32] 2355 stp $ivd30,$ivd40,[sp,#16] 2356 2357 // tailcnt store the tail value of length%16. 2358 and $tailcnt,$len,#0xf 2359 and $len,$len,#-16 2360 subs $len,$len,#16 2361 mov $step,#16 2362 b.lo .Lxts_abort 2363 csel $step,xzr,$step,eq 2364 2365 // Firstly, encrypt the iv with key2, as the first iv of XEX. 2366 ldr $rounds,[$key2,#240] 2367 vld1.32 {$dat},[$key2],#16 2368 vld1.8 {$iv0},[$ivp] 2369 sub $rounds,$rounds,#2 2370 vld1.32 {$dat1},[$key2],#16 2371 2372.Loop_iv_enc: 2373 aese $iv0,$dat 2374 aesmc $iv0,$iv0 2375 vld1.32 {$dat},[$key2],#16 2376 subs $rounds,$rounds,#2 2377 aese $iv0,$dat1 2378 aesmc $iv0,$iv0 2379 vld1.32 {$dat1},[$key2],#16 2380 b.gt .Loop_iv_enc 2381 2382 aese $iv0,$dat 2383 aesmc $iv0,$iv0 2384 vld1.32 {$dat},[$key2] 2385 aese $iv0,$dat1 2386 veor $iv0,$iv0,$dat 2387 2388 // The iv for second block 2389 // $ivl- iv(low), $ivh - iv(high) 2390 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 2391 fmov $ivl,$ivd00 2392 fmov $ivh,$ivd01 2393 mov $constnum,#0x87 2394 extr $midnumx,$ivh,$ivh,#32 2395 extr $ivh,$ivh,$ivl,#63 2396 and $tmpmw,$constnum,$midnum,asr#31 2397 eor $ivl,$tmpmx,$ivl,lsl#1 2398 fmov $ivd10,$ivl 2399 fmov $ivd11,$ivh 2400 2401 ldr $rounds0,[$key1,#240] // next starting point 2402 vld1.8 {$dat},[$inp],$step 2403 2404 vld1.32 {q8-q9},[$key1] // load key schedule... 2405 sub $rounds0,$rounds0,#6 2406 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys 2407 sub $rounds0,$rounds0,#2 2408 vld1.32 {q10-q11},[$key_],#32 2409 vld1.32 {q12-q13},[$key_],#32 2410 vld1.32 {q14-q15},[$key_],#32 2411 vld1.32 {$rndlast},[$key_] 2412 2413 add $key_,$key1,#32 2414 mov $rounds,$rounds0 2415 2416 // Encryption 2417.Lxts_enc: 2418 vld1.8 {$dat2},[$inp],#16 2419 subs $len,$len,#32 // bias 2420 add $rounds,$rounds0,#2 2421 vorr $in1,$dat,$dat 2422 vorr $dat1,$dat,$dat 2423 vorr $in3,$dat,$dat 2424 vorr $in2,$dat2,$dat2 2425 vorr $in4,$dat2,$dat2 2426 b.lo .Lxts_inner_enc_tail 2427 veor $dat,$dat,$iv0 // before encryption, xor with iv 2428 veor $dat2,$dat2,$iv1 2429 2430 // The iv for third block 2431 extr $midnumx,$ivh,$ivh,#32 2432 extr $ivh,$ivh,$ivl,#63 2433 and $tmpmw,$constnum,$midnum,asr#31 2434 eor $ivl,$tmpmx,$ivl,lsl#1 2435 fmov $ivd20,$ivl 2436 fmov $ivd21,$ivh 2437 2438 2439 vorr $dat1,$dat2,$dat2 2440 vld1.8 {$dat2},[$inp],#16 2441 vorr $in0,$dat,$dat 2442 vorr $in1,$dat1,$dat1 2443 veor $in2,$dat2,$iv2 // the third block 2444 veor $dat2,$dat2,$iv2 2445 cmp $len,#32 2446 b.lo .Lxts_outer_enc_tail 2447 2448 // The iv for fourth block 2449 extr $midnumx,$ivh,$ivh,#32 2450 extr $ivh,$ivh,$ivl,#63 2451 and $tmpmw,$constnum,$midnum,asr#31 2452 eor $ivl,$tmpmx,$ivl,lsl#1 2453 fmov $ivd30,$ivl 2454 fmov $ivd31,$ivh 2455 2456 vld1.8 {$dat3},[$inp],#16 2457 // The iv for fifth block 2458 extr $midnumx,$ivh,$ivh,#32 2459 extr $ivh,$ivh,$ivl,#63 2460 and $tmpmw,$constnum,$midnum,asr#31 2461 eor $ivl,$tmpmx,$ivl,lsl#1 2462 fmov $ivd40,$ivl 2463 fmov $ivd41,$ivh 2464 2465 vld1.8 {$dat4},[$inp],#16 2466 veor $dat3,$dat3,$iv3 // the fourth block 2467 veor $dat4,$dat4,$iv4 2468 sub $len,$len,#32 // bias 2469 mov $rounds,$rounds0 2470 b .Loop5x_xts_enc 2471 2472.align 4 2473.Loop5x_xts_enc: 2474 aese $dat0,q8 2475 aesmc $dat0,$dat0 2476 aese $dat1,q8 2477 aesmc $dat1,$dat1 2478 aese $dat2,q8 2479 aesmc $dat2,$dat2 2480 aese $dat3,q8 2481 aesmc $dat3,$dat3 2482 aese $dat4,q8 2483 aesmc $dat4,$dat4 2484 vld1.32 {q8},[$key_],#16 2485 subs $rounds,$rounds,#2 2486 aese $dat0,q9 2487 aesmc $dat0,$dat0 2488 aese $dat1,q9 2489 aesmc $dat1,$dat1 2490 aese $dat2,q9 2491 aesmc $dat2,$dat2 2492 aese $dat3,q9 2493 aesmc $dat3,$dat3 2494 aese $dat4,q9 2495 aesmc $dat4,$dat4 2496 vld1.32 {q9},[$key_],#16 2497 b.gt .Loop5x_xts_enc 2498 2499 aese $dat0,q8 2500 aesmc $dat0,$dat0 2501 aese $dat1,q8 2502 aesmc $dat1,$dat1 2503 aese $dat2,q8 2504 aesmc $dat2,$dat2 2505 aese $dat3,q8 2506 aesmc $dat3,$dat3 2507 aese $dat4,q8 2508 aesmc $dat4,$dat4 2509 subs $len,$len,#0x50 // because .Lxts_enc_tail4x 2510 2511 aese $dat0,q9 2512 aesmc $dat0,$dat0 2513 aese $dat1,q9 2514 aesmc $dat1,$dat1 2515 aese $dat2,q9 2516 aesmc $dat2,$dat2 2517 aese $dat3,q9 2518 aesmc $dat3,$dat3 2519 aese $dat4,q9 2520 aesmc $dat4,$dat4 2521 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo 2522 mov $key_,$key1 2523 2524 aese $dat0,q10 2525 aesmc $dat0,$dat0 2526 aese $dat1,q10 2527 aesmc $dat1,$dat1 2528 aese $dat2,q10 2529 aesmc $dat2,$dat2 2530 aese $dat3,q10 2531 aesmc $dat3,$dat3 2532 aese $dat4,q10 2533 aesmc $dat4,$dat4 2534 add $inp,$inp,$xoffset // x0 is adjusted in such way that 2535 // at exit from the loop v1.16b-v26.16b 2536 // are loaded with last "words" 2537 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x 2538 2539 aese $dat0,q11 2540 aesmc $dat0,$dat0 2541 aese $dat1,q11 2542 aesmc $dat1,$dat1 2543 aese $dat2,q11 2544 aesmc $dat2,$dat2 2545 aese $dat3,q11 2546 aesmc $dat3,$dat3 2547 aese $dat4,q11 2548 aesmc $dat4,$dat4 2549 2550 aese $dat0,q12 2551 aesmc $dat0,$dat0 2552 aese $dat1,q12 2553 aesmc $dat1,$dat1 2554 aese $dat2,q12 2555 aesmc $dat2,$dat2 2556 aese $dat3,q12 2557 aesmc $dat3,$dat3 2558 aese $dat4,q12 2559 aesmc $dat4,$dat4 2560 2561 aese $dat0,q13 2562 aesmc $dat0,$dat0 2563 aese $dat1,q13 2564 aesmc $dat1,$dat1 2565 aese $dat2,q13 2566 aesmc $dat2,$dat2 2567 aese $dat3,q13 2568 aesmc $dat3,$dat3 2569 aese $dat4,q13 2570 aesmc $dat4,$dat4 2571 2572 aese $dat0,q14 2573 aesmc $dat0,$dat0 2574 aese $dat1,q14 2575 aesmc $dat1,$dat1 2576 aese $dat2,q14 2577 aesmc $dat2,$dat2 2578 aese $dat3,q14 2579 aesmc $dat3,$dat3 2580 aese $dat4,q14 2581 aesmc $dat4,$dat4 2582 2583 veor $tmp0,$rndlast,$iv0 2584 aese $dat0,q15 2585 // The iv for first block of one iteration 2586 extr $midnumx,$ivh,$ivh,#32 2587 extr $ivh,$ivh,$ivl,#63 2588 and $tmpmw,$constnum,$midnum,asr#31 2589 eor $ivl,$tmpmx,$ivl,lsl#1 2590 fmov $ivd00,$ivl 2591 fmov $ivd01,$ivh 2592 veor $tmp1,$rndlast,$iv1 2593 vld1.8 {$in0},[$inp],#16 2594 aese $dat1,q15 2595 // The iv for second block 2596 extr $midnumx,$ivh,$ivh,#32 2597 extr $ivh,$ivh,$ivl,#63 2598 and $tmpmw,$constnum,$midnum,asr#31 2599 eor $ivl,$tmpmx,$ivl,lsl#1 2600 fmov $ivd10,$ivl 2601 fmov $ivd11,$ivh 2602 veor $tmp2,$rndlast,$iv2 2603 vld1.8 {$in1},[$inp],#16 2604 aese $dat2,q15 2605 // The iv for third block 2606 extr $midnumx,$ivh,$ivh,#32 2607 extr $ivh,$ivh,$ivl,#63 2608 and $tmpmw,$constnum,$midnum,asr#31 2609 eor $ivl,$tmpmx,$ivl,lsl#1 2610 fmov $ivd20,$ivl 2611 fmov $ivd21,$ivh 2612 veor $tmp3,$rndlast,$iv3 2613 vld1.8 {$in2},[$inp],#16 2614 aese $dat3,q15 2615 // The iv for fourth block 2616 extr $midnumx,$ivh,$ivh,#32 2617 extr $ivh,$ivh,$ivl,#63 2618 and $tmpmw,$constnum,$midnum,asr#31 2619 eor $ivl,$tmpmx,$ivl,lsl#1 2620 fmov $ivd30,$ivl 2621 fmov $ivd31,$ivh 2622 veor $tmp4,$rndlast,$iv4 2623 vld1.8 {$in3},[$inp],#16 2624 aese $dat4,q15 2625 2626 // The iv for fifth block 2627 extr $midnumx,$ivh,$ivh,#32 2628 extr $ivh,$ivh,$ivl,#63 2629 and $tmpmw,$constnum,$midnum,asr #31 2630 eor $ivl,$tmpmx,$ivl,lsl #1 2631 fmov $ivd40,$ivl 2632 fmov $ivd41,$ivh 2633 2634 vld1.8 {$in4},[$inp],#16 2635 cbz $xoffset,.Lxts_enc_tail4x 2636 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2637 veor $tmp0,$tmp0,$dat0 2638 veor $dat0,$in0,$iv0 2639 veor $tmp1,$tmp1,$dat1 2640 veor $dat1,$in1,$iv1 2641 veor $tmp2,$tmp2,$dat2 2642 veor $dat2,$in2,$iv2 2643 veor $tmp3,$tmp3,$dat3 2644 veor $dat3,$in3,$iv3 2645 veor $tmp4,$tmp4,$dat4 2646 vst1.8 {$tmp0},[$out],#16 2647 veor $dat4,$in4,$iv4 2648 vst1.8 {$tmp1},[$out],#16 2649 mov $rounds,$rounds0 2650 vst1.8 {$tmp2},[$out],#16 2651 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2652 vst1.8 {$tmp3},[$out],#16 2653 vst1.8 {$tmp4},[$out],#16 2654 b.hs .Loop5x_xts_enc 2655 2656 2657 // If left 4 blocks, borrow the five block's processing. 2658 cmn $len,#0x10 2659 b.ne .Loop5x_enc_after 2660 vorr $iv4,$iv3,$iv3 2661 vorr $iv3,$iv2,$iv2 2662 vorr $iv2,$iv1,$iv1 2663 vorr $iv1,$iv0,$iv0 2664 fmov $ivl,$ivd40 2665 fmov $ivh,$ivd41 2666 veor $dat0,$iv0,$in0 2667 veor $dat1,$iv1,$in1 2668 veor $dat2,$in2,$iv2 2669 veor $dat3,$in3,$iv3 2670 veor $dat4,$in4,$iv4 2671 b.eq .Loop5x_xts_enc 2672 2673.Loop5x_enc_after: 2674 add $len,$len,#0x50 2675 cbz $len,.Lxts_enc_done 2676 2677 add $rounds,$rounds0,#2 2678 subs $len,$len,#0x30 2679 b.lo .Lxts_inner_enc_tail 2680 2681 veor $dat0,$iv0,$in2 2682 veor $dat1,$iv1,$in3 2683 veor $dat2,$in4,$iv2 2684 b .Lxts_outer_enc_tail 2685 2686.align 4 2687.Lxts_enc_tail4x: 2688 add $inp,$inp,#16 2689 veor $tmp1,$dat1,$tmp1 2690 vst1.8 {$tmp1},[$out],#16 2691 veor $tmp2,$dat2,$tmp2 2692 vst1.8 {$tmp2},[$out],#16 2693 veor $tmp3,$dat3,$tmp3 2694 veor $tmp4,$dat4,$tmp4 2695 vst1.8 {$tmp3-$tmp4},[$out],#32 2696 2697 b .Lxts_enc_done 2698.align 4 2699.Lxts_outer_enc_tail: 2700 aese $dat0,q8 2701 aesmc $dat0,$dat0 2702 aese $dat1,q8 2703 aesmc $dat1,$dat1 2704 aese $dat2,q8 2705 aesmc $dat2,$dat2 2706 vld1.32 {q8},[$key_],#16 2707 subs $rounds,$rounds,#2 2708 aese $dat0,q9 2709 aesmc $dat0,$dat0 2710 aese $dat1,q9 2711 aesmc $dat1,$dat1 2712 aese $dat2,q9 2713 aesmc $dat2,$dat2 2714 vld1.32 {q9},[$key_],#16 2715 b.gt .Lxts_outer_enc_tail 2716 2717 aese $dat0,q8 2718 aesmc $dat0,$dat0 2719 aese $dat1,q8 2720 aesmc $dat1,$dat1 2721 aese $dat2,q8 2722 aesmc $dat2,$dat2 2723 veor $tmp0,$iv0,$rndlast 2724 subs $len,$len,#0x30 2725 // The iv for first block 2726 fmov $ivl,$ivd20 2727 fmov $ivh,$ivd21 2728 //mov $constnum,#0x87 2729 extr $midnumx,$ivh,$ivh,#32 2730 extr $ivh,$ivh,$ivl,#63 2731 and $tmpmw,$constnum,$midnum,asr#31 2732 eor $ivl,$tmpmx,$ivl,lsl#1 2733 fmov $ivd00,$ivl 2734 fmov $ivd01,$ivh 2735 veor $tmp1,$iv1,$rndlast 2736 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point 2737 aese $dat0,q9 2738 aesmc $dat0,$dat0 2739 aese $dat1,q9 2740 aesmc $dat1,$dat1 2741 aese $dat2,q9 2742 aesmc $dat2,$dat2 2743 veor $tmp2,$iv2,$rndlast 2744 2745 add $xoffset,$xoffset,#0x20 2746 add $inp,$inp,$xoffset 2747 mov $key_,$key1 2748 2749 aese $dat0,q12 2750 aesmc $dat0,$dat0 2751 aese $dat1,q12 2752 aesmc $dat1,$dat1 2753 aese $dat2,q12 2754 aesmc $dat2,$dat2 2755 aese $dat0,q13 2756 aesmc $dat0,$dat0 2757 aese $dat1,q13 2758 aesmc $dat1,$dat1 2759 aese $dat2,q13 2760 aesmc $dat2,$dat2 2761 aese $dat0,q14 2762 aesmc $dat0,$dat0 2763 aese $dat1,q14 2764 aesmc $dat1,$dat1 2765 aese $dat2,q14 2766 aesmc $dat2,$dat2 2767 aese $dat0,q15 2768 aese $dat1,q15 2769 aese $dat2,q15 2770 vld1.8 {$in2},[$inp],#16 2771 add $rounds,$rounds0,#2 2772 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2773 veor $tmp0,$tmp0,$dat0 2774 veor $tmp1,$tmp1,$dat1 2775 veor $dat2,$dat2,$tmp2 2776 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2777 vst1.8 {$tmp0},[$out],#16 2778 vst1.8 {$tmp1},[$out],#16 2779 vst1.8 {$dat2},[$out],#16 2780 cmn $len,#0x30 2781 b.eq .Lxts_enc_done 2782.Lxts_encxor_one: 2783 vorr $in3,$in1,$in1 2784 vorr $in4,$in2,$in2 2785 nop 2786 2787.Lxts_inner_enc_tail: 2788 cmn $len,#0x10 2789 veor $dat1,$in3,$iv0 2790 veor $dat2,$in4,$iv1 2791 b.eq .Lxts_enc_tail_loop 2792 veor $dat2,$in4,$iv0 2793.Lxts_enc_tail_loop: 2794 aese $dat1,q8 2795 aesmc $dat1,$dat1 2796 aese $dat2,q8 2797 aesmc $dat2,$dat2 2798 vld1.32 {q8},[$key_],#16 2799 subs $rounds,$rounds,#2 2800 aese $dat1,q9 2801 aesmc $dat1,$dat1 2802 aese $dat2,q9 2803 aesmc $dat2,$dat2 2804 vld1.32 {q9},[$key_],#16 2805 b.gt .Lxts_enc_tail_loop 2806 2807 aese $dat1,q8 2808 aesmc $dat1,$dat1 2809 aese $dat2,q8 2810 aesmc $dat2,$dat2 2811 aese $dat1,q9 2812 aesmc $dat1,$dat1 2813 aese $dat2,q9 2814 aesmc $dat2,$dat2 2815 aese $dat1,q12 2816 aesmc $dat1,$dat1 2817 aese $dat2,q12 2818 aesmc $dat2,$dat2 2819 cmn $len,#0x20 2820 aese $dat1,q13 2821 aesmc $dat1,$dat1 2822 aese $dat2,q13 2823 aesmc $dat2,$dat2 2824 veor $tmp1,$iv0,$rndlast 2825 aese $dat1,q14 2826 aesmc $dat1,$dat1 2827 aese $dat2,q14 2828 aesmc $dat2,$dat2 2829 veor $tmp2,$iv1,$rndlast 2830 aese $dat1,q15 2831 aese $dat2,q15 2832 b.eq .Lxts_enc_one 2833 veor $tmp1,$tmp1,$dat1 2834 vst1.8 {$tmp1},[$out],#16 2835 veor $tmp2,$tmp2,$dat2 2836 vorr $iv0,$iv1,$iv1 2837 vst1.8 {$tmp2},[$out],#16 2838 fmov $ivl,$ivd10 2839 fmov $ivh,$ivd11 2840 mov $constnum,#0x87 2841 extr $midnumx,$ivh,$ivh,#32 2842 extr $ivh,$ivh,$ivl,#63 2843 and $tmpmw,$constnum,$midnum,asr #31 2844 eor $ivl,$tmpmx,$ivl,lsl #1 2845 fmov $ivd00,$ivl 2846 fmov $ivd01,$ivh 2847 b .Lxts_enc_done 2848 2849.Lxts_enc_one: 2850 veor $tmp1,$tmp1,$dat2 2851 vorr $iv0,$iv0,$iv0 2852 vst1.8 {$tmp1},[$out],#16 2853 fmov $ivl,$ivd00 2854 fmov $ivh,$ivd01 2855 mov $constnum,#0x87 2856 extr $midnumx,$ivh,$ivh,#32 2857 extr $ivh,$ivh,$ivl,#63 2858 and $tmpmw,$constnum,$midnum,asr #31 2859 eor $ivl,$tmpmx,$ivl,lsl #1 2860 fmov $ivd00,$ivl 2861 fmov $ivd01,$ivh 2862 b .Lxts_enc_done 2863.align 5 2864.Lxts_enc_done: 2865 // Process the tail block with cipher stealing. 2866 tst $tailcnt,#0xf 2867 b.eq .Lxts_abort 2868 2869 mov $tmpinp,$inp 2870 mov $tmpoutp,$out 2871 sub $out,$out,#16 2872.composite_enc_loop: 2873 subs $tailcnt,$tailcnt,#1 2874 ldrb $l2outp,[$out,$tailcnt] 2875 ldrb $loutp,[$tmpinp,$tailcnt] 2876 strb $l2outp,[$tmpoutp,$tailcnt] 2877 strb $loutp,[$out,$tailcnt] 2878 b.gt .composite_enc_loop 2879.Lxts_enc_load_done: 2880 vld1.8 {$tmpin},[$out] 2881 veor $tmpin,$tmpin,$iv0 2882 2883 // Encrypt the composite block to get the last second encrypted text block 2884 ldr $rounds,[$key1,#240] // load key schedule... 2885 vld1.32 {$dat},[$key1],#16 2886 sub $rounds,$rounds,#2 2887 vld1.32 {$dat1},[$key1],#16 // load key schedule... 2888.Loop_final_enc: 2889 aese $tmpin,$dat0 2890 aesmc $tmpin,$tmpin 2891 vld1.32 {$dat0},[$key1],#16 2892 subs $rounds,$rounds,#2 2893 aese $tmpin,$dat1 2894 aesmc $tmpin,$tmpin 2895 vld1.32 {$dat1},[$key1],#16 2896 b.gt .Loop_final_enc 2897 2898 aese $tmpin,$dat0 2899 aesmc $tmpin,$tmpin 2900 vld1.32 {$dat0},[$key1] 2901 aese $tmpin,$dat1 2902 veor $tmpin,$tmpin,$dat0 2903 veor $tmpin,$tmpin,$iv0 2904 vst1.8 {$tmpin},[$out] 2905 2906.Lxts_abort: 2907 ldp $tailcnt,$midnumx,[sp,#48] 2908 ldp $ivd10,$ivd20,[sp,#32] 2909 ldp $ivd30,$ivd40,[sp,#16] 2910 ldp $constnumx,$tmpinp,[sp],#64 2911.Lxts_enc_final_abort: 2912 ret 2913.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt 2914___ 2915 2916}}} 2917{{{ 2918my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); 2919my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); 2920my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); 2921my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); 2922my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); 2923my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 2924my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); 2925my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); 2926my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); 2927 2928my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 2929 2930# q7 last round key 2931# q10-q15, q7 Last 7 round keys 2932# q8-q9 preloaded round keys except last 7 keys for big size 2933# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte 2934 2935{ 2936my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 2937 2938my ($dat3,$in3,$tmp3); # used only in 64-bit mode 2939my ($dat4,$in4,$tmp4); 2940if ($flavour =~ /64/) { 2941 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 2942} 2943 2944$code.=<<___ if ($flavour =~ /64/); 2945.globl ${prefix}_xts_decrypt 2946.type ${prefix}_xts_decrypt,%function 2947.align 5 2948${prefix}_xts_decrypt: 2949___ 2950$code.=<<___ if ($flavour =~ /64/); 2951 cmp $len,#16 2952 // Original input data size bigger than 16, jump to big size processing. 2953 b.ne .Lxts_dec_big_size 2954 // Encrypt the iv with key2, as the first XEX iv. 2955 ldr $rounds,[$key2,#240] 2956 vld1.32 {$dat},[$key2],#16 2957 vld1.8 {$iv0},[$ivp] 2958 sub $rounds,$rounds,#2 2959 vld1.32 {$dat1},[$key2],#16 2960 2961.Loop_dec_small_iv_enc: 2962 aese $iv0,$dat 2963 aesmc $iv0,$iv0 2964 vld1.32 {$dat},[$key2],#16 2965 subs $rounds,$rounds,#2 2966 aese $iv0,$dat1 2967 aesmc $iv0,$iv0 2968 vld1.32 {$dat1},[$key2],#16 2969 b.gt .Loop_dec_small_iv_enc 2970 2971 aese $iv0,$dat 2972 aesmc $iv0,$iv0 2973 vld1.32 {$dat},[$key2] 2974 aese $iv0,$dat1 2975 veor $iv0,$iv0,$dat 2976 2977 vld1.8 {$dat0},[$inp] 2978 veor $dat0,$iv0,$dat0 2979 2980 ldr $rounds,[$key1,#240] 2981 vld1.32 {q20-q21},[$key1],#32 // load key schedule... 2982 2983 aesd $dat0,q20 2984 aesimc $dat0,$dat0 2985 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 2986 aesd $dat0,q21 2987 aesimc $dat0,$dat0 2988 subs $rounds,$rounds,#10 // bias 2989 b.eq .Lxts_128_dec 2990.Lxts_dec_round_loop: 2991 aesd $dat0,q8 2992 aesimc $dat0,$dat0 2993 vld1.32 {q8},[$key1],#16 // load key schedule... 2994 aesd $dat0,q9 2995 aesimc $dat0,$dat0 2996 vld1.32 {q9},[$key1],#16 // load key schedule... 2997 subs $rounds,$rounds,#2 // bias 2998 b.gt .Lxts_dec_round_loop 2999.Lxts_128_dec: 3000 vld1.32 {q10-q11},[$key1],#32 // load key schedule... 3001 aesd $dat0,q8 3002 aesimc $dat0,$dat0 3003 aesd $dat0,q9 3004 aesimc $dat0,$dat0 3005 vld1.32 {q12-q13},[$key1],#32 // load key schedule... 3006 aesd $dat0,q10 3007 aesimc $dat0,$dat0 3008 aesd $dat0,q11 3009 aesimc $dat0,$dat0 3010 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 3011 aesd $dat0,q12 3012 aesimc $dat0,$dat0 3013 aesd $dat0,q13 3014 aesimc $dat0,$dat0 3015 vld1.32 {$rndlast},[$key1] 3016 aesd $dat0,q14 3017 aesimc $dat0,$dat0 3018 aesd $dat0,q15 3019 veor $dat0,$dat0,$rndlast 3020 veor $dat0,$iv0,$dat0 3021 vst1.8 {$dat0},[$out] 3022 b .Lxts_dec_final_abort 3023.Lxts_dec_big_size: 3024___ 3025$code.=<<___ if ($flavour =~ /64/); 3026 stp $constnumx,$tmpinp,[sp,#-64]! 3027 stp $tailcnt,$midnumx,[sp,#48] 3028 stp $ivd10,$ivd20,[sp,#32] 3029 stp $ivd30,$ivd40,[sp,#16] 3030 3031 and $tailcnt,$len,#0xf 3032 and $len,$len,#-16 3033 subs $len,$len,#16 3034 mov $step,#16 3035 b.lo .Lxts_dec_abort 3036 3037 // Encrypt the iv with key2, as the first XEX iv 3038 ldr $rounds,[$key2,#240] 3039 vld1.32 {$dat},[$key2],#16 3040 vld1.8 {$iv0},[$ivp] 3041 sub $rounds,$rounds,#2 3042 vld1.32 {$dat1},[$key2],#16 3043 3044.Loop_dec_iv_enc: 3045 aese $iv0,$dat 3046 aesmc $iv0,$iv0 3047 vld1.32 {$dat},[$key2],#16 3048 subs $rounds,$rounds,#2 3049 aese $iv0,$dat1 3050 aesmc $iv0,$iv0 3051 vld1.32 {$dat1},[$key2],#16 3052 b.gt .Loop_dec_iv_enc 3053 3054 aese $iv0,$dat 3055 aesmc $iv0,$iv0 3056 vld1.32 {$dat},[$key2] 3057 aese $iv0,$dat1 3058 veor $iv0,$iv0,$dat 3059 3060 // The iv for second block 3061 // $ivl- iv(low), $ivh - iv(high) 3062 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 3063 fmov $ivl,$ivd00 3064 fmov $ivh,$ivd01 3065 mov $constnum,#0x87 3066 extr $midnumx,$ivh,$ivh,#32 3067 extr $ivh,$ivh,$ivl,#63 3068 and $tmpmw,$constnum,$midnum,asr #31 3069 eor $ivl,$tmpmx,$ivl,lsl #1 3070 fmov $ivd10,$ivl 3071 fmov $ivd11,$ivh 3072 3073 ldr $rounds0,[$key1,#240] // load rounds number 3074 3075 // The iv for third block 3076 extr $midnumx,$ivh,$ivh,#32 3077 extr $ivh,$ivh,$ivl,#63 3078 and $tmpmw,$constnum,$midnum,asr #31 3079 eor $ivl,$tmpmx,$ivl,lsl #1 3080 fmov $ivd20,$ivl 3081 fmov $ivd21,$ivh 3082 3083 vld1.32 {q8-q9},[$key1] // load key schedule... 3084 sub $rounds0,$rounds0,#6 3085 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys 3086 sub $rounds0,$rounds0,#2 3087 vld1.32 {q10-q11},[$key_],#32 // load key schedule... 3088 vld1.32 {q12-q13},[$key_],#32 3089 vld1.32 {q14-q15},[$key_],#32 3090 vld1.32 {$rndlast},[$key_] 3091 3092 // The iv for fourth block 3093 extr $midnumx,$ivh,$ivh,#32 3094 extr $ivh,$ivh,$ivl,#63 3095 and $tmpmw,$constnum,$midnum,asr #31 3096 eor $ivl,$tmpmx,$ivl,lsl #1 3097 fmov $ivd30,$ivl 3098 fmov $ivd31,$ivh 3099 3100 add $key_,$key1,#32 3101 mov $rounds,$rounds0 3102 b .Lxts_dec 3103 3104 // Decryption 3105.align 5 3106.Lxts_dec: 3107 tst $tailcnt,#0xf 3108 b.eq .Lxts_dec_begin 3109 subs $len,$len,#16 3110 csel $step,xzr,$step,eq 3111 vld1.8 {$dat},[$inp],#16 3112 b.lo .Lxts_done 3113 sub $inp,$inp,#16 3114.Lxts_dec_begin: 3115 vld1.8 {$dat},[$inp],$step 3116 subs $len,$len,#32 // bias 3117 add $rounds,$rounds0,#2 3118 vorr $in1,$dat,$dat 3119 vorr $dat1,$dat,$dat 3120 vorr $in3,$dat,$dat 3121 vld1.8 {$dat2},[$inp],#16 3122 vorr $in2,$dat2,$dat2 3123 vorr $in4,$dat2,$dat2 3124 b.lo .Lxts_inner_dec_tail 3125 veor $dat,$dat,$iv0 // before decryt, xor with iv 3126 veor $dat2,$dat2,$iv1 3127 3128 vorr $dat1,$dat2,$dat2 3129 vld1.8 {$dat2},[$inp],#16 3130 vorr $in0,$dat,$dat 3131 vorr $in1,$dat1,$dat1 3132 veor $in2,$dat2,$iv2 // third block xox with third iv 3133 veor $dat2,$dat2,$iv2 3134 cmp $len,#32 3135 b.lo .Lxts_outer_dec_tail 3136 3137 vld1.8 {$dat3},[$inp],#16 3138 3139 // The iv for fifth block 3140 extr $midnumx,$ivh,$ivh,#32 3141 extr $ivh,$ivh,$ivl,#63 3142 and $tmpmw,$constnum,$midnum,asr #31 3143 eor $ivl,$tmpmx,$ivl,lsl #1 3144 fmov $ivd40,$ivl 3145 fmov $ivd41,$ivh 3146 3147 vld1.8 {$dat4},[$inp],#16 3148 veor $dat3,$dat3,$iv3 // the fourth block 3149 veor $dat4,$dat4,$iv4 3150 sub $len,$len,#32 // bias 3151 mov $rounds,$rounds0 3152 b .Loop5x_xts_dec 3153 3154.align 4 3155.Loop5x_xts_dec: 3156 aesd $dat0,q8 3157 aesimc $dat0,$dat0 3158 aesd $dat1,q8 3159 aesimc $dat1,$dat1 3160 aesd $dat2,q8 3161 aesimc $dat2,$dat2 3162 aesd $dat3,q8 3163 aesimc $dat3,$dat3 3164 aesd $dat4,q8 3165 aesimc $dat4,$dat4 3166 vld1.32 {q8},[$key_],#16 // load key schedule... 3167 subs $rounds,$rounds,#2 3168 aesd $dat0,q9 3169 aesimc $dat0,$dat0 3170 aesd $dat1,q9 3171 aesimc $dat1,$dat1 3172 aesd $dat2,q9 3173 aesimc $dat2,$dat2 3174 aesd $dat3,q9 3175 aesimc $dat3,$dat3 3176 aesd $dat4,q9 3177 aesimc $dat4,$dat4 3178 vld1.32 {q9},[$key_],#16 // load key schedule... 3179 b.gt .Loop5x_xts_dec 3180 3181 aesd $dat0,q8 3182 aesimc $dat0,$dat0 3183 aesd $dat1,q8 3184 aesimc $dat1,$dat1 3185 aesd $dat2,q8 3186 aesimc $dat2,$dat2 3187 aesd $dat3,q8 3188 aesimc $dat3,$dat3 3189 aesd $dat4,q8 3190 aesimc $dat4,$dat4 3191 subs $len,$len,#0x50 // because .Lxts_dec_tail4x 3192 3193 aesd $dat0,q9 3194 aesimc $dat0,$dat 3195 aesd $dat1,q9 3196 aesimc $dat1,$dat1 3197 aesd $dat2,q9 3198 aesimc $dat2,$dat2 3199 aesd $dat3,q9 3200 aesimc $dat3,$dat3 3201 aesd $dat4,q9 3202 aesimc $dat4,$dat4 3203 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo 3204 mov $key_,$key1 3205 3206 aesd $dat0,q10 3207 aesimc $dat0,$dat0 3208 aesd $dat1,q10 3209 aesimc $dat1,$dat1 3210 aesd $dat2,q10 3211 aesimc $dat2,$dat2 3212 aesd $dat3,q10 3213 aesimc $dat3,$dat3 3214 aesd $dat4,q10 3215 aesimc $dat4,$dat4 3216 add $inp,$inp,$xoffset // x0 is adjusted in such way that 3217 // at exit from the loop v1.16b-v26.16b 3218 // are loaded with last "words" 3219 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x 3220 3221 aesd $dat0,q11 3222 aesimc $dat0,$dat0 3223 aesd $dat1,q11 3224 aesimc $dat1,$dat1 3225 aesd $dat2,q11 3226 aesimc $dat2,$dat2 3227 aesd $dat3,q11 3228 aesimc $dat3,$dat3 3229 aesd $dat4,q11 3230 aesimc $dat4,$dat4 3231 3232 aesd $dat0,q12 3233 aesimc $dat0,$dat0 3234 aesd $dat1,q12 3235 aesimc $dat1,$dat1 3236 aesd $dat2,q12 3237 aesimc $dat2,$dat2 3238 aesd $dat3,q12 3239 aesimc $dat3,$dat3 3240 aesd $dat4,q12 3241 aesimc $dat4,$dat4 3242 3243 aesd $dat0,q13 3244 aesimc $dat0,$dat0 3245 aesd $dat1,q13 3246 aesimc $dat1,$dat1 3247 aesd $dat2,q13 3248 aesimc $dat2,$dat2 3249 aesd $dat3,q13 3250 aesimc $dat3,$dat3 3251 aesd $dat4,q13 3252 aesimc $dat4,$dat4 3253 3254 aesd $dat0,q14 3255 aesimc $dat0,$dat0 3256 aesd $dat1,q14 3257 aesimc $dat1,$dat1 3258 aesd $dat2,q14 3259 aesimc $dat2,$dat2 3260 aesd $dat3,q14 3261 aesimc $dat3,$dat3 3262 aesd $dat4,q14 3263 aesimc $dat4,$dat4 3264 3265 veor $tmp0,$rndlast,$iv0 3266 aesd $dat0,q15 3267 // The iv for first block of next iteration. 3268 extr $midnumx,$ivh,$ivh,#32 3269 extr $ivh,$ivh,$ivl,#63 3270 and $tmpmw,$constnum,$midnum,asr #31 3271 eor $ivl,$tmpmx,$ivl,lsl #1 3272 fmov $ivd00,$ivl 3273 fmov $ivd01,$ivh 3274 veor $tmp1,$rndlast,$iv1 3275 vld1.8 {$in0},[$inp],#16 3276 aesd $dat1,q15 3277 // The iv for second block 3278 extr $midnumx,$ivh,$ivh,#32 3279 extr $ivh,$ivh,$ivl,#63 3280 and $tmpmw,$constnum,$midnum,asr #31 3281 eor $ivl,$tmpmx,$ivl,lsl #1 3282 fmov $ivd10,$ivl 3283 fmov $ivd11,$ivh 3284 veor $tmp2,$rndlast,$iv2 3285 vld1.8 {$in1},[$inp],#16 3286 aesd $dat2,q15 3287 // The iv for third block 3288 extr $midnumx,$ivh,$ivh,#32 3289 extr $ivh,$ivh,$ivl,#63 3290 and $tmpmw,$constnum,$midnum,asr #31 3291 eor $ivl,$tmpmx,$ivl,lsl #1 3292 fmov $ivd20,$ivl 3293 fmov $ivd21,$ivh 3294 veor $tmp3,$rndlast,$iv3 3295 vld1.8 {$in2},[$inp],#16 3296 aesd $dat3,q15 3297 // The iv for fourth block 3298 extr $midnumx,$ivh,$ivh,#32 3299 extr $ivh,$ivh,$ivl,#63 3300 and $tmpmw,$constnum,$midnum,asr #31 3301 eor $ivl,$tmpmx,$ivl,lsl #1 3302 fmov $ivd30,$ivl 3303 fmov $ivd31,$ivh 3304 veor $tmp4,$rndlast,$iv4 3305 vld1.8 {$in3},[$inp],#16 3306 aesd $dat4,q15 3307 3308 // The iv for fifth block 3309 extr $midnumx,$ivh,$ivh,#32 3310 extr $ivh,$ivh,$ivl,#63 3311 and $tmpmw,$constnum,$midnum,asr #31 3312 eor $ivl,$tmpmx,$ivl,lsl #1 3313 fmov $ivd40,$ivl 3314 fmov $ivd41,$ivh 3315 3316 vld1.8 {$in4},[$inp],#16 3317 cbz $xoffset,.Lxts_dec_tail4x 3318 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 3319 veor $tmp0,$tmp0,$dat0 3320 veor $dat0,$in0,$iv0 3321 veor $tmp1,$tmp1,$dat1 3322 veor $dat1,$in1,$iv1 3323 veor $tmp2,$tmp2,$dat2 3324 veor $dat2,$in2,$iv2 3325 veor $tmp3,$tmp3,$dat3 3326 veor $dat3,$in3,$iv3 3327 veor $tmp4,$tmp4,$dat4 3328 vst1.8 {$tmp0},[$out],#16 3329 veor $dat4,$in4,$iv4 3330 vst1.8 {$tmp1},[$out],#16 3331 mov $rounds,$rounds0 3332 vst1.8 {$tmp2},[$out],#16 3333 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 3334 vst1.8 {$tmp3},[$out],#16 3335 vst1.8 {$tmp4},[$out],#16 3336 b.hs .Loop5x_xts_dec 3337 3338 cmn $len,#0x10 3339 b.ne .Loop5x_dec_after 3340 // If x2($len) equal to -0x10, the left blocks is 4. 3341 // After specially processing, utilize the five blocks processing again. 3342 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 3343 vorr $iv4,$iv3,$iv3 3344 vorr $iv3,$iv2,$iv2 3345 vorr $iv2,$iv1,$iv1 3346 vorr $iv1,$iv0,$iv0 3347 fmov $ivl,$ivd40 3348 fmov $ivh,$ivd41 3349 veor $dat0,$iv0,$in0 3350 veor $dat1,$iv1,$in1 3351 veor $dat2,$in2,$iv2 3352 veor $dat3,$in3,$iv3 3353 veor $dat4,$in4,$iv4 3354 b.eq .Loop5x_xts_dec 3355 3356.Loop5x_dec_after: 3357 add $len,$len,#0x50 3358 cbz $len,.Lxts_done 3359 3360 add $rounds,$rounds0,#2 3361 subs $len,$len,#0x30 3362 b.lo .Lxts_inner_dec_tail 3363 3364 veor $dat0,$iv0,$in2 3365 veor $dat1,$iv1,$in3 3366 veor $dat2,$in4,$iv2 3367 b .Lxts_outer_dec_tail 3368 3369.align 4 3370.Lxts_dec_tail4x: 3371 add $inp,$inp,#16 3372 tst $tailcnt,#0xf 3373 veor $tmp1,$dat1,$tmp0 3374 vst1.8 {$tmp1},[$out],#16 3375 veor $tmp2,$dat2,$tmp2 3376 vst1.8 {$tmp2},[$out],#16 3377 veor $tmp3,$dat3,$tmp3 3378 veor $tmp4,$dat4,$tmp4 3379 vst1.8 {$tmp3-$tmp4},[$out],#32 3380 3381 b.eq .Lxts_dec_abort 3382 vld1.8 {$dat0},[$inp],#16 3383 b .Lxts_done 3384.align 4 3385.Lxts_outer_dec_tail: 3386 aesd $dat0,q8 3387 aesimc $dat0,$dat0 3388 aesd $dat1,q8 3389 aesimc $dat1,$dat1 3390 aesd $dat2,q8 3391 aesimc $dat2,$dat2 3392 vld1.32 {q8},[$key_],#16 3393 subs $rounds,$rounds,#2 3394 aesd $dat0,q9 3395 aesimc $dat0,$dat0 3396 aesd $dat1,q9 3397 aesimc $dat1,$dat1 3398 aesd $dat2,q9 3399 aesimc $dat2,$dat2 3400 vld1.32 {q9},[$key_],#16 3401 b.gt .Lxts_outer_dec_tail 3402 3403 aesd $dat0,q8 3404 aesimc $dat0,$dat0 3405 aesd $dat1,q8 3406 aesimc $dat1,$dat1 3407 aesd $dat2,q8 3408 aesimc $dat2,$dat2 3409 veor $tmp0,$iv0,$rndlast 3410 subs $len,$len,#0x30 3411 // The iv for first block 3412 fmov $ivl,$ivd20 3413 fmov $ivh,$ivd21 3414 mov $constnum,#0x87 3415 extr $midnumx,$ivh,$ivh,#32 3416 extr $ivh,$ivh,$ivl,#63 3417 and $tmpmw,$constnum,$midnum,asr #31 3418 eor $ivl,$tmpmx,$ivl,lsl #1 3419 fmov $ivd00,$ivl 3420 fmov $ivd01,$ivh 3421 veor $tmp1,$iv1,$rndlast 3422 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point 3423 aesd $dat0,q9 3424 aesimc $dat0,$dat0 3425 aesd $dat1,q9 3426 aesimc $dat1,$dat1 3427 aesd $dat2,q9 3428 aesimc $dat2,$dat2 3429 veor $tmp2,$iv2,$rndlast 3430 // The iv for second block 3431 extr $midnumx,$ivh,$ivh,#32 3432 extr $ivh,$ivh,$ivl,#63 3433 and $tmpmw,$constnum,$midnum,asr #31 3434 eor $ivl,$tmpmx,$ivl,lsl #1 3435 fmov $ivd10,$ivl 3436 fmov $ivd11,$ivh 3437 3438 add $xoffset,$xoffset,#0x20 3439 add $inp,$inp,$xoffset // $inp is adjusted to the last data 3440 3441 mov $key_,$key1 3442 3443 // The iv for third block 3444 extr $midnumx,$ivh,$ivh,#32 3445 extr $ivh,$ivh,$ivl,#63 3446 and $tmpmw,$constnum,$midnum,asr #31 3447 eor $ivl,$tmpmx,$ivl,lsl #1 3448 fmov $ivd20,$ivl 3449 fmov $ivd21,$ivh 3450 3451 aesd $dat0,q12 3452 aesimc $dat0,$dat0 3453 aesd $dat1,q12 3454 aesimc $dat1,$dat1 3455 aesd $dat2,q12 3456 aesimc $dat2,$dat2 3457 aesd $dat0,q13 3458 aesimc $dat0,$dat0 3459 aesd $dat1,q13 3460 aesimc $dat1,$dat1 3461 aesd $dat2,q13 3462 aesimc $dat2,$dat2 3463 aesd $dat0,q14 3464 aesimc $dat0,$dat0 3465 aesd $dat1,q14 3466 aesimc $dat1,$dat1 3467 aesd $dat2,q14 3468 aesimc $dat2,$dat2 3469 vld1.8 {$in2},[$inp],#16 3470 aesd $dat0,q15 3471 aesd $dat1,q15 3472 aesd $dat2,q15 3473 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 3474 add $rounds,$rounds0,#2 3475 veor $tmp0,$tmp0,$dat0 3476 veor $tmp1,$tmp1,$dat1 3477 veor $dat2,$dat2,$tmp2 3478 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 3479 vst1.8 {$tmp0},[$out],#16 3480 vst1.8 {$tmp1},[$out],#16 3481 vst1.8 {$dat2},[$out],#16 3482 3483 cmn $len,#0x30 3484 add $len,$len,#0x30 3485 b.eq .Lxts_done 3486 sub $len,$len,#0x30 3487 vorr $in3,$in1,$in1 3488 vorr $in4,$in2,$in2 3489 nop 3490 3491.Lxts_inner_dec_tail: 3492 // $len == -0x10 means two blocks left. 3493 cmn $len,#0x10 3494 veor $dat1,$in3,$iv0 3495 veor $dat2,$in4,$iv1 3496 b.eq .Lxts_dec_tail_loop 3497 veor $dat2,$in4,$iv0 3498.Lxts_dec_tail_loop: 3499 aesd $dat1,q8 3500 aesimc $dat1,$dat1 3501 aesd $dat2,q8 3502 aesimc $dat2,$dat2 3503 vld1.32 {q8},[$key_],#16 3504 subs $rounds,$rounds,#2 3505 aesd $dat1,q9 3506 aesimc $dat1,$dat1 3507 aesd $dat2,q9 3508 aesimc $dat2,$dat2 3509 vld1.32 {q9},[$key_],#16 3510 b.gt .Lxts_dec_tail_loop 3511 3512 aesd $dat1,q8 3513 aesimc $dat1,$dat1 3514 aesd $dat2,q8 3515 aesimc $dat2,$dat2 3516 aesd $dat1,q9 3517 aesimc $dat1,$dat1 3518 aesd $dat2,q9 3519 aesimc $dat2,$dat2 3520 aesd $dat1,q12 3521 aesimc $dat1,$dat1 3522 aesd $dat2,q12 3523 aesimc $dat2,$dat2 3524 cmn $len,#0x20 3525 aesd $dat1,q13 3526 aesimc $dat1,$dat1 3527 aesd $dat2,q13 3528 aesimc $dat2,$dat2 3529 veor $tmp1,$iv0,$rndlast 3530 aesd $dat1,q14 3531 aesimc $dat1,$dat1 3532 aesd $dat2,q14 3533 aesimc $dat2,$dat2 3534 veor $tmp2,$iv1,$rndlast 3535 aesd $dat1,q15 3536 aesd $dat2,q15 3537 b.eq .Lxts_dec_one 3538 veor $tmp1,$tmp1,$dat1 3539 veor $tmp2,$tmp2,$dat2 3540 vorr $iv0,$iv2,$iv2 3541 vorr $iv1,$iv3,$iv3 3542 vst1.8 {$tmp1},[$out],#16 3543 vst1.8 {$tmp2},[$out],#16 3544 add $len,$len,#16 3545 b .Lxts_done 3546 3547.Lxts_dec_one: 3548 veor $tmp1,$tmp1,$dat2 3549 vorr $iv0,$iv1,$iv1 3550 vorr $iv1,$iv2,$iv2 3551 vst1.8 {$tmp1},[$out],#16 3552 add $len,$len,#32 3553 3554.Lxts_done: 3555 tst $tailcnt,#0xf 3556 b.eq .Lxts_dec_abort 3557 // Processing the last two blocks with cipher stealing. 3558 mov x7,x3 3559 cbnz x2,.Lxts_dec_1st_done 3560 vld1.8 {$dat0},[$inp],#16 3561 3562 // Decrypt the last secod block to get the last plain text block 3563.Lxts_dec_1st_done: 3564 eor $tmpin,$dat0,$iv1 3565 ldr $rounds,[$key1,#240] 3566 vld1.32 {$dat0},[$key1],#16 3567 sub $rounds,$rounds,#2 3568 vld1.32 {$dat1},[$key1],#16 3569.Loop_final_2nd_dec: 3570 aesd $tmpin,$dat0 3571 aesimc $tmpin,$tmpin 3572 vld1.32 {$dat0},[$key1],#16 // load key schedule... 3573 subs $rounds,$rounds,#2 3574 aesd $tmpin,$dat1 3575 aesimc $tmpin,$tmpin 3576 vld1.32 {$dat1},[$key1],#16 // load key schedule... 3577 b.gt .Loop_final_2nd_dec 3578 3579 aesd $tmpin,$dat0 3580 aesimc $tmpin,$tmpin 3581 vld1.32 {$dat0},[$key1] 3582 aesd $tmpin,$dat1 3583 veor $tmpin,$tmpin,$dat0 3584 veor $tmpin,$tmpin,$iv1 3585 vst1.8 {$tmpin},[$out] 3586 3587 mov $tmpinp,$inp 3588 add $tmpoutp,$out,#16 3589 3590 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks 3591 // to get the last encrypted block. 3592.composite_dec_loop: 3593 subs $tailcnt,$tailcnt,#1 3594 ldrb $l2outp,[$out,$tailcnt] 3595 ldrb $loutp,[$tmpinp,$tailcnt] 3596 strb $l2outp,[$tmpoutp,$tailcnt] 3597 strb $loutp,[$out,$tailcnt] 3598 b.gt .composite_dec_loop 3599.Lxts_dec_load_done: 3600 vld1.8 {$tmpin},[$out] 3601 veor $tmpin,$tmpin,$iv0 3602 3603 // Decrypt the composite block to get the last second plain text block 3604 ldr $rounds,[$key_,#240] 3605 vld1.32 {$dat},[$key_],#16 3606 sub $rounds,$rounds,#2 3607 vld1.32 {$dat1},[$key_],#16 3608.Loop_final_dec: 3609 aesd $tmpin,$dat0 3610 aesimc $tmpin,$tmpin 3611 vld1.32 {$dat0},[$key_],#16 // load key schedule... 3612 subs $rounds,$rounds,#2 3613 aesd $tmpin,$dat1 3614 aesimc $tmpin,$tmpin 3615 vld1.32 {$dat1},[$key_],#16 // load key schedule... 3616 b.gt .Loop_final_dec 3617 3618 aesd $tmpin,$dat0 3619 aesimc $tmpin,$tmpin 3620 vld1.32 {$dat0},[$key_] 3621 aesd $tmpin,$dat1 3622 veor $tmpin,$tmpin,$dat0 3623 veor $tmpin,$tmpin,$iv0 3624 vst1.8 {$tmpin},[$out] 3625 3626.Lxts_dec_abort: 3627 ldp $tailcnt,$midnumx,[sp,#48] 3628 ldp $ivd10,$ivd20,[sp,#32] 3629 ldp $ivd30,$ivd40,[sp,#16] 3630 ldp $constnumx,$tmpinp,[sp],#64 3631 3632.Lxts_dec_final_abort: 3633 ret 3634.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt 3635___ 3636} 3637}}} 3638$code.=<<___; 3639#endif 3640___ 3641######################################## 3642if ($flavour =~ /64/) { ######## 64-bit code 3643 my %opcode = ( 3644 "aesd" => 0x4e285800, "aese" => 0x4e284800, 3645 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); 3646 3647 local *unaes = sub { 3648 my ($mnemonic,$arg)=@_; 3649 3650 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 3651 sprintf ".inst\t0x%08x\t//%s %s", 3652 $opcode{$mnemonic}|$1|($2<<5), 3653 $mnemonic,$arg; 3654 }; 3655 3656 foreach(split("\n",$code)) { 3657 s/\`([^\`]*)\`/eval($1)/geo; 3658 3659 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 3660 s/@\s/\/\//o; # old->new style commentary 3661 3662 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 3663 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 3664 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 3665 s/vmov\.i8/movi/o or # fix up legacy mnemonics 3666 s/vext\.8/ext/o or 3667 s/vrev32\.8/rev32/o or 3668 s/vtst\.8/cmtst/o or 3669 s/vshr/ushr/o or 3670 s/^(\s+)v/$1/o or # strip off v prefix 3671 s/\bbx\s+lr\b/ret/o; 3672 3673 # fix up remaining legacy suffixes 3674 s/\.[ui]?8//o; 3675 m/\],#8/o and s/\.16b/\.8b/go; 3676 s/\.[ui]?32//o and s/\.16b/\.4s/go; 3677 s/\.[ui]?64//o and s/\.16b/\.2d/go; 3678 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 3679 3680 print $_,"\n"; 3681 } 3682} else { ######## 32-bit code 3683 my %opcode = ( 3684 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 3685 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 3686 3687 local *unaes = sub { 3688 my ($mnemonic,$arg)=@_; 3689 3690 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 3691 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 3692 |(($2&7)<<1) |(($2&8)<<2); 3693 # since ARMv7 instructions are always encoded little-endian. 3694 # correct solution is to use .inst directive, but older 3695 # assemblers don't implement it:-( 3696 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", 3697 $word&0xff,($word>>8)&0xff, 3698 ($word>>16)&0xff,($word>>24)&0xff, 3699 $mnemonic,$arg; 3700 } 3701 }; 3702 3703 sub unvtbl { 3704 my $arg=shift; 3705 3706 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 3707 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 3708 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 3709 } 3710 3711 sub unvdup32 { 3712 my $arg=shift; 3713 3714 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 3715 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 3716 } 3717 3718 sub unvmov32 { 3719 my $arg=shift; 3720 3721 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 3722 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 3723 } 3724 3725 foreach(split("\n",$code)) { 3726 s/\`([^\`]*)\`/eval($1)/geo; 3727 3728 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 3729 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 3730 s/\/\/\s?/@ /o; # new->old style commentary 3731 3732 # fix up remaining new-style suffixes 3733 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 3734 s/\],#[0-9]+/]!/o; 3735 3736 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 3737 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or 3738 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 3739 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 3740 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 3741 s/^(\s+)b\./$1b/o or 3742 s/^(\s+)ret/$1bx\tlr/o; 3743 3744 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { 3745 print " it $2\n"; 3746 } 3747 3748 print $_,"\n"; 3749 } 3750} 3751 3752close STDOUT or die "error closing STDOUT: $!"; 3753