// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

// ====================================================================
// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.
//
// Performance in cycles per processed byte and improvement coefficient
// over code generated with "default" compiler:
//
//              SHA256-hw   SHA256(*)       SHA512
// Apple A7     1.97        10.5 (+33%)     6.73 (-1%(**))
// Cortex-A53   2.38        15.5 (+115%)    10.0 (+150%(***))
// Cortex-A57   2.31        11.6 (+86%)     7.51 (+260%(***))
// Denver       2.01        10.5 (+26%)     6.70 (+8%)
// X-Gene                   20.0 (+100%)    12.8 (+300%(***))
// Mongoose     2.36        13.0 (+50%)     8.36 (+33%)
// Kryo         1.92        17.4 (+30%)     11.2 (+8%)
// ThunderX2    2.54        13.2 (+40%)     8.40 (+18%)
//
// (*)   Software SHA256 results are of lesser relevance, presented
//       mostly for informational purposes.
// (**)  The result is a trade-off: it's possible to improve it by
//       10% (or by 1 cycle per round), but at the cost of 20% loss
//       on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
//       an indication of some compiler "pathology", most notably code
//       generated with -mgeneral-regs-only is significantly faster
//       and the gap is only 40-90%.
//
// October 2016.
//
// Originally it was reckoned that it makes no sense to implement NEON
// version of SHA256 for 64-bit processors. This is because performance
// improvement on most wide-spread Cortex-A5x processors was observed
// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
// observed that 32-bit NEON SHA256 performs significantly better than
// 64-bit scalar version on *some* of the more recent processors. As
// a result 64-bit NEON version of SHA256 was added to provide best
// all-round performance. For example it executes ~30% faster on X-Gene
// and Mongoose. [For reference, NEON version of SHA512 is bound to
// deliver much less improvement, likely *negative* on Cortex-A5x.
// Which is why NEON support is limited to SHA256.]

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#ifndef __KERNEL__
# include "arm_arch.h"

.hidden	OPENSSL_armcap_P
#endif

.text

//----------------------------------------------------------------------
// sha512_block_data_order
//
// Scalar (general-purpose register) SHA-512 block transform.
//
// Inputs, as consumed by the code below:
//   x0 = pointer to the eight 64-bit hash state words; they are read
//        at entry and written back after every block
//   x1 = pointer to the input data
//   x2 = number of 128-byte blocks (end of input = x1 + (x2 << 7))
//
// Unless __KERNEL__ is defined, the entry first tests OPENSSL_armcap_P
// for the ARMV8_SHA512 bit (defined outside this file) and, when it is
// set, branches to .Lv8_entry instead of running the scalar code.
//
// Register roles in the scalar path:
//   x20-x27        working variables a..h; the roles rotate by one
//                  register per round rather than moving the values
//   x3-x15,x0-x2   the sixteen message-schedule words X[0..15]
//   x19,x28        alternate between the next K constant, a Ch()
//                  temporary and the a^b term carried into the next
//                  round's Maj()
//   x16,x17        per-round temporaries
//   x30            running pointer into .LK512
//
// Stack layout (x29 = base of the 128-byte frame):
//   [x29,#0]       saved x29,x30
//   [x29,#16..95]  saved x19-x28
//   [x29,#96]      state pointer (x0 is reused as a schedule word)
//   [x29,#104]     end-of-input pointer
//   [x29,#112]     input pointer, saved after the first 16 bytes of the
//                  current block have been read
//   [sp,#0..31]    four spill slots for schedule words whose registers
//                  are borrowed as temporaries
//
// The block loop is bottom-tested with an equality compare, so a block
// count of zero is not handled here.
//----------------------------------------------------------------------
.globl	sha512_block_data_order
.type	sha512_block_data_order,%function
.align	6
sha512_block_data_order:
#ifndef __KERNEL__
	adrp	x16,OPENSSL_armcap_P
	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
	tst	w16,#ARMV8_SHA512
	b.ne	.Lv8_entry
#endif
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*8		// four 8-byte spill slots for X[] words

	ldp	x20,x21,[x0]		// load context
	ldp	x22,x23,[x0,#2*8]
	ldp	x24,x25,[x0,#4*8]
	add	x2,x1,x2,lsl#7		// end of input
	ldp	x26,x27,[x0,#6*8]
	adr	x30,.LK512
	stp	x0,x2,[x29,#96]		// x0 and x2 are reused as X[13] and X[15] below

// Rounds 0..15: the message words are loaded straight from the input
// (byte-reversed unless __AARCH64EB__ is defined) and consumed as X[i].
// The number in the comment after each "rev" is the round index.
.Loop:
	ldp	x3,x4,[x1],#2*8
	ldr	x19,[x30],#8		// *K++
	eor	x28,x21,x22		// magic seed
	str	x1,[x29,#112]
#ifndef __AARCH64EB__
	rev	x3,x3		// 0
#endif
	ror	x16,x24,#14
	add	x27,x27,x19		// h+=K[i]
	eor	x6,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x3		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x20,x21		// a^b, b^c in next round
	eor	x16,x16,x6,ror#18		// Sigma1(e)
	ror	x6,x20,#28
	add	x27,x27,x17		// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x23,x23,x27		// d+=h
	eor	x28,x28,x21		// Maj(a,b,c)
	eor	x17,x6,x17,ror#34		// Sigma0(a)
	add	x27,x27,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x27,x27,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x4,x4		// 1
#endif
	ldp	x5,x6,[x1],#2*8
	add	x27,x27,x17		// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28		// h+=K[i]
	eor	x7,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x4		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x27,x20		// a^b, b^c in next round
	eor	x16,x16,x7,ror#18		// Sigma1(e)
	ror	x7,x27,#28
	add	x26,x26,x17		// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x22,x22,x26		// d+=h
	eor	x19,x19,x20		// Maj(a,b,c)
	eor	x17,x7,x17,ror#34		// Sigma0(a)
	add	x26,x26,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x26,x26,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x5,x5		// 2
#endif
	add	x26,x26,x17		// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19		// h+=K[i]
	eor	x8,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x5		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x26,x27		// a^b, b^c in next round
	eor	x16,x16,x8,ror#18		// Sigma1(e)
	ror	x8,x26,#28
	add	x25,x25,x17		// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x21,x21,x25		// d+=h
	eor	x28,x28,x27		// Maj(a,b,c)
	eor	x17,x8,x17,ror#34		// Sigma0(a)
	add	x25,x25,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x25,x25,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x6,x6		// 3
#endif
	ldp	x7,x8,[x1],#2*8
	add	x25,x25,x17		// h+=Sigma0(a)
	ror	x16,x21,#14
	add	x24,x24,x28		// h+=K[i]
	eor	x9,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x6		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x25,x26		// a^b, b^c in next round
	eor	x16,x16,x9,ror#18		// Sigma1(e)
	ror	x9,x25,#28
	add	x24,x24,x17		// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x20,x20,x24		// d+=h
	eor	x19,x19,x26		// Maj(a,b,c)
	eor	x17,x9,x17,ror#34		// Sigma0(a)
	add	x24,x24,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x24,x24,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x7,x7		// 4
#endif
	add	x24,x24,x17		// h+=Sigma0(a)
	ror	x16,x20,#14
	add	x23,x23,x19		// h+=K[i]
	eor	x10,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x7		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x24,x25		// a^b, b^c in next round
	eor	x16,x16,x10,ror#18		// Sigma1(e)
	ror	x10,x24,#28
	add	x23,x23,x17		// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x27,x27,x23		// d+=h
	eor	x28,x28,x25		// Maj(a,b,c)
	eor	x17,x10,x17,ror#34		// Sigma0(a)
	add	x23,x23,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x23,x23,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x8,x8		// 5
#endif
	ldp	x9,x10,[x1],#2*8
	add	x23,x23,x17		// h+=Sigma0(a)
	ror	x16,x27,#14
	add	x22,x22,x28		// h+=K[i]
	eor	x11,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x8		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x23,x24		// a^b, b^c in next round
	eor	x16,x16,x11,ror#18		// Sigma1(e)
	ror	x11,x23,#28
	add	x22,x22,x17		// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x26,x26,x22		// d+=h
	eor	x19,x19,x24		// Maj(a,b,c)
	eor	x17,x11,x17,ror#34		// Sigma0(a)
	add	x22,x22,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x22,x22,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x9,x9		// 6
#endif
	add	x22,x22,x17		// h+=Sigma0(a)
	ror	x16,x26,#14
	add	x21,x21,x19		// h+=K[i]
	eor	x12,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x9		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x22,x23		// a^b, b^c in next round
	eor	x16,x16,x12,ror#18		// Sigma1(e)
	ror	x12,x22,#28
	add	x21,x21,x17		// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x25,x25,x21		// d+=h
	eor	x28,x28,x23		// Maj(a,b,c)
	eor	x17,x12,x17,ror#34		// Sigma0(a)
	add	x21,x21,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x21,x21,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x10,x10		// 7
#endif
	ldp	x11,x12,[x1],#2*8
	add	x21,x21,x17		// h+=Sigma0(a)
	ror	x16,x25,#14
	add	x20,x20,x28		// h+=K[i]
	eor	x13,x25,x25,ror#23
	and	x17,x26,x25
	bic	x28,x27,x25
	add	x20,x20,x10		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x21,x22		// a^b, b^c in next round
	eor	x16,x16,x13,ror#18		// Sigma1(e)
	ror	x13,x21,#28
	add	x20,x20,x17		// h+=Ch(e,f,g)
	eor	x17,x21,x21,ror#5
	add	x20,x20,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x24,x24,x20		// d+=h
	eor	x19,x19,x22		// Maj(a,b,c)
	eor	x17,x13,x17,ror#34		// Sigma0(a)
	add	x20,x20,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x20,x20,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x11,x11		// 8
#endif
	add	x20,x20,x17		// h+=Sigma0(a)
	ror	x16,x24,#14
	add	x27,x27,x19		// h+=K[i]
	eor	x14,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x11		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x20,x21		// a^b, b^c in next round
	eor	x16,x16,x14,ror#18		// Sigma1(e)
	ror	x14,x20,#28
	add	x27,x27,x17		// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x23,x23,x27		// d+=h
	eor	x28,x28,x21		// Maj(a,b,c)
	eor	x17,x14,x17,ror#34		// Sigma0(a)
	add	x27,x27,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x27,x27,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x12,x12		// 9
#endif
	ldp	x13,x14,[x1],#2*8
	add	x27,x27,x17		// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28		// h+=K[i]
	eor	x15,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x12		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x27,x20		// a^b, b^c in next round
	eor	x16,x16,x15,ror#18		// Sigma1(e)
	ror	x15,x27,#28
	add	x26,x26,x17		// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x22,x22,x26		// d+=h
	eor	x19,x19,x20		// Maj(a,b,c)
	eor	x17,x15,x17,ror#34		// Sigma0(a)
	add	x26,x26,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x26,x26,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x13,x13		// 10
#endif
	add	x26,x26,x17		// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19		// h+=K[i]
	eor	x0,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x13		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x26,x27		// a^b, b^c in next round
	eor	x16,x16,x0,ror#18		// Sigma1(e)
	ror	x0,x26,#28
	add	x25,x25,x17		// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x21,x21,x25		// d+=h
	eor	x28,x28,x27		// Maj(a,b,c)
	eor	x17,x0,x17,ror#34		// Sigma0(a)
	add	x25,x25,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x25,x25,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x14,x14		// 11
#endif
	ldp	x15,x0,[x1],#2*8
	add	x25,x25,x17		// h+=Sigma0(a)
	str	x6,[sp,#24]		// spill X[3]; x6 is used as a temporary below
	ror	x16,x21,#14
	add	x24,x24,x28		// h+=K[i]
	eor	x6,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x14		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x25,x26		// a^b, b^c in next round
	eor	x16,x16,x6,ror#18		// Sigma1(e)
	ror	x6,x25,#28
	add	x24,x24,x17		// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x20,x20,x24		// d+=h
	eor	x19,x19,x26		// Maj(a,b,c)
	eor	x17,x6,x17,ror#34		// Sigma0(a)
	add	x24,x24,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x24,x24,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x15,x15		// 12
#endif
	add	x24,x24,x17		// h+=Sigma0(a)
	str	x7,[sp,#0]		// spill X[4]
	ror	x16,x20,#14
	add	x23,x23,x19		// h+=K[i]
	eor	x7,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x15		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x24,x25		// a^b, b^c in next round
	eor	x16,x16,x7,ror#18		// Sigma1(e)
	ror	x7,x24,#28
	add	x23,x23,x17		// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x27,x27,x23		// d+=h
	eor	x28,x28,x25		// Maj(a,b,c)
	eor	x17,x7,x17,ror#34		// Sigma0(a)
	add	x23,x23,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x23,x23,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x0,x0		// 13
#endif
	ldp	x1,x2,[x1]		// last two words; x1/x2 now hold X[14]/X[15], not pointers
	add	x23,x23,x17		// h+=Sigma0(a)
	str	x8,[sp,#8]		// spill X[5]
	ror	x16,x27,#14
	add	x22,x22,x28		// h+=K[i]
	eor	x8,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x0		// h+=X[i]
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x23,x24		// a^b, b^c in next round
	eor	x16,x16,x8,ror#18		// Sigma1(e)
	ror	x8,x23,#28
	add	x22,x22,x17		// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16		// h+=Sigma1(e)
	and	x19,x19,x28		// (b^c)&=(a^b)
	add	x26,x26,x22		// d+=h
	eor	x19,x19,x24		// Maj(a,b,c)
	eor	x17,x8,x17,ror#34		// Sigma0(a)
	add	x22,x22,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x22,x22,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x1,x1		// 14
#endif
	ldr	x6,[sp,#24]		// reload X[3]
	add	x22,x22,x17		// h+=Sigma0(a)
	str	x9,[sp,#16]		// spill X[6]
	ror	x16,x26,#14
	add	x21,x21,x19		// h+=K[i]
	eor	x9,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x1		// h+=X[i]
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x22,x23		// a^b, b^c in next round
	eor	x16,x16,x9,ror#18		// Sigma1(e)
	ror	x9,x22,#28
	add	x21,x21,x17		// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16		// h+=Sigma1(e)
	and	x28,x28,x19		// (b^c)&=(a^b)
	add	x25,x25,x21		// d+=h
	eor	x28,x28,x23		// Maj(a,b,c)
	eor	x17,x9,x17,ror#34		// Sigma0(a)
	add	x21,x21,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x21,x21,x17		// h+=Sigma0(a)
#ifndef __AARCH64EB__
	rev	x2,x2		// 15
#endif
	// Round 15 also computes the first extended schedule word:
	// x3 = X[0] + X[9] + sigma0(X[1]) + sigma1(X[14]).
	ldr	x7,[sp,#0]		// reload X[4]
	add	x21,x21,x17		// h+=Sigma0(a)
	str	x10,[sp,#24]		// spill X[7]
	ror	x16,x25,#14
	add	x20,x20,x28		// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2		// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x21,x22		// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7		// sigma0(X[i+1])
	add	x20,x20,x16		// h+=Sigma1(e)
	eor	x19,x19,x22		// Maj(a,b,c)
	eor	x17,x10,x21,ror#39		// Sigma0(a)
	eor	x8,x8,x1,lsr#6		// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20		// d+=h
	add	x20,x20,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17		// h+=Sigma0(a)
	add	x3,x3,x8

// Remaining rounds, sixteen per pass.  Every round consumes the schedule
// word prepared by the previous round and prepares the next one in place:
//   X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])   (indices mod 16)
// with sigma0(x) = ror1 ^ ror8 ^ lsr7 and sigma1(x) = ror19 ^ ror61 ^ lsr6.
// Each round reloads one spilled word and spills another, because three
// schedule registers at a time are borrowed as temporaries.  The pass is
// repeated until the K value fetched last is the zero terminator of
// .LK512.
.Loop_16_xx:
	ldr	x8,[sp,#8]
	str	x11,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19		// h+=K[i]
	ror	x10,x5,#1
	and	x17,x25,x24
	ror	x9,x2,#19
	bic	x19,x26,x24
	ror	x11,x20,#28
	add	x27,x27,x3		// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x10,x10,x5,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x20,x21		// a^b, b^c in next round
	eor	x16,x16,x24,ror#41		// Sigma1(e)
	eor	x11,x11,x20,ror#34
	add	x27,x27,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x9,x9,x2,ror#61
	eor	x10,x10,x5,lsr#7		// sigma0(X[i+1])
	add	x27,x27,x16		// h+=Sigma1(e)
	eor	x28,x28,x21		// Maj(a,b,c)
	eor	x17,x11,x20,ror#39		// Sigma0(a)
	eor	x9,x9,x2,lsr#6		// sigma1(X[i+14])
	add	x4,x4,x13
	add	x23,x23,x27		// d+=h
	add	x27,x27,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x4,x4,x10
	add	x27,x27,x17		// h+=Sigma0(a)
	add	x4,x4,x9
	ldr	x9,[sp,#16]
	str	x12,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28		// h+=K[i]
	ror	x11,x6,#1
	and	x17,x24,x23
	ror	x10,x3,#19
	bic	x28,x25,x23
	ror	x12,x27,#28
	add	x26,x26,x4		// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x11,x11,x6,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x27,x20		// a^b, b^c in next round
	eor	x16,x16,x23,ror#41		// Sigma1(e)
	eor	x12,x12,x27,ror#34
	add	x26,x26,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x10,x10,x3,ror#61
	eor	x11,x11,x6,lsr#7		// sigma0(X[i+1])
	add	x26,x26,x16		// h+=Sigma1(e)
	eor	x19,x19,x20		// Maj(a,b,c)
	eor	x17,x12,x27,ror#39		// Sigma0(a)
	eor	x10,x10,x3,lsr#6		// sigma1(X[i+14])
	add	x5,x5,x14
	add	x22,x22,x26		// d+=h
	add	x26,x26,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x5,x5,x11
	add	x26,x26,x17		// h+=Sigma0(a)
	add	x5,x5,x10
	ldr	x10,[sp,#24]
	str	x13,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19		// h+=K[i]
	ror	x12,x7,#1
	and	x17,x23,x22
	ror	x11,x4,#19
	bic	x19,x24,x22
	ror	x13,x26,#28
	add	x25,x25,x5		// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x12,x12,x7,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x26,x27		// a^b, b^c in next round
	eor	x16,x16,x22,ror#41		// Sigma1(e)
	eor	x13,x13,x26,ror#34
	add	x25,x25,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x11,x11,x4,ror#61
	eor	x12,x12,x7,lsr#7		// sigma0(X[i+1])
	add	x25,x25,x16		// h+=Sigma1(e)
	eor	x28,x28,x27		// Maj(a,b,c)
	eor	x17,x13,x26,ror#39		// Sigma0(a)
	eor	x11,x11,x4,lsr#6		// sigma1(X[i+14])
	add	x6,x6,x15
	add	x21,x21,x25		// d+=h
	add	x25,x25,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x6,x6,x12
	add	x25,x25,x17		// h+=Sigma0(a)
	add	x6,x6,x11
	ldr	x11,[sp,#0]
	str	x14,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28		// h+=K[i]
	ror	x13,x8,#1
	and	x17,x22,x21
	ror	x12,x5,#19
	bic	x28,x23,x21
	ror	x14,x25,#28
	add	x24,x24,x6		// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x13,x13,x8,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x25,x26		// a^b, b^c in next round
	eor	x16,x16,x21,ror#41		// Sigma1(e)
	eor	x14,x14,x25,ror#34
	add	x24,x24,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x12,x12,x5,ror#61
	eor	x13,x13,x8,lsr#7		// sigma0(X[i+1])
	add	x24,x24,x16		// h+=Sigma1(e)
	eor	x19,x19,x26		// Maj(a,b,c)
	eor	x17,x14,x25,ror#39		// Sigma0(a)
	eor	x12,x12,x5,lsr#6		// sigma1(X[i+14])
	add	x7,x7,x0
	add	x20,x20,x24		// d+=h
	add	x24,x24,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x7,x7,x13
	add	x24,x24,x17		// h+=Sigma0(a)
	add	x7,x7,x12
	ldr	x12,[sp,#8]
	str	x15,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19		// h+=K[i]
	ror	x14,x9,#1
	and	x17,x21,x20
	ror	x13,x6,#19
	bic	x19,x22,x20
	ror	x15,x24,#28
	add	x23,x23,x7		// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x14,x14,x9,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x24,x25		// a^b, b^c in next round
	eor	x16,x16,x20,ror#41		// Sigma1(e)
	eor	x15,x15,x24,ror#34
	add	x23,x23,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x13,x13,x6,ror#61
	eor	x14,x14,x9,lsr#7		// sigma0(X[i+1])
	add	x23,x23,x16		// h+=Sigma1(e)
	eor	x28,x28,x25		// Maj(a,b,c)
	eor	x17,x15,x24,ror#39		// Sigma0(a)
	eor	x13,x13,x6,lsr#6		// sigma1(X[i+14])
	add	x8,x8,x1
	add	x27,x27,x23		// d+=h
	add	x23,x23,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x8,x8,x14
	add	x23,x23,x17		// h+=Sigma0(a)
	add	x8,x8,x13
	ldr	x13,[sp,#16]
	str	x0,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28		// h+=K[i]
	ror	x15,x10,#1
	and	x17,x20,x27
	ror	x14,x7,#19
	bic	x28,x21,x27
	ror	x0,x23,#28
	add	x22,x22,x8		// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x15,x15,x10,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x23,x24		// a^b, b^c in next round
	eor	x16,x16,x27,ror#41		// Sigma1(e)
	eor	x0,x0,x23,ror#34
	add	x22,x22,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x14,x14,x7,ror#61
	eor	x15,x15,x10,lsr#7		// sigma0(X[i+1])
	add	x22,x22,x16		// h+=Sigma1(e)
	eor	x19,x19,x24		// Maj(a,b,c)
	eor	x17,x0,x23,ror#39		// Sigma0(a)
	eor	x14,x14,x7,lsr#6		// sigma1(X[i+14])
	add	x9,x9,x2
	add	x26,x26,x22		// d+=h
	add	x22,x22,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x9,x9,x15
	add	x22,x22,x17		// h+=Sigma0(a)
	add	x9,x9,x14
	ldr	x14,[sp,#24]
	str	x1,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19		// h+=K[i]
	ror	x0,x11,#1
	and	x17,x27,x26
	ror	x15,x8,#19
	bic	x19,x20,x26
	ror	x1,x22,#28
	add	x21,x21,x9		// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x0,x0,x11,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x22,x23		// a^b, b^c in next round
	eor	x16,x16,x26,ror#41		// Sigma1(e)
	eor	x1,x1,x22,ror#34
	add	x21,x21,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x15,x15,x8,ror#61
	eor	x0,x0,x11,lsr#7		// sigma0(X[i+1])
	add	x21,x21,x16		// h+=Sigma1(e)
	eor	x28,x28,x23		// Maj(a,b,c)
	eor	x17,x1,x22,ror#39		// Sigma0(a)
	eor	x15,x15,x8,lsr#6		// sigma1(X[i+14])
	add	x10,x10,x3
	add	x25,x25,x21		// d+=h
	add	x21,x21,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x10,x10,x0
	add	x21,x21,x17		// h+=Sigma0(a)
	add	x10,x10,x15
	ldr	x15,[sp,#0]
	str	x2,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28		// h+=K[i]
	ror	x1,x12,#1
	and	x17,x26,x25
	ror	x0,x9,#19
	bic	x28,x27,x25
	ror	x2,x21,#28
	add	x20,x20,x10		// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x1,x1,x12,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x21,x22		// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x2,x2,x21,ror#34
	add	x20,x20,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x0,x0,x9,ror#61
	eor	x1,x1,x12,lsr#7		// sigma0(X[i+1])
	add	x20,x20,x16		// h+=Sigma1(e)
	eor	x19,x19,x22		// Maj(a,b,c)
	eor	x17,x2,x21,ror#39		// Sigma0(a)
	eor	x0,x0,x9,lsr#6		// sigma1(X[i+14])
	add	x11,x11,x4
	add	x24,x24,x20		// d+=h
	add	x20,x20,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x11,x11,x1
	add	x20,x20,x17		// h+=Sigma0(a)
	add	x11,x11,x0
	ldr	x0,[sp,#8]
	str	x3,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19		// h+=K[i]
	ror	x2,x13,#1
	and	x17,x25,x24
	ror	x1,x10,#19
	bic	x19,x26,x24
	ror	x3,x20,#28
	add	x27,x27,x11		// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x2,x2,x13,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x20,x21		// a^b, b^c in next round
	eor	x16,x16,x24,ror#41		// Sigma1(e)
	eor	x3,x3,x20,ror#34
	add	x27,x27,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x1,x1,x10,ror#61
	eor	x2,x2,x13,lsr#7		// sigma0(X[i+1])
	add	x27,x27,x16		// h+=Sigma1(e)
	eor	x28,x28,x21		// Maj(a,b,c)
	eor	x17,x3,x20,ror#39		// Sigma0(a)
	eor	x1,x1,x10,lsr#6		// sigma1(X[i+14])
	add	x12,x12,x5
	add	x23,x23,x27		// d+=h
	add	x27,x27,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x12,x12,x2
	add	x27,x27,x17		// h+=Sigma0(a)
	add	x12,x12,x1
	ldr	x1,[sp,#16]
	str	x4,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28		// h+=K[i]
	ror	x3,x14,#1
	and	x17,x24,x23
	ror	x2,x11,#19
	bic	x28,x25,x23
	ror	x4,x27,#28
	add	x26,x26,x12		// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x3,x3,x14,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x27,x20		// a^b, b^c in next round
	eor	x16,x16,x23,ror#41		// Sigma1(e)
	eor	x4,x4,x27,ror#34
	add	x26,x26,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x2,x2,x11,ror#61
	eor	x3,x3,x14,lsr#7		// sigma0(X[i+1])
	add	x26,x26,x16		// h+=Sigma1(e)
	eor	x19,x19,x20		// Maj(a,b,c)
	eor	x17,x4,x27,ror#39		// Sigma0(a)
	eor	x2,x2,x11,lsr#6		// sigma1(X[i+14])
	add	x13,x13,x6
	add	x22,x22,x26		// d+=h
	add	x26,x26,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x13,x13,x3
	add	x26,x26,x17		// h+=Sigma0(a)
	add	x13,x13,x2
	ldr	x2,[sp,#24]
	str	x5,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19		// h+=K[i]
	ror	x4,x15,#1
	and	x17,x23,x22
	ror	x3,x12,#19
	bic	x19,x24,x22
	ror	x5,x26,#28
	add	x25,x25,x13		// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x4,x4,x15,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x26,x27		// a^b, b^c in next round
	eor	x16,x16,x22,ror#41		// Sigma1(e)
	eor	x5,x5,x26,ror#34
	add	x25,x25,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x3,x3,x12,ror#61
	eor	x4,x4,x15,lsr#7		// sigma0(X[i+1])
	add	x25,x25,x16		// h+=Sigma1(e)
	eor	x28,x28,x27		// Maj(a,b,c)
	eor	x17,x5,x26,ror#39		// Sigma0(a)
	eor	x3,x3,x12,lsr#6		// sigma1(X[i+14])
	add	x14,x14,x7
	add	x21,x21,x25		// d+=h
	add	x25,x25,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x14,x14,x4
	add	x25,x25,x17		// h+=Sigma0(a)
	add	x14,x14,x3
	ldr	x3,[sp,#0]
	str	x6,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28		// h+=K[i]
	ror	x5,x0,#1
	and	x17,x22,x21
	ror	x4,x13,#19
	bic	x28,x23,x21
	ror	x6,x25,#28
	add	x24,x24,x14		// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x5,x5,x0,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x25,x26		// a^b, b^c in next round
	eor	x16,x16,x21,ror#41		// Sigma1(e)
	eor	x6,x6,x25,ror#34
	add	x24,x24,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x4,x4,x13,ror#61
	eor	x5,x5,x0,lsr#7		// sigma0(X[i+1])
	add	x24,x24,x16		// h+=Sigma1(e)
	eor	x19,x19,x26		// Maj(a,b,c)
	eor	x17,x6,x25,ror#39		// Sigma0(a)
	eor	x4,x4,x13,lsr#6		// sigma1(X[i+14])
	add	x15,x15,x8
	add	x20,x20,x24		// d+=h
	add	x24,x24,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x15,x15,x5
	add	x24,x24,x17		// h+=Sigma0(a)
	add	x15,x15,x4
	ldr	x4,[sp,#8]
	str	x7,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19		// h+=K[i]
	ror	x6,x1,#1
	and	x17,x21,x20
	ror	x5,x14,#19
	bic	x19,x22,x20
	ror	x7,x24,#28
	add	x23,x23,x15		// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x6,x6,x1,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x24,x25		// a^b, b^c in next round
	eor	x16,x16,x20,ror#41		// Sigma1(e)
	eor	x7,x7,x24,ror#34
	add	x23,x23,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x5,x5,x14,ror#61
	eor	x6,x6,x1,lsr#7		// sigma0(X[i+1])
	add	x23,x23,x16		// h+=Sigma1(e)
	eor	x28,x28,x25		// Maj(a,b,c)
	eor	x17,x7,x24,ror#39		// Sigma0(a)
	eor	x5,x5,x14,lsr#6		// sigma1(X[i+14])
	add	x0,x0,x9
	add	x27,x27,x23		// d+=h
	add	x23,x23,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x0,x0,x6
	add	x23,x23,x17		// h+=Sigma0(a)
	add	x0,x0,x5
	ldr	x5,[sp,#16]
	str	x8,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28		// h+=K[i]
	ror	x7,x2,#1
	and	x17,x20,x27
	ror	x6,x15,#19
	bic	x28,x21,x27
	ror	x8,x23,#28
	add	x22,x22,x0		// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x7,x7,x2,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x23,x24		// a^b, b^c in next round
	eor	x16,x16,x27,ror#41		// Sigma1(e)
	eor	x8,x8,x23,ror#34
	add	x22,x22,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x6,x6,x15,ror#61
	eor	x7,x7,x2,lsr#7		// sigma0(X[i+1])
	add	x22,x22,x16		// h+=Sigma1(e)
	eor	x19,x19,x24		// Maj(a,b,c)
	eor	x17,x8,x23,ror#39		// Sigma0(a)
	eor	x6,x6,x15,lsr#6		// sigma1(X[i+14])
	add	x1,x1,x10
	add	x26,x26,x22		// d+=h
	add	x22,x22,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x1,x1,x7
	add	x22,x22,x17		// h+=Sigma0(a)
	add	x1,x1,x6
	ldr	x6,[sp,#24]
	str	x9,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19		// h+=K[i]
	ror	x8,x3,#1
	and	x17,x27,x26
	ror	x7,x0,#19
	bic	x19,x20,x26
	ror	x9,x22,#28
	add	x21,x21,x1		// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x8,x8,x3,ror#8
	orr	x17,x17,x19		// Ch(e,f,g)
	eor	x19,x22,x23		// a^b, b^c in next round
	eor	x16,x16,x26,ror#41		// Sigma1(e)
	eor	x9,x9,x22,ror#34
	add	x21,x21,x17		// h+=Ch(e,f,g)
	and	x28,x28,x19		// (b^c)&=(a^b)
	eor	x7,x7,x0,ror#61
	eor	x8,x8,x3,lsr#7		// sigma0(X[i+1])
	add	x21,x21,x16		// h+=Sigma1(e)
	eor	x28,x28,x23		// Maj(a,b,c)
	eor	x17,x9,x22,ror#39		// Sigma0(a)
	eor	x7,x7,x0,lsr#6		// sigma1(X[i+14])
	add	x2,x2,x11
	add	x25,x25,x21		// d+=h
	add	x21,x21,x28		// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x2,x2,x8
	add	x21,x21,x17		// h+=Sigma0(a)
	add	x2,x2,x7
	ldr	x7,[sp,#0]
	str	x10,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28		// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2		// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28		// Ch(e,f,g)
	eor	x28,x21,x22		// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17		// h+=Ch(e,f,g)
	and	x19,x19,x28		// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7		// sigma0(X[i+1])
	add	x20,x20,x16		// h+=Sigma1(e)
	eor	x19,x19,x22		// Maj(a,b,c)
	eor	x17,x10,x21,ror#39		// Sigma0(a)
	eor	x8,x8,x1,lsr#6		// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20		// d+=h
	add	x20,x20,x19		// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17		// h+=Sigma0(a)
	add	x3,x3,x8
	cbnz	x19,.Loop_16_xx		// zero = terminator of .LK512: all rounds done

	// Block finished: recover the pointers that were displaced by schedule
	// words, rewind the K pointer and fold the working variables into the
	// stored state.
	ldp	x0,x2,[x29,#96]
	ldr	x1,[x29,#112]
	sub	x30,x30,#648		// rewind: 81 quads (80 K + terminator) were read

	ldp	x3,x4,[x0]
	ldp	x5,x6,[x0,#2*8]
	add	x1,x1,#14*8		// advance input pointer (saved value was already 2*8 in)
	ldp	x7,x8,[x0,#4*8]
	add	x20,x20,x3
	ldp	x9,x10,[x0,#6*8]
	add	x21,x21,x4
	add	x22,x22,x5
	add	x23,x23,x6
	stp	x20,x21,[x0]
	add	x24,x24,x7
	add	x25,x25,x8
	stp	x22,x23,[x0,#2*8]
	add	x26,x26,x9
	add	x27,x27,x10
	cmp	x1,x2
	stp	x24,x25,[x0,#4*8]
	stp	x26,x27,[x0,#6*8]
	b.ne	.Loop

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#4*8		// release the spill slots
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	sha512_block_data_order,.-sha512_block_data_order

// Table of the 80 64-bit round constants, followed by a zero quad.  The
// scalar loop above stops when it fetches the zero.  Both code paths in
// this file address the table PC-relatively with "adr".
.align	6
.type	.LK512,%object
.LK512:
.quad	0x428a2f98d728ae22,0x7137449123ef65cd
.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad	0x3956c25bf348b538,0x59f111f1b605d019
.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad	0xd807aa98a3030242,0x12835b0145706fbe
.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad	0x9bdc06a725c71235,0xc19bf174cf692694
.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad	0x983e5152ee66dfab,0xa831c66d2db43210
.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad	0x06ca6351e003826f,0x142929670a0e6e70
.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
.quad	0x81c2c92e47edaee6,0x92722c851482353b
.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
.quad	0xd192e819d6ef5218,0xd69906245565a910
.quad	0xf40e35855771202a,0x106aa07032bbd1b8
.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad	0x90befffa23631e28,0xa4506cebde82bde9
.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad	0xca273eceea26619c,0xd186b8c721c0c207
.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad	0x113f9804bef90dae,0x1b710b35131c471b
.quad	0x28db77f523047d84,0x32caab7b40c72493
.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad	0	// terminator
.size	.LK512,.-.LK512
// NUL-terminated ASCII:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#ifndef __KERNEL__
.type	sha512_block_armv8,%function
.align	6
sha512_block_armv8:
.Lv8_entry:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
	adr	x3,.LK512

	rev64	v16.16b,v16.16b
	rev64	v17.16b,v17.16b
	rev64	v18.16b,v18.16b
	rev64	v19.16b,v19.16b
	rev64	v20.16b,v20.16b
	rev64	v21.16b,v21.16b
	rev64	v22.16b,v22.16b
	rev64	v23.16b,v23.16b
	b	.Loop_hw

.align	4
.Loop_hw:
	ld1	{v24.2d},[x3],#16
	subs	x2,x2,#1
	sub	x4,x1,#128
	orr	v26.16b,v0.16b,v0.16b			// offload
	orr	v27.16b,v1.16b,v1.16b
	orr	v28.16b,v2.16b,v2.16b
	orr	v29.16b,v3.16b,v3.16b
	csel	x1,x1,x4,ne			// conditional rewind
	add	v24.2d,v24.2d,v16.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
	.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
	ext	v7.16b,v20.16b,v21.16b,#8
	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
	.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	add	v25.2d,v25.2d,v17.2d
	ld1	{v24.2d},[x3],#16
1136 ext v25.16b,v25.16b,v25.16b,#8 1137 ext v5.16b,v4.16b,v2.16b,#8 1138 ext v6.16b,v0.16b,v4.16b,#8 1139 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1140 .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1141 ext v7.16b,v21.16b,v22.16b,#8 1142 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1143 .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1144 add v1.2d,v0.2d,v2.2d // "D + T1" 1145 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1146 add v24.2d,v24.2d,v18.2d 1147 ld1 {v25.2d},[x3],#16 1148 ext v24.16b,v24.16b,v24.16b,#8 1149 ext v5.16b,v1.16b,v4.16b,#8 1150 ext v6.16b,v3.16b,v1.16b,#8 1151 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1152 .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1153 ext v7.16b,v22.16b,v23.16b,#8 1154 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1155 .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1156 add v0.2d,v3.2d,v4.2d // "D + T1" 1157 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1158 add v25.2d,v25.2d,v19.2d 1159 ld1 {v24.2d},[x3],#16 1160 ext v25.16b,v25.16b,v25.16b,#8 1161 ext v5.16b,v0.16b,v1.16b,#8 1162 ext v6.16b,v2.16b,v0.16b,#8 1163 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1164 .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1165 ext v7.16b,v23.16b,v16.16b,#8 1166 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1167 .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1168 add v3.2d,v2.2d,v1.2d // "D + T1" 1169 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1170 add v24.2d,v24.2d,v20.2d 1171 ld1 {v25.2d},[x3],#16 1172 ext v24.16b,v24.16b,v24.16b,#8 1173 ext v5.16b,v3.16b,v0.16b,#8 1174 ext v6.16b,v4.16b,v3.16b,#8 1175 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1176 .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1177 ext v7.16b,v16.16b,v17.16b,#8 1178 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1179 .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1180 add v2.2d,v4.2d,v0.2d // "D + T1" 1181 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1182 add v25.2d,v25.2d,v21.2d 1183 ld1 {v24.2d},[x3],#16 1184 ext 
v25.16b,v25.16b,v25.16b,#8 1185 ext v5.16b,v2.16b,v3.16b,#8 1186 ext v6.16b,v1.16b,v2.16b,#8 1187 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1188 .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1189 ext v7.16b,v17.16b,v18.16b,#8 1190 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1191 .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1192 add v4.2d,v1.2d,v3.2d // "D + T1" 1193 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1194 add v24.2d,v24.2d,v22.2d 1195 ld1 {v25.2d},[x3],#16 1196 ext v24.16b,v24.16b,v24.16b,#8 1197 ext v5.16b,v4.16b,v2.16b,#8 1198 ext v6.16b,v0.16b,v4.16b,#8 1199 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1200 .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1201 ext v7.16b,v18.16b,v19.16b,#8 1202 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1203 .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1204 add v1.2d,v0.2d,v2.2d // "D + T1" 1205 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1206 add v25.2d,v25.2d,v23.2d 1207 ld1 {v24.2d},[x3],#16 1208 ext v25.16b,v25.16b,v25.16b,#8 1209 ext v5.16b,v1.16b,v4.16b,#8 1210 ext v6.16b,v3.16b,v1.16b,#8 1211 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1212 .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1213 ext v7.16b,v19.16b,v20.16b,#8 1214 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1215 .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1216 add v0.2d,v3.2d,v4.2d // "D + T1" 1217 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1218 add v24.2d,v24.2d,v16.2d 1219 ld1 {v25.2d},[x3],#16 1220 ext v24.16b,v24.16b,v24.16b,#8 1221 ext v5.16b,v0.16b,v1.16b,#8 1222 ext v6.16b,v2.16b,v0.16b,#8 1223 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1224 .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1225 ext v7.16b,v20.16b,v21.16b,#8 1226 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1227 .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1228 add v3.2d,v2.2d,v1.2d // "D + T1" 1229 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1230 add v25.2d,v25.2d,v17.2d 1231 ld1 {v24.2d},[x3],#16 1232 ext 
v25.16b,v25.16b,v25.16b,#8 1233 ext v5.16b,v3.16b,v0.16b,#8 1234 ext v6.16b,v4.16b,v3.16b,#8 1235 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1236 .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1237 ext v7.16b,v21.16b,v22.16b,#8 1238 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1239 .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1240 add v2.2d,v4.2d,v0.2d // "D + T1" 1241 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1242 add v24.2d,v24.2d,v18.2d 1243 ld1 {v25.2d},[x3],#16 1244 ext v24.16b,v24.16b,v24.16b,#8 1245 ext v5.16b,v2.16b,v3.16b,#8 1246 ext v6.16b,v1.16b,v2.16b,#8 1247 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1248 .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1249 ext v7.16b,v22.16b,v23.16b,#8 1250 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1251 .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1252 add v4.2d,v1.2d,v3.2d // "D + T1" 1253 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1254 add v25.2d,v25.2d,v19.2d 1255 ld1 {v24.2d},[x3],#16 1256 ext v25.16b,v25.16b,v25.16b,#8 1257 ext v5.16b,v4.16b,v2.16b,#8 1258 ext v6.16b,v0.16b,v4.16b,#8 1259 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1260 .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1261 ext v7.16b,v23.16b,v16.16b,#8 1262 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1263 .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1264 add v1.2d,v0.2d,v2.2d // "D + T1" 1265 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1266 add v24.2d,v24.2d,v20.2d 1267 ld1 {v25.2d},[x3],#16 1268 ext v24.16b,v24.16b,v24.16b,#8 1269 ext v5.16b,v1.16b,v4.16b,#8 1270 ext v6.16b,v3.16b,v1.16b,#8 1271 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1272 .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1273 ext v7.16b,v16.16b,v17.16b,#8 1274 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1275 .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1276 add v0.2d,v3.2d,v4.2d // "D + T1" 1277 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1278 add v25.2d,v25.2d,v21.2d 1279 ld1 {v24.2d},[x3],#16 1280 ext 
v25.16b,v25.16b,v25.16b,#8 1281 ext v5.16b,v0.16b,v1.16b,#8 1282 ext v6.16b,v2.16b,v0.16b,#8 1283 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1284 .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1285 ext v7.16b,v17.16b,v18.16b,#8 1286 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1287 .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1288 add v3.2d,v2.2d,v1.2d // "D + T1" 1289 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1290 add v24.2d,v24.2d,v22.2d 1291 ld1 {v25.2d},[x3],#16 1292 ext v24.16b,v24.16b,v24.16b,#8 1293 ext v5.16b,v3.16b,v0.16b,#8 1294 ext v6.16b,v4.16b,v3.16b,#8 1295 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1296 .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1297 ext v7.16b,v18.16b,v19.16b,#8 1298 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1299 .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1300 add v2.2d,v4.2d,v0.2d // "D + T1" 1301 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1302 add v25.2d,v25.2d,v23.2d 1303 ld1 {v24.2d},[x3],#16 1304 ext v25.16b,v25.16b,v25.16b,#8 1305 ext v5.16b,v2.16b,v3.16b,#8 1306 ext v6.16b,v1.16b,v2.16b,#8 1307 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1308 .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1309 ext v7.16b,v19.16b,v20.16b,#8 1310 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1311 .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1312 add v4.2d,v1.2d,v3.2d // "D + T1" 1313 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1314 add v24.2d,v24.2d,v16.2d 1315 ld1 {v25.2d},[x3],#16 1316 ext v24.16b,v24.16b,v24.16b,#8 1317 ext v5.16b,v4.16b,v2.16b,#8 1318 ext v6.16b,v0.16b,v4.16b,#8 1319 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1320 .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1321 ext v7.16b,v20.16b,v21.16b,#8 1322 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1323 .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1324 add v1.2d,v0.2d,v2.2d // "D + T1" 1325 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1326 add v25.2d,v25.2d,v17.2d 1327 ld1 {v24.2d},[x3],#16 1328 ext 
v25.16b,v25.16b,v25.16b,#8 1329 ext v5.16b,v1.16b,v4.16b,#8 1330 ext v6.16b,v3.16b,v1.16b,#8 1331 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1332 .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1333 ext v7.16b,v21.16b,v22.16b,#8 1334 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1335 .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1336 add v0.2d,v3.2d,v4.2d // "D + T1" 1337 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1338 add v24.2d,v24.2d,v18.2d 1339 ld1 {v25.2d},[x3],#16 1340 ext v24.16b,v24.16b,v24.16b,#8 1341 ext v5.16b,v0.16b,v1.16b,#8 1342 ext v6.16b,v2.16b,v0.16b,#8 1343 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1344 .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1345 ext v7.16b,v22.16b,v23.16b,#8 1346 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1347 .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1348 add v3.2d,v2.2d,v1.2d // "D + T1" 1349 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1350 add v25.2d,v25.2d,v19.2d 1351 ld1 {v24.2d},[x3],#16 1352 ext v25.16b,v25.16b,v25.16b,#8 1353 ext v5.16b,v3.16b,v0.16b,#8 1354 ext v6.16b,v4.16b,v3.16b,#8 1355 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1356 .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1357 ext v7.16b,v23.16b,v16.16b,#8 1358 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1359 .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1360 add v2.2d,v4.2d,v0.2d // "D + T1" 1361 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1362 add v24.2d,v24.2d,v20.2d 1363 ld1 {v25.2d},[x3],#16 1364 ext v24.16b,v24.16b,v24.16b,#8 1365 ext v5.16b,v2.16b,v3.16b,#8 1366 ext v6.16b,v1.16b,v2.16b,#8 1367 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1368 .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1369 ext v7.16b,v16.16b,v17.16b,#8 1370 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1371 .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1372 add v4.2d,v1.2d,v3.2d // "D + T1" 1373 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1374 add v25.2d,v25.2d,v21.2d 1375 ld1 {v24.2d},[x3],#16 1376 ext 
v25.16b,v25.16b,v25.16b,#8 1377 ext v5.16b,v4.16b,v2.16b,#8 1378 ext v6.16b,v0.16b,v4.16b,#8 1379 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1380 .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1381 ext v7.16b,v17.16b,v18.16b,#8 1382 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1383 .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1384 add v1.2d,v0.2d,v2.2d // "D + T1" 1385 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1386 add v24.2d,v24.2d,v22.2d 1387 ld1 {v25.2d},[x3],#16 1388 ext v24.16b,v24.16b,v24.16b,#8 1389 ext v5.16b,v1.16b,v4.16b,#8 1390 ext v6.16b,v3.16b,v1.16b,#8 1391 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1392 .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1393 ext v7.16b,v18.16b,v19.16b,#8 1394 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1395 .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1396 add v0.2d,v3.2d,v4.2d // "D + T1" 1397 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1398 add v25.2d,v25.2d,v23.2d 1399 ld1 {v24.2d},[x3],#16 1400 ext v25.16b,v25.16b,v25.16b,#8 1401 ext v5.16b,v0.16b,v1.16b,#8 1402 ext v6.16b,v2.16b,v0.16b,#8 1403 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1404 .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1405 ext v7.16b,v19.16b,v20.16b,#8 1406 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1407 .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1408 add v3.2d,v2.2d,v1.2d // "D + T1" 1409 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1410 add v24.2d,v24.2d,v16.2d 1411 ld1 {v25.2d},[x3],#16 1412 ext v24.16b,v24.16b,v24.16b,#8 1413 ext v5.16b,v3.16b,v0.16b,#8 1414 ext v6.16b,v4.16b,v3.16b,#8 1415 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1416 .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1417 ext v7.16b,v20.16b,v21.16b,#8 1418 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1419 .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1420 add v2.2d,v4.2d,v0.2d // "D + T1" 1421 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1422 add v25.2d,v25.2d,v17.2d 1423 ld1 {v24.2d},[x3],#16 1424 ext 
v25.16b,v25.16b,v25.16b,#8 1425 ext v5.16b,v2.16b,v3.16b,#8 1426 ext v6.16b,v1.16b,v2.16b,#8 1427 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1428 .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1429 ext v7.16b,v21.16b,v22.16b,#8 1430 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1431 .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1432 add v4.2d,v1.2d,v3.2d // "D + T1" 1433 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1434 add v24.2d,v24.2d,v18.2d 1435 ld1 {v25.2d},[x3],#16 1436 ext v24.16b,v24.16b,v24.16b,#8 1437 ext v5.16b,v4.16b,v2.16b,#8 1438 ext v6.16b,v0.16b,v4.16b,#8 1439 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1440 .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1441 ext v7.16b,v22.16b,v23.16b,#8 1442 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1443 .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1444 add v1.2d,v0.2d,v2.2d // "D + T1" 1445 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1446 add v25.2d,v25.2d,v19.2d 1447 ld1 {v24.2d},[x3],#16 1448 ext v25.16b,v25.16b,v25.16b,#8 1449 ext v5.16b,v1.16b,v4.16b,#8 1450 ext v6.16b,v3.16b,v1.16b,#8 1451 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1452 .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1453 ext v7.16b,v23.16b,v16.16b,#8 1454 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1455 .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1456 add v0.2d,v3.2d,v4.2d // "D + T1" 1457 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1458 add v24.2d,v24.2d,v20.2d 1459 ld1 {v25.2d},[x3],#16 1460 ext v24.16b,v24.16b,v24.16b,#8 1461 ext v5.16b,v0.16b,v1.16b,#8 1462 ext v6.16b,v2.16b,v0.16b,#8 1463 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1464 .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1465 ext v7.16b,v16.16b,v17.16b,#8 1466 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1467 .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1468 add v3.2d,v2.2d,v1.2d // "D + T1" 1469 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1470 add v25.2d,v25.2d,v21.2d 1471 ld1 {v24.2d},[x3],#16 1472 ext 
v25.16b,v25.16b,v25.16b,#8 1473 ext v5.16b,v3.16b,v0.16b,#8 1474 ext v6.16b,v4.16b,v3.16b,#8 1475 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1476 .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1477 ext v7.16b,v17.16b,v18.16b,#8 1478 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1479 .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1480 add v2.2d,v4.2d,v0.2d // "D + T1" 1481 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1482 add v24.2d,v24.2d,v22.2d 1483 ld1 {v25.2d},[x3],#16 1484 ext v24.16b,v24.16b,v24.16b,#8 1485 ext v5.16b,v2.16b,v3.16b,#8 1486 ext v6.16b,v1.16b,v2.16b,#8 1487 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1488 .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1489 ext v7.16b,v18.16b,v19.16b,#8 1490 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1491 .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1492 add v4.2d,v1.2d,v3.2d // "D + T1" 1493 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1494 add v25.2d,v25.2d,v23.2d 1495 ld1 {v24.2d},[x3],#16 1496 ext v25.16b,v25.16b,v25.16b,#8 1497 ext v5.16b,v4.16b,v2.16b,#8 1498 ext v6.16b,v0.16b,v4.16b,#8 1499 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1500 .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1501 ext v7.16b,v19.16b,v20.16b,#8 1502 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1503 .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1504 add v1.2d,v0.2d,v2.2d // "D + T1" 1505 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1506 ld1 {v25.2d},[x3],#16 1507 add v24.2d,v24.2d,v16.2d 1508 ld1 {v16.16b},[x1],#16 // load next input 1509 ext v24.16b,v24.16b,v24.16b,#8 1510 ext v5.16b,v1.16b,v4.16b,#8 1511 ext v6.16b,v3.16b,v1.16b,#8 1512 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1513 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1514 rev64 v16.16b,v16.16b 1515 add v0.2d,v3.2d,v4.2d // "D + T1" 1516 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1517 ld1 {v24.2d},[x3],#16 1518 add v25.2d,v25.2d,v17.2d 1519 ld1 {v17.16b},[x1],#16 // load next input 1520 ext v25.16b,v25.16b,v25.16b,#8 1521 
ext v5.16b,v0.16b,v1.16b,#8 1522 ext v6.16b,v2.16b,v0.16b,#8 1523 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1524 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1525 rev64 v17.16b,v17.16b 1526 add v3.2d,v2.2d,v1.2d // "D + T1" 1527 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1528 ld1 {v25.2d},[x3],#16 1529 add v24.2d,v24.2d,v18.2d 1530 ld1 {v18.16b},[x1],#16 // load next input 1531 ext v24.16b,v24.16b,v24.16b,#8 1532 ext v5.16b,v3.16b,v0.16b,#8 1533 ext v6.16b,v4.16b,v3.16b,#8 1534 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1535 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1536 rev64 v18.16b,v18.16b 1537 add v2.2d,v4.2d,v0.2d // "D + T1" 1538 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1539 ld1 {v24.2d},[x3],#16 1540 add v25.2d,v25.2d,v19.2d 1541 ld1 {v19.16b},[x1],#16 // load next input 1542 ext v25.16b,v25.16b,v25.16b,#8 1543 ext v5.16b,v2.16b,v3.16b,#8 1544 ext v6.16b,v1.16b,v2.16b,#8 1545 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1546 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1547 rev64 v19.16b,v19.16b 1548 add v4.2d,v1.2d,v3.2d // "D + T1" 1549 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1550 ld1 {v25.2d},[x3],#16 1551 add v24.2d,v24.2d,v20.2d 1552 ld1 {v20.16b},[x1],#16 // load next input 1553 ext v24.16b,v24.16b,v24.16b,#8 1554 ext v5.16b,v4.16b,v2.16b,#8 1555 ext v6.16b,v0.16b,v4.16b,#8 1556 add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1557 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1558 rev64 v20.16b,v20.16b 1559 add v1.2d,v0.2d,v2.2d // "D + T1" 1560 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1561 ld1 {v24.2d},[x3],#16 1562 add v25.2d,v25.2d,v21.2d 1563 ld1 {v21.16b},[x1],#16 // load next input 1564 ext v25.16b,v25.16b,v25.16b,#8 1565 ext v5.16b,v1.16b,v4.16b,#8 1566 ext v6.16b,v3.16b,v1.16b,#8 1567 add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1568 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1569 rev64 v21.16b,v21.16b 1570 add v0.2d,v3.2d,v4.2d // "D + T1" 1571 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 
1572 ld1 {v25.2d},[x3],#16 1573 add v24.2d,v24.2d,v22.2d 1574 ld1 {v22.16b},[x1],#16 // load next input 1575 ext v24.16b,v24.16b,v24.16b,#8 1576 ext v5.16b,v0.16b,v1.16b,#8 1577 ext v6.16b,v2.16b,v0.16b,#8 1578 add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1579 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1580 rev64 v22.16b,v22.16b 1581 add v3.2d,v2.2d,v1.2d // "D + T1" 1582 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1583 sub x3,x3,#80*8 // rewind 1584 add v25.2d,v25.2d,v23.2d 1585 ld1 {v23.16b},[x1],#16 // load next input 1586 ext v25.16b,v25.16b,v25.16b,#8 1587 ext v5.16b,v3.16b,v0.16b,#8 1588 ext v6.16b,v4.16b,v3.16b,#8 1589 add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1590 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1591 rev64 v23.16b,v23.16b 1592 add v2.2d,v4.2d,v0.2d // "D + T1" 1593 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1594 add v0.2d,v0.2d,v26.2d // accumulate 1595 add v1.2d,v1.2d,v27.2d 1596 add v2.2d,v2.2d,v28.2d 1597 add v3.2d,v3.2d,v29.2d 1598 1599 cbnz x2,.Loop_hw 1600 1601 st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context 1602 1603 ldr x29,[sp],#16 1604 ret 1605 .size sha512_block_armv8,.-sha512_block_armv8 1606 #endif 1607