.text
.globl	_ChaCha20_ctr32
.align	4
/*
 * void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
 *                     size_t len, const unsigned int key[8],
 *                     const unsigned int counter[4]);
 * cdecl: after the four register pushes below the arguments sit at
 * 20(%esp)=out, 24=inp, 28=len, 32=key, 36=counter.
 */
_ChaCha20_ctr32:
L_ChaCha20_ctr32_begin:
#ifdef __CET__

.byte	243,15,30,251	/* endbr32, emitted when CET is enabled */
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax
	je	L000no_data
	call	Lpic_point
Lpic_point:
	popl	%eax
	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp
	testl	$16777216,(%ebp)	/* FXSR, bit 24 of capability word 0 */
	jz	L001x86
	testl	$512,4(%ebp)	/* SSSE3, bit 9 of capability word 1 */
	jz	L001x86
	jmp	Lssse3_shortcut
/* integer-only path: state at 0..60(%esp), key/counter copy at 80..124(%esp) */
L001x86:
	movl	32(%esp),%esi
	movl	36(%esp),%edi
	subl	$132,%esp
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax	/* block counter is kept pre-decremented */
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	L002entry
.align	4,0x90
L003outer_loop:
	movl	%ebx,156(%esp)
	movl	%eax,152(%esp)
	movl	%ecx,160(%esp)
L002entry:
	movl	$1634760805,%eax
	movl	$857760878,4(%esp)
	movl	$2036477234,8(%esp)
	movl	$1797285236,12(%esp)
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx	/* re-increment the pre-decremented counter */
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	movl	$10,%ebx
	jmp	L004loop
.align	4,0x90
/* ten double rounds (column + diagonal); counter parked at 128(%esp) */
L004loop:
	addl	%ebp,%eax
	movl	%ebx,128(%esp)
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	L004loop
/* add the input block back into the state, xor a full 64-byte block */
	movl	160(%esp),%ebx
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	L005tail
	movl	156(%esp),%ebx
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)
	movl	152(%esp),%eax
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax
	subl	$64,%ecx
	jnz	L003outer_loop
	jmp	L006done
/* fewer than 64 bytes left: finalize the keystream block at 0..60(%esp) */
L005tail:
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
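/* fold the remaining input words into x7, x10, x11, x13 and x15,
   then emit the partial block byte by byte */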
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx
	movl	%esi,44(%esp)
	xorl	%esi,%esi
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
/* byte-wise xor of the last len%64 bytes, %ebx holds the count */
L007tail_loop:
	movb	(%esi,%ebp,1),%al
	movb	(%esp,%esi,1),%dl
	leal	1(%esi),%esi
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)
	decl	%ebx
	jnz	L007tail_loop
L006done:
	addl	$132,%esp
L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
/* SSSE3 path: four blocks in parallel, with a 1x fallback for the tail */
.globl	_ChaCha20_ssse3
.align	4
_ChaCha20_ssse3:
L_ChaCha20_ssse3_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
/* entered from _ChaCha20_ctr32 with %eax = Lpic_point and
   %ebp -> OPENSSL_ia32cap_P */
Lssse3_shortcut:
	testl	$2048,4(%ebp)	/* AMD XOP, bit 11 of capability word 1 */
	jnz	Lxop_shortcut
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	32(%esp),%edx
	movl	36(%esp),%ebx
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp	/* 64-byte aligned frame, old %esp kept at 512(%esp) */
	movl	%ebp,512(%esp)
	leal	Lssse3_data-Lpic_point(%eax),%eax	/* %eax -> Lssse3_data (PIC) */
	movdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0081x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	movdqu	(%edx),%xmm7
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	L009outer_loop
.align	4,0x90
L009outer_loop:
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	paddd	64(%eax),%xmm4
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	4,0x90
/* vertical SIMD: each xmm register holds one state word across four blocks */
L010loop:
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
	pshufb	(%eax),%xmm6
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	L010loop
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
/* add the saved input, transpose 4x4 dwords back to block order,
   and xor with the source, 16 bytes per block per pass */
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	208(%esi),%esi
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L009outer_loop
	addl	$256,%ecx
	jz	L011done
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
/* rebuild the counter/nonce row: lane-0 counter += 4, nonce words
   taken afresh from the caller's counter buffer */
	movd	64(%ebp),%xmm2
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2
	pand	112(%eax),%xmm3
	por	%xmm2,%xmm3
/* 1x path: process the remaining data one 64-byte block at a time */
L0081x:
	movdqa	32(%eax),%xmm0
	movdqu	(%edx),%xmm1
	movdqu	16(%edx),%xmm2
	movdqa	(%eax),%xmm6
	movdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L012loop1x
.align	4,0x90
L013outer1x:
	movdqa	80(%eax),%xmm3
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	L012loop1x
.align	4,0x90
L012loop1x:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222	/* pshufb %xmm6,%xmm3 (rotate-left-16 mask) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223	/* pshufb %xmm7,%xmm3 (rotate-left-8 mask) */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L012loop1x
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L013outer1x
	jmp	L011done
/* partial final block: park keystream on the stack, xor byte by byte */
L014tail:
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L015tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L015tail_loop
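/* common exit for the SSSE3 path: restore the saved stack pointer */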
L011done:
	movl	512(%esp),%esp
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.align	6,0x90
Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13	/* pshufb mask: rotate each dword left 16 */
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14	/* pshufb mask: rotate each dword left 8 */
.long	1634760805,857760878,2036477234,1797285236	/* "expa","nd 3","2-by","te k" */
.long	0,1,2,3		/* per-lane block-counter offsets */
.long	4,4,4,4		/* counter stride: four blocks per pass */
.long	1,0,0,0		/* single-block counter increment */
.long	4,0,0,0		/* advance counter lane 0 by four blocks */
.long	0,-1,-1,-1	/* mask: keep nonce words, drop the counter lane */
.align	6,0x90
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0	/* "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>" */
/* XOP path: same structure as SSSE3, with vprotd replacing the
   shift/or rotate sequences */
.globl	_ChaCha20_xop
.align	4
_ChaCha20_xop:
L_ChaCha20_xop_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
/* entered from the SSSE3 dispatcher with %eax = Lpic_point */
Lxop_shortcut:
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	32(%esp),%edx
	movl	36(%esp),%ebx
	vzeroupper
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)
	leal	Lssse3_data-Lpic_point(%eax),%eax
	vmovdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0161x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	vmovdqu	(%edx),%xmm7
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpaddd	48(%eax),%xmm0,%xmm0
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpsubd	64(%eax),%xmm0,%xmm0
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,64(%ebp)
	vmovdqa	%xmm1,80(%ebp)
	vmovdqa	%xmm2,96(%ebp)
	vmovdqa	%xmm3,112(%ebp)
	vmovdqu	16(%edx),%xmm3
	vmovdqa	%xmm4,-64(%ebp)
	vmovdqa	%xmm5,-48(%ebp)
	vmovdqa	%xmm6,-32(%ebp)
	vmovdqa	%xmm7,-16(%ebp)
	vmovdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,(%ebp)
	vmovdqa	%xmm1,16(%ebp)
	vmovdqa	%xmm2,32(%ebp)
	vmovdqa	%xmm3,48(%ebp)
	vmovdqa	%xmm4,-128(%ebp)
	vmovdqa	%xmm5,-112(%ebp)
	vmovdqa	%xmm6,-96(%ebp)
	vmovdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	L017outer_loop
.align	5,0x90
L017outer_loop:
	vmovdqa	-112(%ebp),%xmm1
	vmovdqa	-96(%ebp),%xmm2
	vmovdqa	-80(%ebp),%xmm3
	vmovdqa	-48(%ebp),%xmm5
	vmovdqa	-32(%ebp),%xmm6
	vmovdqa	-16(%ebp),%xmm7
	vmovdqa	%xmm1,-112(%ebx)
	vmovdqa	%xmm2,-96(%ebx)
	vmovdqa	%xmm3,-80(%ebx)
	vmovdqa	%xmm5,-48(%ebx)
	vmovdqa	%xmm6,-32(%ebx)
	vmovdqa	%xmm7,-16(%ebx)
	vmovdqa	32(%ebp),%xmm2
	vmovdqa	48(%ebp),%xmm3
	vmovdqa	64(%ebp),%xmm4
	vmovdqa	80(%ebp),%xmm5
	vmovdqa	96(%ebp),%xmm6
	vmovdqa	112(%ebp),%xmm7
	vpaddd	64(%eax),%xmm4,%xmm4
	vmovdqa	%xmm2,32(%ebx)
	vmovdqa	%xmm3,48(%ebx)
	vmovdqa	%xmm4,64(%ebx)
	vmovdqa	%xmm5,80(%ebx)
	vmovdqa	%xmm6,96(%ebx)
	vmovdqa	%xmm7,112(%ebx)
	vmovdqa	%xmm4,64(%ebp)
	vmovdqa	-128(%ebp),%xmm0
	vmovdqa	%xmm4,%xmm6
	vmovdqa	-64(%ebp),%xmm3
	vmovdqa	(%ebp),%xmm4
	vmovdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	5,0x90
L018loop:
	vpaddd	%xmm3,%xmm0,%xmm0
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,246,16	/* vprotd $16,%xmm6,%xmm6 (XOP rotate) */
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm3,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12	/* vprotd $12,%xmm2,%xmm2 */
	vmovdqa	-48(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	80(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,64(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-64(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	32(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-32(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	96(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,80(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,16(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-48(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	48(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-16(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	112(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,96(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-32(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-48(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm7,%xmm6
.byte	143,232,120,194,219,7
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-16(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-32(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	64(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,112(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,32(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-48(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-16(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	80(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,64(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,48(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-32(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	16(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-64(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	96(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,80(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-16(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,96(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
.byte	143,232,120,194,219,7
	decl	%edx
	jnz	L018loop
	vmovdqa	%xmm3,-64(%ebx)
	vmovdqa	%xmm4,(%ebx)
	vmovdqa	%xmm5,16(%ebx)
	vmovdqa	%xmm6,64(%ebx)
	vmovdqa	%xmm7,96(%ebx)
	vmovdqa	-112(%ebx),%xmm1
	vmovdqa	-96(%ebx),%xmm2
	vmovdqa	-80(%ebx),%xmm3
	vpaddd	-128(%ebp),%xmm0,%xmm0
	vpaddd	-112(%ebp),%xmm1,%xmm1
	vpaddd	-96(%ebp),%xmm2,%xmm2
	vpaddd	-80(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	-64(%ebx),%xmm0
	vmovdqa	-48(%ebx),%xmm1
	vmovdqa	-32(%ebx),%xmm2
	vmovdqa	-16(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	-64(%ebp),%xmm0,%xmm0
	vpaddd	-48(%ebp),%xmm1,%xmm1
	vpaddd	-32(%ebp),%xmm2,%xmm2
	vpaddd	-16(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	(%ebx),%xmm0
	vmovdqa	16(%ebx),%xmm1
	vmovdqa	32(%ebx),%xmm2
	vmovdqa	48(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	(%ebp),%xmm0,%xmm0
	vpaddd	16(%ebp),%xmm1,%xmm1
	vpaddd	32(%ebp),%xmm2,%xmm2
	vpaddd	48(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	64(%ebx),%xmm0
	vmovdqa	80(%ebx),%xmm1
	vmovdqa	96(%ebx),%xmm2
	vmovdqa	112(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	64(%ebp),%xmm0,%xmm0
	vpaddd	80(%ebp),%xmm1,%xmm1
	vpaddd	96(%ebp),%xmm2,%xmm2
	vpaddd	112(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	208(%esi),%esi
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L017outer_loop
	addl	$256,%ecx
	jz	L019done
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	vmovd	64(%ebp),%xmm2
	vmovdqu	(%ebx),%xmm3
	vpaddd	96(%eax),%xmm2,%xmm2
	vpand	112(%eax),%xmm3,%xmm3
	vpor	%xmm2,%xmm3,%xmm3
/* 1x path, XOP flavour: one 64-byte block at a time */
L0161x:
	vmovdqa	32(%eax),%xmm0
	vmovdqu	(%edx),%xmm1
	vmovdqu	16(%edx),%xmm2
	vmovdqa	(%eax),%xmm6
	vmovdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L020loop1x
.align	4,0x90
L021outer1x:
	vmovdqa	80(%eax),%xmm3
	vmovdqa	(%esp),%xmm0
	vmovdqa	16(%esp),%xmm1
	vmovdqa	32(%esp),%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	movl	$10,%edx
	vmovdqa	%xmm3,48(%esp)
	jmp	L020loop1x
.align	4,0x90
L020loop1x:
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16	/* vprotd $16,%xmm3,%xmm3 */
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12	/* vprotd $12,%xmm1,%xmm1 */
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2
	vpshufd	$57,%xmm1,%xmm1
	vpshufd	$147,%xmm3,%xmm3
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2
	vpshufd	$147,%xmm1,%xmm1
	vpshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L020loop1x
	vpaddd	(%esp),%xmm0,%xmm0
	vpaddd	16(%esp),%xmm1,%xmm1
	vpaddd	32(%esp),%xmm2,%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	cmpl	$64,%ecx
	jb	L022tail
	vpxor	(%esi),%xmm0,%xmm0
	vpxor	16(%esi),%xmm1,%xmm1
	vpxor	32(%esi),%xmm2,%xmm2
	vpxor	48(%esi),%xmm3,%xmm3
	leal	64(%esi),%esi
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L021outer1x
	jmp	L019done
L022tail:
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L023tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L023tail_loop
L019done:
	vzeroupper
	movl	512(%esp),%esp
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.section __IMPORT,__pointers,non_lazy_symbol_pointers
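/* Mach-O non-lazy symbol pointer for PIC access to OPENSSL_ia32cap_P */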
L_OPENSSL_ia32cap_P$non_lazy_ptr:
.indirect_symbol	_OPENSSL_ia32cap_P
.long	0
.comm	_OPENSSL_ia32cap_P,16,2
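/*
 * For reference (annotation only, not part of the generated output):
 * each add/xor/rotate group above implements the ChaCha20 quarter-round,
 * in C-like pseudocode with ROTL32(v,n) a 32-bit left rotation:
 *
 *	a += b; d ^= a; d = ROTL32(d,16);
 *	c += d; b ^= c; b = ROTL32(b,12);
 *	a += b; d ^= a; d = ROTL32(d, 8);
 *	c += d; b ^= c; b = ROTL32(b, 7);
 *
 * applied first to the columns and then to the diagonals of the
 * 4x4 state matrix, ten times each (20 rounds total).
 */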