/*
 * Poly1305 for 32-bit x86, AT&T syntax, cdecl-style stack arguments.
 * Each public routine pushes ebp/ebx/esi/edi, so the first stack argument
 * is at 20(%esp) right after the prologue.
 *
 * Context layout as used by the code below:
 *    0..16   accumulator h (five 32-bit words)
 *   20       flag word; non-zero selects the five-limb (26-bit) form of h
 *   24..36   key words r0..r3 (masked in _poly1305_init)
 *   48..     table written by __poly1305_init_sse2/_avx2 (144 bytes)
 */
.text
.align 6,0x90

/*
 * _poly1305_init(ctx, key, func[2])
 *   ctx  = 20(%esp): first 24 bytes are zeroed.
 *   key  = 24(%esp): 16 bytes; if NULL the routine returns 0 after zeroing.
 *   func = 28(%esp): receives the addresses of the selected blocks/emit
 *                    routines.
 * Selection: scalar routines by default; SSE2 routines when bits 24 and 26
 * (mask 83886080 = 0x05000000) of _OPENSSL_ia32cap_P[0] are both set;
 * additionally the AVX2 blocks routine when bit 5 of the word at offset 8
 * is set.
 * Returns 1 in %eax when a key was supplied, 0 otherwise.
 */
.globl _poly1305_init
.type _poly1305_init,@function
.align 4
_poly1305_init:
L_poly1305_init_begin:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ebp
	xorl %eax,%eax
	movl %eax,(%edi)
	movl %eax,4(%edi)
	movl %eax,8(%edi)
	movl %eax,12(%edi)
	movl %eax,16(%edi)
	movl %eax,20(%edi)
	cmpl $0,%esi
	je L000nokey
	/* call/pop yields the current address for position-independent access */
	call L001pic_point
L001pic_point:
	popl %ebx
	leal _poly1305_blocks-L001pic_point(%ebx),%eax
	leal _poly1305_emit-L001pic_point(%ebx),%edx
	leal __GLOBAL_OFFSET_TABLE_+[.-L001pic_point](%ebx),%edi
	movl _OPENSSL_ia32cap_P@GOT(%edi),%edi
	movl (%edi),%ecx
	andl $83886080,%ecx
	cmpl $83886080,%ecx
	jne L002no_sse2
	leal __poly1305_blocks_sse2-L001pic_point(%ebx),%eax
	leal __poly1305_emit_sse2-L001pic_point(%ebx),%edx
	movl 8(%edi),%ecx
	testl $32,%ecx
	jz L002no_sse2
	leal __poly1305_blocks_avx2-L001pic_point(%ebx),%eax
L002no_sse2:
	movl 20(%esp),%edi
	movl %eax,(%ebp)
	movl %edx,4(%ebp)
	movl (%esi),%eax
	movl 4(%esi),%ebx
	movl 8(%esi),%ecx
	movl 12(%esi),%edx
	/* key masks: 0x0fffffff for word 0, 0x0ffffffc for words 1..3 */
	andl $268435455,%eax
	andl $268435452,%ebx
	andl $268435452,%ecx
	andl $268435452,%edx
	movl %eax,24(%edi)
	movl %ebx,28(%edi)
	movl %ecx,32(%edi)
	movl %edx,36(%edi)
	movl $1,%eax
L000nokey:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * _poly1305_blocks(ctx, inp, len, padbit)
 *   ctx    = 20(%esp), inp = 24(%esp), len = 28(%esp), padbit = 32(%esp).
 * Processes len rounded down to a multiple of 16 bytes, 16 bytes per
 * iteration, using the scalar 32-bit representation of h.  padbit is added
 * (with carry) into the fifth accumulator word for every block.
 * Lenter_blocks is also a jump target for the SSE2/AVX2 routines, which
 * arrive with the same register setup (edi=ctx, esi=inp, ecx=len) and the
 * same four registers pushed.
 *
 * Frame after "subl $64,%esp":
 *    0..16(%esp)  h0, -, -, h3, h4 spill
 *   20..28(%esp)  partial products d0..d2
 *   36..48(%esp)  r0..r3
 *   52..60(%esp)  r1+(r1>>2), r2+(r2>>2), r3+(r3>>2)
 *   84(%esp)      ctx argument
 *   92(%esp)      end-of-input pointer (overwrites the len argument slot)
 *   96(%esp)      padbit argument
 */
.globl _poly1305_blocks
.type _poly1305_blocks,@function
.align 4
_poly1305_blocks:
L_poly1305_blocks_begin:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
Lenter_blocks:
	/*
	 * Round len down to a multiple of 16.  The mask must be -16: with -15
	 * bit 0 survives, the end pointer is then never reached by the 16-byte
	 * stride and the loop below would not terminate for an odd len.
	 */
	andl $-16,%ecx
	jz L003nodata
	subl $64,%esp
	movl 24(%edi),%eax
	movl 28(%edi),%ebx
	leal (%esi,%ecx,1),%ebp
	movl 32(%edi),%ecx
	movl 36(%edi),%edx
	movl %ebp,92(%esp)
	movl %esi,%ebp
	movl %eax,36(%esp)
	movl %ebx,%eax
	shrl $2,%eax
	movl %ebx,40(%esp)
	addl %ebx,%eax
	movl %ecx,%ebx
	shrl $2,%ebx
	movl %ecx,44(%esp)
	addl %ecx,%ebx
	movl %edx,%ecx
	shrl $2,%ecx
	movl %edx,48(%esp)
	addl %edx,%ecx
	movl %eax,52(%esp)
	movl %ebx,56(%esp)
	movl %ecx,60(%esp)
	movl (%edi),%eax
	movl 4(%edi),%ebx
	movl 8(%edi),%ecx
	movl 12(%edi),%esi
	movl 16(%edi),%edi
	jmp L004loop
.align 5,0x90
L004loop:
	/* h += next 16 input bytes, padbit into the top word */
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%esi
	leal 16(%ebp),%ebp
	adcl 96(%esp),%edi
	movl %eax,(%esp)
	movl %esi,12(%esp)
	mull 36(%esp)
	movl %edi,16(%esp)
	movl %eax,%edi
	movl %ebx,%eax
	movl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 56(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 52(%esp)
	addl %eax,%edi
	movl (%esp),%eax
	adcl %edx,%esi
	mull 40(%esp)
	movl %edi,20(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 60(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 56(%esp)
	addl %eax,%esi
	movl 16(%esp),%eax
	adcl %edx,%edi
	imull 52(%esp),%eax
	addl %eax,%esi
	movl (%esp),%eax
	adcl $0,%edi
	mull 44(%esp)
	movl %esi,24(%esp)
	xorl %esi,%esi
	addl %eax,%edi
	movl %ebx,%eax
	adcl %edx,%esi
	mull 40(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 36(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl 16(%esp),%eax
	adcl %edx,%esi
	imull 56(%esp),%eax
	addl %eax,%edi
	movl (%esp),%eax
	adcl $0,%esi
	mull 48(%esp)
	movl %edi,28(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 44(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 40(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl 16(%esp),%ecx
	adcl %edx,%edi
	movl %ecx,%edx
	imull 60(%esp),%ecx
	addl %ecx,%esi
	movl 20(%esp),%eax
	adcl $0,%edi
	imull 36(%esp),%edx
	addl %edi,%edx
	movl 24(%esp),%ebx
	movl 28(%esp),%ecx
	/* partial reduction: keep low 2 bits of the top word, fold (top>>2)*5 */
	movl %edx,%edi
	shrl $2,%edx
	andl $3,%edi
	leal (%edx,%edx,4),%edx
	addl %edx,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%esi
	adcl $0,%edi
	cmpl 92(%esp),%ebp
	jne L004loop
	movl 84(%esp),%edx
	addl $64,%esp
	movl %eax,(%edx)
	movl %ebx,4(%edx)
	movl %ecx,8(%edx)
	movl %esi,12(%edx)
	movl %edi,16(%edx)
L003nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * _poly1305_emit(ctx, out, nonce)
 *   ctx = 20(%esp), out = 24(%esp), nonce = 28(%esp).
 * Computes h+5; bit 2 of the resulting fifth word decides (via an all-ones /
 * all-zero mask, no branch) whether h+5 or h is kept.  The chosen low
 * 128 bits plus the 16 bytes at nonce are written to out.
 * Lenter_emit is also a jump target for __poly1305_emit_sse2, which arrives
 * with ebp=ctx and the same four registers pushed.
 */
.globl _poly1305_emit
.type _poly1305_emit,@function
.align 4
_poly1305_emit:
L_poly1305_emit_begin:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%ebp
Lenter_emit:
	movl 24(%esp),%edi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%esi
	addl $5,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%edx
	adcl $0,%esi
	shrl $2,%esi
	negl %esi
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	notl %esi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 28(%esp),%ebp
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	orl (%edi),%eax
	orl 4(%edi),%ebx
	orl 8(%edi),%ecx
	orl 12(%edi),%edx
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * __poly1305_init_sse2 -- internal helper, register calling convention.
 *   In:  edi = ctx, ebx = constant table (mask read from 64(%ebx)).
 *   Out: 144 bytes written at ctx+48; edi restored to ctx on return.
 *   Uses ebp to hold the caller's esp while a 16-byte aligned scratch frame
 *   is live; clobbers ecx, edx, ebp, xmm0-xmm7.
 * Splits the key at ctx+24 into five 26-bit limbs and runs the multiply /
 * carry sequence at L005square twice (ecx counts down from 2).
 */
.align 5,0x90
.type __poly1305_init_sse2,@function
.align 4
__poly1305_init_sse2:
%ifdef __CET__
.byte 243,15,30,251
%endif

	movdqu 24(%edi),%xmm4
	leal 48(%edi),%edi
	movl %esp,%ebp
	subl $224,%esp
	andl $-16,%esp
	movq 64(%ebx),%xmm7
	movdqa %xmm4,%xmm0
	movdqa %xmm4,%xmm1
	movdqa %xmm4,%xmm2
	pand %xmm7,%xmm0
	psrlq $26,%xmm1
	psrldq $6,%xmm2
	pand %xmm7,%xmm1
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	psrldq $13,%xmm4
	leal 144(%esp),%edx
	movl $2,%ecx
L005square:
	movdqa %xmm0,(%esp)
	movdqa %xmm1,16(%esp)
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm1,%xmm6
	movdqa %xmm2,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm1,%xmm6
	paddd %xmm2,%xmm5
	movdqa %xmm6,80(%esp)
	movdqa %xmm5,96(%esp)
	movdqa %xmm3,%xmm6
	movdqa %xmm4,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm3,%xmm6
	paddd %xmm4,%xmm5
	movdqa %xmm6,112(%esp)
	movdqa %xmm5,128(%esp)
	pshufd $68,%xmm0,%xmm6
	movdqa %xmm1,%xmm5
	pshufd $68,%xmm1,%xmm1
	pshufd $68,%xmm2,%xmm2
	pshufd $68,%xmm3,%xmm3
	pshufd $68,%xmm4,%xmm4
	movdqa %xmm6,(%edx)
	movdqa %xmm1,16(%edx)
	movdqa %xmm2,32(%edx)
	movdqa %xmm3,48(%edx)
	movdqa %xmm4,64(%edx)
	pmuludq %xmm0,%xmm4
	pmuludq %xmm0,%xmm3
	pmuludq %xmm0,%xmm2
	pmuludq %xmm0,%xmm1
	pmuludq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 48(%edx),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa 80(%esp),%xmm6
	pmuludq (%edx),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%edx),%xmm6
	movdqa 32(%esp),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa 96(%esp),%xmm7
	pmuludq (%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%edx),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%edx),%xmm5
	movdqa 48(%esp),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa 112(%esp),%xmm5
	pmuludq (%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%edx),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%edx),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%edx),%xmm7
	movdqa 64(%esp),%xmm5
	paddq %xmm6,%xmm1
	movdqa 128(%esp),%xmm6
	pmuludq (%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%edx),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%edx),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	/* carry propagation between limbs; xmm7 = limb mask from 64(%ebx) */
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	decl %ecx
	jz L006square_break
	punpcklqdq (%esp),%xmm0
	punpcklqdq 16(%esp),%xmm1
	punpcklqdq 32(%esp),%xmm2
	punpcklqdq 48(%esp),%xmm3
	punpcklqdq 64(%esp),%xmm4
	jmp L005square
L006square_break:
	psllq $32,%xmm0
	psllq $32,%xmm1
	psllq $32,%xmm2
	psllq $32,%xmm3
	psllq $32,%xmm4
	por (%esp),%xmm0
	por 16(%esp),%xmm1
	por 32(%esp),%xmm2
	por 48(%esp),%xmm3
	por 64(%esp),%xmm4
	pshufd $141,%xmm0,%xmm0
	pshufd $141,%xmm1,%xmm1
	pshufd $141,%xmm2,%xmm2
	pshufd $141,%xmm3,%xmm3
	pshufd $141,%xmm4,%xmm4
	movdqu %xmm0,(%edi)
	movdqu %xmm1,16(%edi)
	movdqu %xmm2,32(%edi)
	movdqu %xmm3,48(%edi)
	movdqu %xmm4,64(%edi)
	/* also store limbs 1..4 multiplied by 5 (x*4 + x) */
	movdqa %xmm1,%xmm6
	movdqa %xmm2,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm1,%xmm6
	paddd %xmm2,%xmm5
	movdqu %xmm6,80(%edi)
	movdqu %xmm5,96(%edi)
	movdqa %xmm3,%xmm6
	movdqa %xmm4,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm3,%xmm6
	paddd %xmm4,%xmm5
	movdqu %xmm6,112(%edi)
	movdqu %xmm5,128(%edi)
	movl %ebp,%esp
	leal -48(%edi),%edi
	ret

/*
 * __poly1305_blocks_sse2(ctx, inp, len, padbit) -- same stack arguments as
 * _poly1305_blocks.
 * len is rounded down to a multiple of 16.  When len < 64 and the flag at
 * 20(ctx) is zero, control transfers to the scalar loop at Lenter_blocks.
 * Otherwise, if the flag is zero, __poly1305_init_sse2 is called first, h is
 * converted to five 26-bit limbs and the flag is set to 1.
 * The final limbs are stored back to ctx+0..16 at L013done.
 * ebx = address of Lconst_sse2 (defined outside this part of the file);
 * ebp holds the caller's esp while the aligned 528-byte frame is live.
 */
.align 5,0x90
.type __poly1305_blocks_sse2,@function
.align 4
__poly1305_blocks_sse2:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	movl 20(%edi),%eax
	andl $-16,%ecx
	jz L007nodata
	cmpl $64,%ecx
	jae L008enter_sse2
	testl %eax,%eax
	jz Lenter_blocks
.align 4,0x90
L008enter_sse2:
	call L009pic_point
L009pic_point:
	popl %ebx
	leal Lconst_sse2-L009pic_point(%ebx),%ebx
	testl %eax,%eax
	jnz L010base2_26
	call __poly1305_init_sse2
	/* overlapping loads + shifts + masks extract 26-bit limbs of h */
	movl (%edi),%eax
	movl 3(%edi),%ecx
	movl 6(%edi),%edx
	movl 9(%edi),%esi
	movl 13(%edi),%ebp
	movl $1,20(%edi)
	shrl $2,%ecx
	andl $67108863,%eax
	shrl $4,%edx
	andl $67108863,%ecx
	shrl $6,%esi
	andl $67108863,%edx
	movd %eax,%xmm0
	movd %ecx,%xmm1
	movd %edx,%xmm2
	movd %esi,%xmm3
	movd %ebp,%xmm4
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	jmp L011base2_32
.align 4,0x90
L010base2_26:
	movd (%edi),%xmm0
	movd 4(%edi),%xmm1
	movd 8(%edi),%xmm2
	movd 12(%edi),%xmm3
	movd 16(%edi),%xmm4
	movdqa 64(%ebx),%xmm7
L011base2_32:
	movl 32(%esp),%eax
	movl %esp,%ebp
	subl $528,%esp
	andl $-16,%esp
	leal 48(%edi),%edi
	shll $24,%eax
	/* bit 4 of len set: process one 16-byte block first */
	testl $31,%ecx
	jz L012even
	movdqu (%esi),%xmm6
	leal 16(%esi),%esi
	movdqa %xmm6,%xmm5
	pand %xmm7,%xmm6
	paddd %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	psrlq $26,%xmm5
	psrldq $6,%xmm6
	pand %xmm7,%xmm5
	paddd %xmm5,%xmm1
	movdqa %xmm6,%xmm5
	psrlq $4,%xmm6
	pand %xmm7,%xmm6
	paddd %xmm6,%xmm2
	movdqa %xmm5,%xmm6
	psrlq $30,%xmm5
	pand %xmm7,%xmm5
	psrldq $7,%xmm6
	paddd %xmm5,%xmm3
	movd %eax,%xmm5
	paddd %xmm6,%xmm4
	movd 12(%edi),%xmm6
	paddd %xmm5,%xmm4
	movdqa %xmm0,(%esp)
	movdqa %xmm1,16(%esp)
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	pmuludq %xmm6,%xmm0
	pmuludq %xmm6,%xmm1
	pmuludq %xmm6,%xmm2
	movd 28(%edi),%xmm5
	pmuludq %xmm6,%xmm3
	pmuludq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 48(%esp),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movd 92(%edi),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%esp),%xmm6
	movd 44(%edi),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%esp),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%esp),%xmm5
	paddq %xmm7,%xmm4
	movd 108(%edi),%xmm7
	pmuludq (%esp),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%esp),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%esp),%xmm5
	movd 60(%edi),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%esp),%xmm6
	paddq %xmm5,%xmm0
	movd 124(%edi),%xmm5
	pmuludq (%esp),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%esp),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%esp),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%esp),%xmm7
	movd 76(%edi),%xmm5
	paddq %xmm6,%xmm1
	movd 140(%edi),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%esp),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%esp),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	subl $16,%ecx
	jz L013done
L012even:
	leal 384(%esp),%edx
	leal -32(%esi),%eax
	subl $64,%ecx
	/* spread the table at ctx+48 into two stack copies around edx;
	   movdqu/pshufd/movdqa/leal below leave the flags of the subl intact */
	movdqu (%edi),%xmm5
	pshufd $68,%xmm5,%xmm6
	cmovbl %eax,%esi
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,(%edx)
	leal 160(%esp),%eax
	movdqu 16(%edi),%xmm6
	movdqa %xmm5,-144(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,16(%edx)
	movdqu 32(%edi),%xmm5
	movdqa %xmm6,-128(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,32(%edx)
	movdqu 48(%edi),%xmm6
	movdqa %xmm5,-112(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,48(%edx)
	movdqu 64(%edi),%xmm5
	movdqa %xmm6,-96(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,64(%edx)
	movdqu 80(%edi),%xmm6
	movdqa %xmm5,-80(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,80(%edx)
	movdqu 96(%edi),%xmm5
	movdqa %xmm6,-64(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,96(%edx)
	movdqu 112(%edi),%xmm6
	movdqa %xmm5,-48(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,112(%edx)
	movdqu 128(%edi),%xmm5
	movdqa %xmm6,-32(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,128(%edx)
	movdqa %xmm5,-16(%edx)
	movdqu 32(%esi),%xmm5
	movdqu 48(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,112(%esp)
	movdqa %xmm3,128(%esp)
	movdqa %xmm4,144(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	movdqa %xmm0,80(%esp)
	movdqa %xmm1,96(%esp)
	/* flags still come from "subl $64,%ecx" above */
	jbe L014skip_loop
	jmp L015loop
.align 5,0x90
L015loop:
	movdqa -144(%edx),%xmm7
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	movdqa %xmm5,%xmm1
	pmuludq %xmm7,%xmm5
	movdqa %xmm6,%xmm0
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	pmuludq %xmm7,%xmm3
	pmuludq %xmm7,%xmm4
	pmuludq -16(%edx),%xmm0
	movdqa %xmm1,%xmm7
	pmuludq -128(%edx),%xmm1
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq -112(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa %xmm5,%xmm6
	pmuludq -96(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 16(%eax),%xmm7
	pmuludq -80(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq -128(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq -112(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 32(%eax),%xmm7
	pmuludq -96(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq -32(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq -16(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq -128(%edx),%xmm6
	paddq %xmm5,%xmm1
	movdqa 48(%eax),%xmm5
	pmuludq -112(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq -48(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa %xmm6,%xmm7
	pmuludq -32(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq -16(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa 64(%eax),%xmm6
	pmuludq -128(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa %xmm6,%xmm7
	pmuludq -16(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq -64(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq -48(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa 64(%ebx),%xmm7
	pmuludq -32(%edx),%xmm6
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqu -32(%esi),%xmm5
	movdqu -16(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	leal -32(%esi),%eax
	subl $64,%ecx
	paddd 80(%esp),%xmm5
	paddd 96(%esp),%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
	cmovbl %eax,%esi
	leal 160(%esp),%eax
	movdqa (%edx),%xmm7
	movdqa %xmm1,16(%esp)
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	movdqa %xmm5,%xmm1
	pmuludq %xmm7,%xmm5
	paddq %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	pmuludq %xmm7,%xmm3
	pmuludq %xmm7,%xmm4
	paddq 16(%esp),%xmm6
	paddq 32(%esp),%xmm2
	paddq 48(%esp),%xmm3
	paddq 64(%esp),%xmm4
	pmuludq 128(%edx),%xmm0
	movdqa %xmm1,%xmm7
	pmuludq 16(%edx),%xmm1
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa %xmm5,%xmm6
	pmuludq 48(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 16(%eax),%xmm7
	pmuludq 64(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 32(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 32(%eax),%xmm7
	pmuludq 48(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 112(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 128(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 16(%edx),%xmm6
	paddq %xmm5,%xmm1
	movdqa 48(%eax),%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 96(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa %xmm6,%xmm7
	pmuludq 112(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq 128(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa 64(%eax),%xmm6
	pmuludq 16(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa %xmm6,%xmm7
	pmuludq 128(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 80(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 96(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa 64(%ebx),%xmm7
	pmuludq 112(%edx),%xmm6
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	movdqu 32(%esi),%xmm5
	movdqu 48(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,112(%esp)
	movdqa %xmm3,128(%esp)
	movdqa %xmm4,144(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	movdqa %xmm0,80(%esp)
	movdqa %xmm1,96(%esp)
	/* flags here are those of "subl $64,%ecx" earlier in this iteration */
	ja L015loop
L014skip_loop:
	pshufd $16,-144(%edx),%xmm7
	addl $32,%ecx
	jnz L016long_tail
	paddd %xmm0,%xmm5
	paddd %xmm1,%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
L016long_tail:
	movdqa %xmm5,(%eax)
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	pmuludq %xmm7,%xmm5
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	movdqa %xmm5,%xmm0
	pshufd $16,-128(%edx),%xmm5
	pmuludq %xmm7,%xmm3
	movdqa %xmm6,%xmm1
	pmuludq %xmm7,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 48(%eax),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%eax),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%eax),%xmm7
	paddq %xmm6,%xmm3
	pshufd $16,-64(%edx),%xmm6
	pmuludq (%eax),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%eax),%xmm6
	pshufd $16,-112(%edx),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%eax),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%eax),%xmm5
	paddq %xmm7,%xmm4
	pshufd $16,-48(%edx),%xmm7
	pmuludq (%eax),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%eax),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%eax),%xmm5
	pshufd $16,-96(%edx),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%eax),%xmm6
	paddq %xmm5,%xmm0
	pshufd $16,-32(%edx),%xmm5
	pmuludq (%eax),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%eax),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%eax),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%eax),%xmm7
	pshufd $16,-80(%edx),%xmm5
	paddq %xmm6,%xmm1
	pshufd $16,-16(%edx),%xmm6
	pmuludq (%eax),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%eax),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%eax),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%eax),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%eax),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	/* ZF is still the result of "addl $32,%ecx" (SSE ops do not touch it) */
	jz L017short_tail
	movdqu -32(%esi),%xmm5
	movdqu -16(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	pshufd $16,(%edx),%xmm7
	paddd 80(%esp),%xmm5
	paddd 96(%esp),%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
	movdqa %xmm5,(%esp)
	pmuludq %xmm7,%xmm5
	movdqa %xmm6,16(%esp)
	pmuludq %xmm7,%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm2,%xmm5
	pmuludq %xmm7,%xmm2
	paddq %xmm6,%xmm1
	movdqa %xmm3,%xmm6
	pmuludq %xmm7,%xmm3
	paddq 32(%esp),%xmm2
	movdqa %xmm5,32(%esp)
	pshufd $16,16(%edx),%xmm5
	paddq 48(%esp),%xmm3
	movdqa %xmm6,48(%esp)
	movdqa %xmm4,%xmm6
	pmuludq %xmm7,%xmm4
	paddq 64(%esp),%xmm4
	movdqa %xmm6,64(%esp)
	movdqa %xmm5,%xmm6
	pmuludq 48(%esp),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	pshufd $16,80(%edx),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%esp),%xmm6
	pshufd $16,32(%edx),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%esp),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%esp),%xmm5
	paddq %xmm7,%xmm4
	pshufd $16,96(%edx),%xmm7
	pmuludq (%esp),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%esp),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%esp),%xmm5
	pshufd $16,48(%edx),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%esp),%xmm6
	paddq %xmm5,%xmm0
	pshufd $16,112(%edx),%xmm5
	pmuludq (%esp),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%esp),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%esp),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%esp),%xmm7
	pshufd $16,64(%edx),%xmm5
	paddq %xmm6,%xmm1
	pshufd $16,128(%edx),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%esp),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%esp),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
L017short_tail:
	/* add the two 64-bit lanes of each limb together, then carry */
	pshufd $78,%xmm4,%xmm6
	pshufd $78,%xmm3,%xmm5
	paddq %xmm6,%xmm4
	paddq %xmm5,%xmm3
	pshufd $78,%xmm0,%xmm6
	pshufd $78,%xmm1,%xmm5
	paddq %xmm6,%xmm0
	paddq %xmm5,%xmm1
	pshufd $78,%xmm2,%xmm6
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm6,%xmm2
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
L013done:
	/* edi was advanced by 48, so -48..-32(%edi) is ctx+0..16 */
	movd %xmm0,-48(%edi)
	movd %xmm1,-44(%edi)
	movd %xmm2,-40(%edi)
	movd %xmm3,-36(%edi)
	movd %xmm4,-32(%edi)
	movl %ebp,%esp
L007nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * __poly1305_emit_sse2(ctx, out, nonce) -- same stack arguments as
 * _poly1305_emit.
 * When the flag at 20(ctx) is zero, control transfers to Lenter_emit.
 * Otherwise the five limbs at ctx+0..16 are recombined into 32-bit words
 * (shifts by 26/20/14/8), partially reduced, and then finished with the same
 * h+5 select and nonce addition as _poly1305_emit.  xmm0-xmm3 are used to
 * park the pre-"+5" words.
 */
.align 5,0x90
.type __poly1305_emit_sse2,@function
.align 4
__poly1305_emit_sse2:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%ebp
	cmpl $0,20(%ebp)
	je Lenter_emit
	movl (%ebp),%eax
	movl 4(%ebp),%edi
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%esi
	movl %edi,%ebx
	shll $26,%edi
	shrl $6,%ebx
	addl %edi,%eax
	movl %ecx,%edi
	adcl $0,%ebx
	shll $20,%edi
	shrl $12,%ecx
	addl %edi,%ebx
	movl %edx,%edi
	adcl $0,%ecx
	shll $14,%edi
	shrl $18,%edx
	addl %edi,%ecx
	movl %esi,%edi
	adcl $0,%edx
	shll $8,%edi
	shrl $24,%esi
	addl %edi,%edx
	adcl $0,%esi
	movl %esi,%edi
	andl $3,%esi
	shrl $2,%edi
	leal (%edi,%edi,4),%ebp
	movl 24(%esp),%edi
	addl %ebp,%eax
	movl 28(%esp),%ebp
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%edx
	adcl $0,%esi
	movd %eax,%xmm0
	addl $5,%eax
	movd %ebx,%xmm1
	adcl $0,%ebx
	movd %ecx,%xmm2
	adcl $0,%ecx
	movd %edx,%xmm3
	adcl $0,%edx
	adcl $0,%esi
	shrl $2,%esi
	negl %esi
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	movl %eax,(%edi)
	movd %xmm0,%eax
	movl %ebx,4(%edi)
	movd %xmm1,%ebx
	movl %ecx,8(%edi)
	movd %xmm2,%ecx
	movl %edx,12(%edi)
	movd %xmm3,%edx
	notl %esi
	andl %esi,%eax
	andl %esi,%ebx
	orl (%edi),%eax
	andl %esi,%ecx
	orl 4(%edi),%ebx
	andl %esi,%edx
	orl 8(%edi),%ecx
	orl 12(%edi),%edx
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	movl %eax,(%edi)
	adcl 8(%ebp),%ecx
	movl %ebx,4(%edi)
	adcl 12(%ebp),%edx
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * __poly1305_init_avx2 -- VEX-encoded counterpart of __poly1305_init_sse2.
 *   In:  edi = ctx, ebx = constant table (mask read from 64(%ebx)).
 *   Out: 144 bytes written at ctx+48; edi restored to ctx on return.
 *   Uses ebp to hold the caller's esp; clobbers ecx, edx, ebp, xmm0-xmm7.
 */
.align 5,0x90
.type __poly1305_init_avx2,@function
.align 4
__poly1305_init_avx2:
%ifdef __CET__
.byte 243,15,30,251
%endif

	vmovdqu 24(%edi),%xmm4
	leal 48(%edi),%edi
	movl %esp,%ebp
	subl $224,%esp
	andl $-16,%esp
	vmovdqa 64(%ebx),%xmm7
	vpand %xmm7,%xmm4,%xmm0
	vpsrlq $26,%xmm4,%xmm1
	vpsrldq $6,%xmm4,%xmm3
	vpand %xmm7,%xmm1,%xmm1
	vpsrlq $4,%xmm3,%xmm2
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm7,%xmm2,%xmm2
	vpand %xmm7,%xmm3,%xmm3
	vpsrldq $13,%xmm4,%xmm4
	leal 144(%esp),%edx
	movl $2,%ecx
L018square:
	vmovdqa %xmm0,(%esp)
	vmovdqa %xmm1,16(%esp)
	vmovdqa %xmm2,32(%esp)
	vmovdqa %xmm3,48(%esp)
	vmovdqa %xmm4,64(%esp)
	vpslld $2,%xmm1,%xmm6
	vpslld $2,%xmm2,%xmm5
	vpaddd %xmm1,%xmm6,%xmm6
	vpaddd %xmm2,%xmm5,%xmm5
	vmovdqa %xmm6,80(%esp)
	vmovdqa %xmm5,96(%esp)
	vpslld $2,%xmm3,%xmm6
	vpslld $2,%xmm4,%xmm5
	vpaddd %xmm3,%xmm6,%xmm6
	vpaddd %xmm4,%xmm5,%xmm5
	vmovdqa %xmm6,112(%esp)
	vmovdqa %xmm5,128(%esp)
	vpshufd $68,%xmm0,%xmm5
	vmovdqa %xmm1,%xmm6
	vpshufd $68,%xmm1,%xmm1
	vpshufd $68,%xmm2,%xmm2
	vpshufd $68,%xmm3,%xmm3
	vpshufd $68,%xmm4,%xmm4
	vmovdqa %xmm5,(%edx)
	vmovdqa %xmm1,16(%edx)
	vmovdqa %xmm2,32(%edx)
	vmovdqa %xmm3,48(%edx)
	vmovdqa %xmm4,64(%edx)
	vpmuludq %xmm0,%xmm4,%xmm4
	vpmuludq %xmm0,%xmm3,%xmm3
	vpmuludq %xmm0,%xmm2,%xmm2
	vpmuludq %xmm0,%xmm1,%xmm1
	vpmuludq %xmm0,%xmm5,%xmm0
	vpmuludq 48(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm4,%xmm4
	vpmuludq 32(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm3,%xmm3
	vpmuludq 16(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vmovdqa 80(%esp),%xmm7
	vpmuludq (%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm1,%xmm1
	vmovdqa 32(%esp),%xmm5
	vpmuludq 64(%edx),%xmm7,%xmm7
	vpaddq %xmm7,%xmm0,%xmm0
	vpmuludq 32(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm4,%xmm4
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm3,%xmm3
	vmovdqa 96(%esp),%xmm6
	vpmuludq (%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vpmuludq 64(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm1,%xmm1
	vmovdqa 48(%esp),%xmm5
	vpmuludq 48(%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm0,%xmm0
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm4,%xmm4
	vmovdqa 112(%esp),%xmm6
	vpmuludq (%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm3,%xmm3
	vpmuludq 64(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm2,%xmm2
	vpmuludq 48(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm1,%xmm1
	vmovdqa 64(%esp),%xmm7
	vpmuludq 32(%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm0,%xmm0
	vmovdqa 128(%esp),%xmm5
	vpmuludq (%edx),%xmm7,%xmm7
	vpaddq %xmm7,%xmm4,%xmm4
	vpmuludq 64(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm3,%xmm3
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm0,%xmm0
	vpmuludq 32(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm1,%xmm1
	vmovdqa 64(%ebx),%xmm7
	vpmuludq 48(%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vpsrlq $26,%xmm3,%xmm5
	vpand %xmm7,%xmm3,%xmm3
	vpsrlq $26,%xmm0,%xmm6
	vpand %xmm7,%xmm0,%xmm0
	vpaddq %xmm5,%xmm4,%xmm4
	vpaddq %xmm6,%xmm1,%xmm1
	vpsrlq $26,%xmm4,%xmm5
	vpand %xmm7,%xmm4,%xmm4
	vpsrlq $26,%xmm1,%xmm6
	vpand %xmm7,%xmm1,%xmm1
	vpaddq %xmm6,%xmm2,%xmm2
	vpaddd %xmm5,%xmm0,%xmm0
	vpsllq $2,%xmm5,%xmm5
	vpsrlq $26,%xmm2,%xmm6
	vpand %xmm7,%xmm2,%xmm2
	vpaddd %xmm5,%xmm0,%xmm0
	vpaddd %xmm6,%xmm3,%xmm3
	vpsrlq $26,%xmm3,%xmm6
	vpsrlq $26,%xmm0,%xmm5
	vpand %xmm7,%xmm0,%xmm0
	vpand %xmm7,%xmm3,%xmm3
	vpaddd %xmm5,%xmm1,%xmm1
	vpaddd %xmm6,%xmm4,%xmm4
	decl %ecx
	jz L019square_break
	vpunpcklqdq (%esp),%xmm0,%xmm0
	vpunpcklqdq 16(%esp),%xmm1,%xmm1
	vpunpcklqdq 32(%esp),%xmm2,%xmm2
	vpunpcklqdq 48(%esp),%xmm3,%xmm3
	vpunpcklqdq 64(%esp),%xmm4,%xmm4
	jmp L018square
L019square_break:
	vpsllq $32,%xmm0,%xmm0
	vpsllq $32,%xmm1,%xmm1
	vpsllq $32,%xmm2,%xmm2
	vpsllq $32,%xmm3,%xmm3
	vpsllq $32,%xmm4,%xmm4
	vpor (%esp),%xmm0,%xmm0
	vpor 16(%esp),%xmm1,%xmm1
	vpor 32(%esp),%xmm2,%xmm2
	vpor 48(%esp),%xmm3,%xmm3
	vpor 64(%esp),%xmm4,%xmm4
	vpshufd $141,%xmm0,%xmm0
	vpshufd $141,%xmm1,%xmm1
	vpshufd $141,%xmm2,%xmm2
	vpshufd $141,%xmm3,%xmm3
	vpshufd $141,%xmm4,%xmm4
	vmovdqu %xmm0,(%edi)
	vmovdqu %xmm1,16(%edi)
	vmovdqu %xmm2,32(%edi)
	vmovdqu %xmm3,48(%edi)
	vmovdqu %xmm4,64(%edi)
	vpslld $2,%xmm1,%xmm6
	vpslld $2,%xmm2,%xmm5
	vpaddd %xmm1,%xmm6,%xmm6
	vpaddd %xmm2,%xmm5,%xmm5
	vmovdqu %xmm6,80(%edi)
	vmovdqu %xmm5,96(%edi)
	vpslld $2,%xmm3,%xmm6
	vpslld $2,%xmm4,%xmm5
	vpaddd %xmm3,%xmm6,%xmm6
	vpaddd %xmm4,%xmm5,%xmm5
	vmovdqu %xmm6,112(%edi)
	vmovdqu %xmm5,128(%edi)
	movl %ebp,%esp
	leal -48(%edi),%edi
	ret

/*
 * __poly1305_blocks_avx2(ctx, inp, len, padbit) -- same stack arguments as
 * _poly1305_blocks.  Entry logic mirrors __poly1305_blocks_sse2: short
 * inputs with a zero flag at 20(ctx) go to Lenter_blocks; otherwise the
 * table is initialised on first use and h is rewritten in place as five
 * 26-bit limbs with the flag set to 1.
 * NOTE(review): the remainder of this routine lies beyond this part of the
 * file; only the visible prefix is reproduced here.
 */
.align 5,0x90
.type __poly1305_blocks_avx2,@function
.align 4
__poly1305_blocks_avx2:
%ifdef __CET__
.byte 243,15,30,251
%endif

	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	movl 20(%edi),%eax
	andl $-16,%ecx
	jz L020nodata
	cmpl $64,%ecx
	jae L021enter_avx2
	testl %eax,%eax
	jz Lenter_blocks
L021enter_avx2:
	vzeroupper
	call L022pic_point
L022pic_point:
	popl %ebx
	leal Lconst_sse2-L022pic_point(%ebx),%ebx
	testl %eax,%eax
	jnz L023base2_26
	call __poly1305_init_avx2
	movl (%edi),%eax
	movl 3(%edi),%ecx
	movl 6(%edi),%edx
	movl 9(%edi),%esi
	movl 13(%edi),%ebp
	shrl $2,%ecx
	andl $67108863,%eax
	shrl $4,%edx
	andl $67108863,%ecx
	shrl $6,%esi
	andl $67108863,%edx
	movl %eax,(%edi)
	movl %ecx,4(%edi)
	movl %edx,8(%edi)
	movl %esi,12(%edi)
	movl %ebp,16(%edi)
	movl $1,20(%edi)
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
L023base2_26:
	movl 32(%esp),%eax
	movl %esp,%ebp
	subl $448,%esp
	andl $-512,%esp
	vmovdqu 48(%edi),%xmm0
	leal 288(%esp),%edx
	vmovdqu 64(%edi),%xmm1
	vmovdqu 80(%edi),%xmm2
	vmovdqu 96(%edi),%xmm3
	vmovdqu 112(%edi),%xmm4
	leal 48(%edi),%edi
	vpermq $64,%ymm0,%ymm0
	vpermq $64,%ymm1,%ymm1
	vpermq $64,%ymm2,%ymm2
	vpermq $64,%ymm3,%ymm3
	vpermq $64,%ymm4,%ymm4
	vpshufd $200,%ymm0,%ymm0
	vpshufd $200,%ymm1,%ymm1
	vpshufd $200,%ymm2,%ymm2
	vpshufd $200,%ymm3,%ymm3
	vpshufd $200,%ymm4,%ymm4
	vmovdqa %ymm0,-128(%edx)
	vmovdqu 80(%edi),%xmm0
	vmovdqa %ymm1,-96(%edx)
	vmovdqu 96(%edi),%xmm1
	vmovdqa %ymm2,-64(%edx)
	vmovdqu 112(%edi),%xmm2
	vmovdqa %ymm3,-32(%edx)
	vmovdqu 128(%edi),%xmm3
vmovdqa %ymm4,(%edx) 1630 vpermq $64,%ymm0,%ymm0 1631 vpermq $64,%ymm1,%ymm1 1632 vpermq $64,%ymm2,%ymm2 1633 vpermq $64,%ymm3,%ymm3 1634 vpshufd $200,%ymm0,%ymm0 1635 vpshufd $200,%ymm1,%ymm1 1636 vpshufd $200,%ymm2,%ymm2 1637 vpshufd $200,%ymm3,%ymm3 1638 vmovdqa %ymm0,32(%edx) 1639 vmovd -48(%edi),%xmm0 1640 vmovdqa %ymm1,64(%edx) 1641 vmovd -44(%edi),%xmm1 1642 vmovdqa %ymm2,96(%edx) 1643 vmovd -40(%edi),%xmm2 1644 vmovdqa %ymm3,128(%edx) 1645 vmovd -36(%edi),%xmm3 1646 vmovd -32(%edi),%xmm4 1647 vmovdqa 64(%ebx),%ymm7 1648 negl %eax 1649 testl $63,%ecx 1650 jz L024even 1651 movl %ecx,%edx 1652 andl $-64,%ecx 1653 andl $63,%edx 1654 vmovdqu (%esi),%xmm5 1655 cmpl $32,%edx 1656 jb L025one 1657 vmovdqu 16(%esi),%xmm6 1658 je L026two 1659 vinserti128 $1,32(%esi),%ymm5,%ymm5 1660 leal 48(%esi),%esi 1661 leal 8(%ebx),%ebx 1662 leal 296(%esp),%edx 1663 jmp L027tail 1664 L026two: 1665 leal 32(%esi),%esi 1666 leal 16(%ebx),%ebx 1667 leal 304(%esp),%edx 1668 jmp L027tail 1669 L025one: 1670 leal 16(%esi),%esi 1671 vpxor %ymm6,%ymm6,%ymm6 1672 leal 32(%ebx,%eax,8),%ebx 1673 leal 312(%esp),%edx 1674 jmp L027tail 1675 .align 5,0x90 1676 L024even: 1677 vmovdqu (%esi),%xmm5 1678 vmovdqu 16(%esi),%xmm6 1679 vinserti128 $1,32(%esi),%ymm5,%ymm5 1680 vinserti128 $1,48(%esi),%ymm6,%ymm6 1681 leal 64(%esi),%esi 1682 subl $64,%ecx 1683 jz L027tail 1684 L028loop: 1685 vmovdqa %ymm2,64(%esp) 1686 vpsrldq $6,%ymm5,%ymm2 1687 vmovdqa %ymm0,(%esp) 1688 vpsrldq $6,%ymm6,%ymm0 1689 vmovdqa %ymm1,32(%esp) 1690 vpunpckhqdq %ymm6,%ymm5,%ymm1 1691 vpunpcklqdq %ymm6,%ymm5,%ymm5 1692 vpunpcklqdq %ymm0,%ymm2,%ymm2 1693 vpsrlq $30,%ymm2,%ymm0 1694 vpsrlq $4,%ymm2,%ymm2 1695 vpsrlq $26,%ymm5,%ymm6 1696 vpsrlq $40,%ymm1,%ymm1 1697 vpand %ymm7,%ymm2,%ymm2 1698 vpand %ymm7,%ymm5,%ymm5 1699 vpand %ymm7,%ymm6,%ymm6 1700 vpand %ymm7,%ymm0,%ymm0 1701 vpor (%ebx),%ymm1,%ymm1 1702 vpaddq 64(%esp),%ymm2,%ymm2 1703 vpaddq (%esp),%ymm5,%ymm5 1704 vpaddq 32(%esp),%ymm6,%ymm6 1705 vpaddq %ymm3,%ymm0,%ymm0 1706 
vpaddq %ymm4,%ymm1,%ymm1 1707 vpmuludq -96(%edx),%ymm2,%ymm3 1708 vmovdqa %ymm6,32(%esp) 1709 vpmuludq -64(%edx),%ymm2,%ymm4 1710 vmovdqa %ymm0,96(%esp) 1711 vpmuludq 96(%edx),%ymm2,%ymm0 1712 vmovdqa %ymm1,128(%esp) 1713 vpmuludq 128(%edx),%ymm2,%ymm1 1714 vpmuludq -128(%edx),%ymm2,%ymm2 1715 vpmuludq -32(%edx),%ymm5,%ymm7 1716 vpaddq %ymm7,%ymm3,%ymm3 1717 vpmuludq (%edx),%ymm5,%ymm6 1718 vpaddq %ymm6,%ymm4,%ymm4 1719 vpmuludq -128(%edx),%ymm5,%ymm7 1720 vpaddq %ymm7,%ymm0,%ymm0 1721 vmovdqa 32(%esp),%ymm7 1722 vpmuludq -96(%edx),%ymm5,%ymm6 1723 vpaddq %ymm6,%ymm1,%ymm1 1724 vpmuludq -64(%edx),%ymm5,%ymm5 1725 vpaddq %ymm5,%ymm2,%ymm2 1726 vpmuludq -64(%edx),%ymm7,%ymm6 1727 vpaddq %ymm6,%ymm3,%ymm3 1728 vpmuludq -32(%edx),%ymm7,%ymm5 1729 vpaddq %ymm5,%ymm4,%ymm4 1730 vpmuludq 128(%edx),%ymm7,%ymm6 1731 vpaddq %ymm6,%ymm0,%ymm0 1732 vmovdqa 96(%esp),%ymm6 1733 vpmuludq -128(%edx),%ymm7,%ymm5 1734 vpaddq %ymm5,%ymm1,%ymm1 1735 vpmuludq -96(%edx),%ymm7,%ymm7 1736 vpaddq %ymm7,%ymm2,%ymm2 1737 vpmuludq -128(%edx),%ymm6,%ymm5 1738 vpaddq %ymm5,%ymm3,%ymm3 1739 vpmuludq -96(%edx),%ymm6,%ymm7 1740 vpaddq %ymm7,%ymm4,%ymm4 1741 vpmuludq 64(%edx),%ymm6,%ymm5 1742 vpaddq %ymm5,%ymm0,%ymm0 1743 vmovdqa 128(%esp),%ymm5 1744 vpmuludq 96(%edx),%ymm6,%ymm7 1745 vpaddq %ymm7,%ymm1,%ymm1 1746 vpmuludq 128(%edx),%ymm6,%ymm6 1747 vpaddq %ymm6,%ymm2,%ymm2 1748 vpmuludq 128(%edx),%ymm5,%ymm7 1749 vpaddq %ymm7,%ymm3,%ymm3 1750 vpmuludq 32(%edx),%ymm5,%ymm6 1751 vpaddq %ymm6,%ymm0,%ymm0 1752 vpmuludq -128(%edx),%ymm5,%ymm7 1753 vpaddq %ymm7,%ymm4,%ymm4 1754 vmovdqa 64(%ebx),%ymm7 1755 vpmuludq 64(%edx),%ymm5,%ymm6 1756 vpaddq %ymm6,%ymm1,%ymm1 1757 vpmuludq 96(%edx),%ymm5,%ymm5 1758 vpaddq %ymm5,%ymm2,%ymm2 1759 vpsrlq $26,%ymm3,%ymm5 1760 vpand %ymm7,%ymm3,%ymm3 1761 vpsrlq $26,%ymm0,%ymm6 1762 vpand %ymm7,%ymm0,%ymm0 1763 vpaddq %ymm5,%ymm4,%ymm4 1764 vpaddq %ymm6,%ymm1,%ymm1 1765 vpsrlq $26,%ymm4,%ymm5 1766 vpand %ymm7,%ymm4,%ymm4 1767 vpsrlq $26,%ymm1,%ymm6 1768 vpand 
%ymm7,%ymm1,%ymm1 1769 vpaddq %ymm6,%ymm2,%ymm2 1770 vpaddq %ymm5,%ymm0,%ymm0 1771 vpsllq $2,%ymm5,%ymm5 1772 vpsrlq $26,%ymm2,%ymm6 1773 vpand %ymm7,%ymm2,%ymm2 1774 vpaddq %ymm5,%ymm0,%ymm0 1775 vpaddq %ymm6,%ymm3,%ymm3 1776 vpsrlq $26,%ymm3,%ymm6 1777 vpsrlq $26,%ymm0,%ymm5 1778 vpand %ymm7,%ymm0,%ymm0 1779 vpand %ymm7,%ymm3,%ymm3 1780 vpaddq %ymm5,%ymm1,%ymm1 1781 vpaddq %ymm6,%ymm4,%ymm4 1782 vmovdqu (%esi),%xmm5 1783 vmovdqu 16(%esi),%xmm6 1784 vinserti128 $1,32(%esi),%ymm5,%ymm5 1785 vinserti128 $1,48(%esi),%ymm6,%ymm6 1786 leal 64(%esi),%esi 1787 subl $64,%ecx 1788 jnz L028loop 1789 L027tail: 1790 vmovdqa %ymm2,64(%esp) 1791 vpsrldq $6,%ymm5,%ymm2 1792 vmovdqa %ymm0,(%esp) 1793 vpsrldq $6,%ymm6,%ymm0 1794 vmovdqa %ymm1,32(%esp) 1795 vpunpckhqdq %ymm6,%ymm5,%ymm1 1796 vpunpcklqdq %ymm6,%ymm5,%ymm5 1797 vpunpcklqdq %ymm0,%ymm2,%ymm2 1798 vpsrlq $30,%ymm2,%ymm0 1799 vpsrlq $4,%ymm2,%ymm2 1800 vpsrlq $26,%ymm5,%ymm6 1801 vpsrlq $40,%ymm1,%ymm1 1802 vpand %ymm7,%ymm2,%ymm2 1803 vpand %ymm7,%ymm5,%ymm5 1804 vpand %ymm7,%ymm6,%ymm6 1805 vpand %ymm7,%ymm0,%ymm0 1806 vpor (%ebx),%ymm1,%ymm1 1807 andl $-64,%ebx 1808 vpaddq 64(%esp),%ymm2,%ymm2 1809 vpaddq (%esp),%ymm5,%ymm5 1810 vpaddq 32(%esp),%ymm6,%ymm6 1811 vpaddq %ymm3,%ymm0,%ymm0 1812 vpaddq %ymm4,%ymm1,%ymm1 1813 vpmuludq -92(%edx),%ymm2,%ymm3 1814 vmovdqa %ymm6,32(%esp) 1815 vpmuludq -60(%edx),%ymm2,%ymm4 1816 vmovdqa %ymm0,96(%esp) 1817 vpmuludq 100(%edx),%ymm2,%ymm0 1818 vmovdqa %ymm1,128(%esp) 1819 vpmuludq 132(%edx),%ymm2,%ymm1 1820 vpmuludq -124(%edx),%ymm2,%ymm2 1821 vpmuludq -28(%edx),%ymm5,%ymm7 1822 vpaddq %ymm7,%ymm3,%ymm3 1823 vpmuludq 4(%edx),%ymm5,%ymm6 1824 vpaddq %ymm6,%ymm4,%ymm4 1825 vpmuludq -124(%edx),%ymm5,%ymm7 1826 vpaddq %ymm7,%ymm0,%ymm0 1827 vmovdqa 32(%esp),%ymm7 1828 vpmuludq -92(%edx),%ymm5,%ymm6 1829 vpaddq %ymm6,%ymm1,%ymm1 1830 vpmuludq -60(%edx),%ymm5,%ymm5 1831 vpaddq %ymm5,%ymm2,%ymm2 1832 vpmuludq -60(%edx),%ymm7,%ymm6 1833 vpaddq %ymm6,%ymm3,%ymm3 1834 vpmuludq 
-28(%edx),%ymm7,%ymm5 1835 vpaddq %ymm5,%ymm4,%ymm4 1836 vpmuludq 132(%edx),%ymm7,%ymm6 1837 vpaddq %ymm6,%ymm0,%ymm0 1838 vmovdqa 96(%esp),%ymm6 1839 vpmuludq -124(%edx),%ymm7,%ymm5 1840 vpaddq %ymm5,%ymm1,%ymm1 1841 vpmuludq -92(%edx),%ymm7,%ymm7 1842 vpaddq %ymm7,%ymm2,%ymm2 1843 vpmuludq -124(%edx),%ymm6,%ymm5 1844 vpaddq %ymm5,%ymm3,%ymm3 1845 vpmuludq -92(%edx),%ymm6,%ymm7 1846 vpaddq %ymm7,%ymm4,%ymm4 1847 vpmuludq 68(%edx),%ymm6,%ymm5 1848 vpaddq %ymm5,%ymm0,%ymm0 1849 vmovdqa 128(%esp),%ymm5 1850 vpmuludq 100(%edx),%ymm6,%ymm7 1851 vpaddq %ymm7,%ymm1,%ymm1 1852 vpmuludq 132(%edx),%ymm6,%ymm6 1853 vpaddq %ymm6,%ymm2,%ymm2 1854 vpmuludq 132(%edx),%ymm5,%ymm7 1855 vpaddq %ymm7,%ymm3,%ymm3 1856 vpmuludq 36(%edx),%ymm5,%ymm6 1857 vpaddq %ymm6,%ymm0,%ymm0 1858 vpmuludq -124(%edx),%ymm5,%ymm7 1859 vpaddq %ymm7,%ymm4,%ymm4 1860 vmovdqa 64(%ebx),%ymm7 1861 vpmuludq 68(%edx),%ymm5,%ymm6 1862 vpaddq %ymm6,%ymm1,%ymm1 1863 vpmuludq 100(%edx),%ymm5,%ymm5 1864 vpaddq %ymm5,%ymm2,%ymm2 1865 vpsrldq $8,%ymm4,%ymm5 1866 vpsrldq $8,%ymm3,%ymm6 1867 vpaddq %ymm5,%ymm4,%ymm4 1868 vpsrldq $8,%ymm0,%ymm5 1869 vpaddq %ymm6,%ymm3,%ymm3 1870 vpsrldq $8,%ymm1,%ymm6 1871 vpaddq %ymm5,%ymm0,%ymm0 1872 vpsrldq $8,%ymm2,%ymm5 1873 vpaddq %ymm6,%ymm1,%ymm1 1874 vpermq $2,%ymm4,%ymm6 1875 vpaddq %ymm5,%ymm2,%ymm2 1876 vpermq $2,%ymm3,%ymm5 1877 vpaddq %ymm6,%ymm4,%ymm4 1878 vpermq $2,%ymm0,%ymm6 1879 vpaddq %ymm5,%ymm3,%ymm3 1880 vpermq $2,%ymm1,%ymm5 1881 vpaddq %ymm6,%ymm0,%ymm0 1882 vpermq $2,%ymm2,%ymm6 1883 vpaddq %ymm5,%ymm1,%ymm1 1884 vpaddq %ymm6,%ymm2,%ymm2 1885 vpsrlq $26,%ymm3,%ymm5 1886 vpand %ymm7,%ymm3,%ymm3 1887 vpsrlq $26,%ymm0,%ymm6 1888 vpand %ymm7,%ymm0,%ymm0 1889 vpaddq %ymm5,%ymm4,%ymm4 1890 vpaddq %ymm6,%ymm1,%ymm1 1891 vpsrlq $26,%ymm4,%ymm5 1892 vpand %ymm7,%ymm4,%ymm4 1893 vpsrlq $26,%ymm1,%ymm6 1894 vpand %ymm7,%ymm1,%ymm1 1895 vpaddq %ymm6,%ymm2,%ymm2 1896 vpaddq %ymm5,%ymm0,%ymm0 1897 vpsllq $2,%ymm5,%ymm5 1898 vpsrlq $26,%ymm2,%ymm6 1899 vpand 
%ymm7,%ymm2,%ymm2 1900 vpaddq %ymm5,%ymm0,%ymm0 1901 vpaddq %ymm6,%ymm3,%ymm3 1902 vpsrlq $26,%ymm3,%ymm6 1903 vpsrlq $26,%ymm0,%ymm5 1904 vpand %ymm7,%ymm0,%ymm0 1905 vpand %ymm7,%ymm3,%ymm3 1906 vpaddq %ymm5,%ymm1,%ymm1 1907 vpaddq %ymm6,%ymm4,%ymm4 1908 cmpl $0,%ecx 1909 je L029done 1910 vpshufd $252,%xmm0,%xmm0 1911 leal 288(%esp),%edx 1912 vpshufd $252,%xmm1,%xmm1 1913 vpshufd $252,%xmm2,%xmm2 1914 vpshufd $252,%xmm3,%xmm3 1915 vpshufd $252,%xmm4,%xmm4 1916 jmp L024even 1917 .align 4,0x90 1918 L029done: 1919 vmovd %xmm0,-48(%edi) 1920 vmovd %xmm1,-44(%edi) 1921 vmovd %xmm2,-40(%edi) 1922 vmovd %xmm3,-36(%edi) 1923 vmovd %xmm4,-32(%edi) 1924 vzeroupper 1925 movl %ebp,%esp 1926 L020nodata: 1927 popl %edi 1928 popl %esi 1929 popl %ebx 1930 popl %ebp 1931 ret 1932 .align 6,0x90 1933 Lconst_sse2: 1934 .long 16777216,0,16777216,0,16777216,0,16777216,0 1935 .long 0,0,0,0,0,0,0,0 1936 .long 67108863,0,67108863,0,67108863,0,67108863,0 1937 .long 268435455,268435452,268435452,268435452 1938 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1939 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1940 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1941 .byte 114,103,62,0 1942 .align 2,0x90 1943 .comm _OPENSSL_ia32cap_P,16 1944