1 .text 2 .align 6,0x90 3 .globl _poly1305_init 4 .align 4 5 _poly1305_init: 6 L_poly1305_init_begin: 7 %ifdef __CET__ 8 9 .byte 243,15,30,251 10 %endif 11 12 pushl %ebp 13 pushl %ebx 14 pushl %esi 15 pushl %edi 16 movl 20(%esp),%edi 17 movl 24(%esp),%esi 18 movl 28(%esp),%ebp 19 xorl %eax,%eax 20 movl %eax,(%edi) 21 movl %eax,4(%edi) 22 movl %eax,8(%edi) 23 movl %eax,12(%edi) 24 movl %eax,16(%edi) 25 movl %eax,20(%edi) 26 cmpl $0,%esi 27 je L000nokey 28 call L001pic_point 29 L001pic_point: 30 popl %ebx 31 leal _poly1305_blocks-L001pic_point(%ebx),%eax 32 leal _poly1305_emit-L001pic_point(%ebx),%edx 33 movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001pic_point(%ebx),%edi 34 movl (%edi),%ecx 35 andl $83886080,%ecx 36 cmpl $83886080,%ecx 37 jne L002no_sse2 38 leal __poly1305_blocks_sse2-L001pic_point(%ebx),%eax 39 leal __poly1305_emit_sse2-L001pic_point(%ebx),%edx 40 movl 8(%edi),%ecx 41 testl $32,%ecx 42 jz L002no_sse2 43 leal __poly1305_blocks_avx2-L001pic_point(%ebx),%eax 44 L002no_sse2: 45 movl 20(%esp),%edi 46 movl %eax,(%ebp) 47 movl %edx,4(%ebp) 48 movl (%esi),%eax 49 movl 4(%esi),%ebx 50 movl 8(%esi),%ecx 51 movl 12(%esi),%edx 52 andl $268435455,%eax 53 andl $268435452,%ebx 54 andl $268435452,%ecx 55 andl $268435452,%edx 56 movl %eax,24(%edi) 57 movl %ebx,28(%edi) 58 movl %ecx,32(%edi) 59 movl %edx,36(%edi) 60 movl $1,%eax 61 L000nokey: 62 popl %edi 63 popl %esi 64 popl %ebx 65 popl %ebp 66 ret 67 .globl _poly1305_blocks 68 .align 4 69 _poly1305_blocks: 70 L_poly1305_blocks_begin: 71 %ifdef __CET__ 72 73 .byte 243,15,30,251 74 %endif 75 76 pushl %ebp 77 pushl %ebx 78 pushl %esi 79 pushl %edi 80 movl 20(%esp),%edi 81 movl 24(%esp),%esi 82 movl 28(%esp),%ecx 83 Lenter_blocks: 84 andl $-15,%ecx 85 jz L003nodata 86 subl $64,%esp 87 movl 24(%edi),%eax 88 movl 28(%edi),%ebx 89 leal (%esi,%ecx,1),%ebp 90 movl 32(%edi),%ecx 91 movl 36(%edi),%edx 92 movl %ebp,92(%esp) 93 movl %esi,%ebp 94 movl %eax,36(%esp) 95 movl %ebx,%eax 96 shrl $2,%eax 97 movl %ebx,40(%esp) 98 addl %ebx,%eax 99 movl %ecx,%ebx 100 shrl $2,%ebx 101 movl %ecx,44(%esp) 102 addl %ecx,%ebx 103 movl %edx,%ecx 104 shrl $2,%ecx 105 movl %edx,48(%esp) 106 addl %edx,%ecx 107 movl %eax,52(%esp) 108 movl %ebx,56(%esp) 109 movl %ecx,60(%esp) 110 movl (%edi),%eax 111 movl 4(%edi),%ebx 112 movl 8(%edi),%ecx 113 movl 12(%edi),%esi 114 movl 16(%edi),%edi 115 jmp L004loop 116 .align 5,0x90 117 L004loop: 118 addl (%ebp),%eax 119 adcl 4(%ebp),%ebx 120 adcl 8(%ebp),%ecx 121 adcl 12(%ebp),%esi 122 leal 16(%ebp),%ebp 123 adcl 96(%esp),%edi 124 movl %eax,(%esp) 125 movl %esi,12(%esp) 126 mull 36(%esp) 127 movl %edi,16(%esp) 128 movl %eax,%edi 129 movl %ebx,%eax 130 movl %edx,%esi 131 mull 60(%esp) 132 addl %eax,%edi 133 movl %ecx,%eax 134 adcl %edx,%esi 135 mull 56(%esp) 136 addl %eax,%edi 137 movl 12(%esp),%eax 138 adcl %edx,%esi 139 mull 52(%esp) 140 addl %eax,%edi 141 movl (%esp),%eax 142 adcl %edx,%esi 143 mull 40(%esp) 144 movl %edi,20(%esp) 145 xorl %edi,%edi 146 addl %eax,%esi 147 movl %ebx,%eax 148 adcl %edx,%edi 149 mull 36(%esp) 150 addl %eax,%esi 151 movl %ecx,%eax 152 adcl %edx,%edi 153 mull 60(%esp) 154 addl %eax,%esi 155 movl 12(%esp),%eax 156 adcl %edx,%edi 157 mull 56(%esp) 158 addl %eax,%esi 159 movl 16(%esp),%eax 160 adcl %edx,%edi 161 imull 52(%esp),%eax 162 addl %eax,%esi 163 movl (%esp),%eax 164 adcl $0,%edi 165 mull 44(%esp) 166 movl %esi,24(%esp) 167 xorl %esi,%esi 168 addl %eax,%edi 169 movl %ebx,%eax 170 adcl %edx,%esi 171 mull 40(%esp) 172 addl %eax,%edi 173 movl %ecx,%eax 174 adcl %edx,%esi 175 mull 36(%esp) 176 addl %eax,%edi 177 movl 12(%esp),%eax 178 adcl %edx,%esi 179 mull 60(%esp) 180 addl %eax,%edi 181 movl 16(%esp),%eax 182 adcl %edx,%esi 183 imull 56(%esp),%eax 184 addl %eax,%edi 185 movl (%esp),%eax 186 adcl $0,%esi 187 mull 48(%esp) 188 movl %edi,28(%esp) 189 xorl %edi,%edi 190 addl %eax,%esi 191 movl %ebx,%eax 192 adcl %edx,%edi 193 mull 44(%esp) 194 addl %eax,%esi 195 movl %ecx,%eax 196 adcl %edx,%edi 197 mull 40(%esp) 198 addl %eax,%esi 199 movl 12(%esp),%eax 200 adcl %edx,%edi 201 mull 36(%esp) 202 addl %eax,%esi 203 movl 16(%esp),%ecx 204 adcl %edx,%edi 205 movl %ecx,%edx 206 imull 60(%esp),%ecx 207 addl %ecx,%esi 208 movl 20(%esp),%eax 209 adcl $0,%edi 210 imull 36(%esp),%edx 211 addl %edi,%edx 212 movl 24(%esp),%ebx 213 movl 28(%esp),%ecx 214 movl %edx,%edi 215 shrl $2,%edx 216 andl $3,%edi 217 leal (%edx,%edx,4),%edx 218 addl %edx,%eax 219 adcl $0,%ebx 220 adcl $0,%ecx 221 adcl $0,%esi 222 adcl $0,%edi 223 cmpl 92(%esp),%ebp 224 jne L004loop 225 movl 84(%esp),%edx 226 addl $64,%esp 227 movl %eax,(%edx) 228 movl %ebx,4(%edx) 229 movl %ecx,8(%edx) 230 movl %esi,12(%edx) 231 movl %edi,16(%edx) 232 L003nodata: 233 popl %edi 234 popl %esi 235 popl %ebx 236 popl %ebp 237 ret 238 .globl _poly1305_emit 239 .align 4 240 _poly1305_emit: 241 L_poly1305_emit_begin: 242 %ifdef __CET__ 243 244 .byte 243,15,30,251 245 %endif 246 247 pushl %ebp 248 pushl %ebx 249 pushl %esi 250 pushl %edi 251 movl 20(%esp),%ebp 252 Lenter_emit: 253 movl 24(%esp),%edi 254 movl (%ebp),%eax 255 movl 4(%ebp),%ebx 256 movl 8(%ebp),%ecx 257 movl 12(%ebp),%edx 258 movl 16(%ebp),%esi 259 addl $5,%eax 260 adcl $0,%ebx 261 adcl $0,%ecx 262 adcl $0,%edx 263 adcl $0,%esi 264 shrl $2,%esi 265 negl %esi 266 andl %esi,%eax 267 andl %esi,%ebx 268 andl %esi,%ecx 269 andl %esi,%edx 270 movl %eax,(%edi) 271 movl %ebx,4(%edi) 272 movl %ecx,8(%edi) 273 movl %edx,12(%edi) 274 notl %esi 275 movl (%ebp),%eax 276 movl 4(%ebp),%ebx 277 movl 8(%ebp),%ecx 278 movl 12(%ebp),%edx 279 movl 28(%esp),%ebp 280 andl %esi,%eax 281 andl %esi,%ebx 282 andl %esi,%ecx 283 andl %esi,%edx 284 orl (%edi),%eax 285 orl 4(%edi),%ebx 286 orl 8(%edi),%ecx 287 orl 12(%edi),%edx 288 addl (%ebp),%eax 289 adcl 4(%ebp),%ebx 290 adcl 8(%ebp),%ecx 291 adcl 12(%ebp),%edx 292 movl %eax,(%edi) 293 movl %ebx,4(%edi) 294 movl %ecx,8(%edi) 295 movl %edx,12(%edi) 296 popl %edi 297 popl %esi 298 popl %ebx 299 popl %ebp 300 ret 301 .align 5,0x90 302 .align 4 303 __poly1305_init_sse2: 304 %ifdef __CET__ 305 306 .byte 243,15,30,251 307 %endif 308 309 movdqu 24(%edi),%xmm4 310 leal 48(%edi),%edi 311 movl %esp,%ebp 312 subl $224,%esp 313 andl $-16,%esp 314 movq 64(%ebx),%xmm7 315 movdqa %xmm4,%xmm0 316 movdqa %xmm4,%xmm1 317 movdqa %xmm4,%xmm2 318 pand %xmm7,%xmm0 319 psrlq $26,%xmm1 320 psrldq $6,%xmm2 321 pand %xmm7,%xmm1 322 movdqa %xmm2,%xmm3 323 psrlq $4,%xmm2 324 psrlq $30,%xmm3 325 pand %xmm7,%xmm2 326 pand %xmm7,%xmm3 327 psrldq $13,%xmm4 328 leal 144(%esp),%edx 329 movl $2,%ecx 330 L005square: 331 movdqa %xmm0,(%esp) 332 movdqa %xmm1,16(%esp) 333 movdqa %xmm2,32(%esp) 334 movdqa %xmm3,48(%esp) 335 movdqa %xmm4,64(%esp) 336 movdqa %xmm1,%xmm6 337 movdqa %xmm2,%xmm5 338 pslld $2,%xmm6 339 pslld $2,%xmm5 340 paddd %xmm1,%xmm6 341 paddd %xmm2,%xmm5 342 movdqa %xmm6,80(%esp) 343 movdqa %xmm5,96(%esp) 344 movdqa %xmm3,%xmm6 345 movdqa %xmm4,%xmm5 346 pslld $2,%xmm6 347 pslld $2,%xmm5 348 paddd %xmm3,%xmm6 349 paddd %xmm4,%xmm5 350 movdqa %xmm6,112(%esp) 351 movdqa %xmm5,128(%esp) 352 pshufd $68,%xmm0,%xmm6 353 movdqa %xmm1,%xmm5 354 pshufd $68,%xmm1,%xmm1 355 pshufd $68,%xmm2,%xmm2 356 pshufd $68,%xmm3,%xmm3 357 pshufd $68,%xmm4,%xmm4 358 movdqa %xmm6,(%edx) 359 movdqa %xmm1,16(%edx) 360 movdqa %xmm2,32(%edx) 361 movdqa %xmm3,48(%edx) 362 movdqa %xmm4,64(%edx) 363 pmuludq %xmm0,%xmm4 364 pmuludq %xmm0,%xmm3 365 pmuludq %xmm0,%xmm2 366 pmuludq %xmm0,%xmm1 367 pmuludq %xmm6,%xmm0 368 movdqa %xmm5,%xmm6 369 pmuludq 48(%edx),%xmm5 370 movdqa %xmm6,%xmm7 371 pmuludq 32(%edx),%xmm6 372 paddq %xmm5,%xmm4 373 movdqa %xmm7,%xmm5 374 pmuludq 16(%edx),%xmm7 375 paddq %xmm6,%xmm3 376 movdqa 80(%esp),%xmm6 377 pmuludq (%edx),%xmm5 378 paddq %xmm7,%xmm2 379 pmuludq 64(%edx),%xmm6 380 movdqa 32(%esp),%xmm7 381 paddq %xmm5,%xmm1 382 movdqa %xmm7,%xmm5 383 pmuludq 32(%edx),%xmm7 384 paddq %xmm6,%xmm0 385 movdqa %xmm5,%xmm6 386 pmuludq 16(%edx),%xmm5 387 paddq %xmm7,%xmm4 388 movdqa 96(%esp),%xmm7 389 pmuludq (%edx),%xmm6 390 paddq %xmm5,%xmm3 391 movdqa %xmm7,%xmm5 392 pmuludq 64(%edx),%xmm7 393 paddq %xmm6,%xmm2 394 pmuludq 48(%edx),%xmm5 395 movdqa 48(%esp),%xmm6 396 paddq %xmm7,%xmm1 397 movdqa %xmm6,%xmm7 398 pmuludq 16(%edx),%xmm6 399 paddq %xmm5,%xmm0 400 movdqa 112(%esp),%xmm5 401 pmuludq (%edx),%xmm7 402 paddq %xmm6,%xmm4 403 movdqa %xmm5,%xmm6 404 pmuludq 64(%edx),%xmm5 405 paddq %xmm7,%xmm3 406 movdqa %xmm6,%xmm7 407 pmuludq 48(%edx),%xmm6 408 paddq %xmm5,%xmm2 409 pmuludq 32(%edx),%xmm7 410 movdqa 64(%esp),%xmm5 411 paddq %xmm6,%xmm1 412 movdqa 128(%esp),%xmm6 413 pmuludq (%edx),%xmm5 414 paddq %xmm7,%xmm0 415 movdqa %xmm6,%xmm7 416 pmuludq 64(%edx),%xmm6 417 paddq %xmm5,%xmm4 418 movdqa %xmm7,%xmm5 419 pmuludq 16(%edx),%xmm7 420 paddq %xmm6,%xmm3 421 movdqa %xmm5,%xmm6 422 pmuludq 32(%edx),%xmm5 423 paddq %xmm7,%xmm0 424 pmuludq 48(%edx),%xmm6 425 movdqa 64(%ebx),%xmm7 426 paddq %xmm5,%xmm1 427 paddq %xmm6,%xmm2 428 movdqa %xmm3,%xmm5 429 pand %xmm7,%xmm3 430 psrlq $26,%xmm5 431 paddq %xmm4,%xmm5 432 movdqa %xmm0,%xmm6 433 pand %xmm7,%xmm0 434 psrlq $26,%xmm6 435 movdqa %xmm5,%xmm4 436 paddq %xmm1,%xmm6 437 psrlq $26,%xmm5 438 pand %xmm7,%xmm4 439 movdqa %xmm6,%xmm1 440 psrlq $26,%xmm6 441 paddd %xmm5,%xmm0 442 psllq $2,%xmm5 443 paddq %xmm2,%xmm6 444 paddq %xmm0,%xmm5 445 pand %xmm7,%xmm1 446 movdqa %xmm6,%xmm2 447 psrlq $26,%xmm6 448 pand %xmm7,%xmm2 449 paddd %xmm3,%xmm6 450 movdqa %xmm5,%xmm0 451 psrlq $26,%xmm5 452 movdqa %xmm6,%xmm3 453 psrlq $26,%xmm6 454 pand %xmm7,%xmm0 455 paddd %xmm5,%xmm1 456 pand %xmm7,%xmm3 457 paddd %xmm6,%xmm4 458 decl %ecx 459 jz L006square_break 460 punpcklqdq (%esp),%xmm0 461 punpcklqdq 16(%esp),%xmm1 462 punpcklqdq 32(%esp),%xmm2 463 punpcklqdq 48(%esp),%xmm3 464 punpcklqdq 64(%esp),%xmm4 465 jmp L005square 466 L006square_break: 467 psllq $32,%xmm0 468 psllq $32,%xmm1 469 psllq $32,%xmm2 470 psllq $32,%xmm3 471 psllq $32,%xmm4 472 por (%esp),%xmm0 473 por 16(%esp),%xmm1 474 por 32(%esp),%xmm2 475 por 48(%esp),%xmm3 476 por 64(%esp),%xmm4 477 pshufd $141,%xmm0,%xmm0 478 pshufd $141,%xmm1,%xmm1 479 pshufd $141,%xmm2,%xmm2 480 pshufd $141,%xmm3,%xmm3 481 pshufd $141,%xmm4,%xmm4 482 movdqu %xmm0,(%edi) 483 movdqu %xmm1,16(%edi) 484 movdqu %xmm2,32(%edi) 485 movdqu %xmm3,48(%edi) 486 movdqu %xmm4,64(%edi) 487 movdqa %xmm1,%xmm6 488 movdqa %xmm2,%xmm5 489 pslld $2,%xmm6 490 pslld $2,%xmm5 491 paddd %xmm1,%xmm6 492 paddd %xmm2,%xmm5 493 movdqu %xmm6,80(%edi) 494 movdqu %xmm5,96(%edi) 495 movdqa %xmm3,%xmm6 496 movdqa %xmm4,%xmm5 497 pslld $2,%xmm6 498 pslld $2,%xmm5 499 paddd %xmm3,%xmm6 500 paddd %xmm4,%xmm5 501 movdqu %xmm6,112(%edi) 502 movdqu %xmm5,128(%edi) 503 movl %ebp,%esp 504 leal -48(%edi),%edi 505 ret 506 .align 5,0x90 507 .align 4 508 __poly1305_blocks_sse2: 509 %ifdef __CET__ 510 511 .byte 243,15,30,251 512 %endif 513 514 pushl %ebp 515 pushl %ebx 516 pushl %esi 517 pushl %edi 518 movl 20(%esp),%edi 519 movl 24(%esp),%esi 520 movl 28(%esp),%ecx 521 movl 20(%edi),%eax 522 andl $-16,%ecx 523 jz L007nodata 524 cmpl $64,%ecx 525 jae L008enter_sse2 526 testl %eax,%eax 527 jz Lenter_blocks 528 .align 4,0x90 529 L008enter_sse2: 530 call L009pic_point 531 L009pic_point: 532 popl %ebx 533 leal Lconst_sse2-L009pic_point(%ebx),%ebx 534 testl %eax,%eax 535 jnz L010base2_26 536 call __poly1305_init_sse2 537 movl (%edi),%eax 538 movl 3(%edi),%ecx 539 movl 6(%edi),%edx 540 movl 9(%edi),%esi 541 movl 13(%edi),%ebp 542 movl $1,20(%edi) 543 shrl $2,%ecx 544 andl $67108863,%eax 545 shrl $4,%edx 546 andl $67108863,%ecx 547 shrl $6,%esi 548 andl $67108863,%edx 549 movd %eax,%xmm0 550 movd %ecx,%xmm1 551 movd %edx,%xmm2 552 movd %esi,%xmm3 553 movd %ebp,%xmm4 554 movl 24(%esp),%esi 555 movl 28(%esp),%ecx 556 jmp L011base2_32 557 .align 4,0x90 558 L010base2_26: 559 movd (%edi),%xmm0 560 movd 4(%edi),%xmm1 561 movd 8(%edi),%xmm2 562 movd 12(%edi),%xmm3 563 movd 16(%edi),%xmm4 564 movdqa 64(%ebx),%xmm7 565 L011base2_32: 566 movl 32(%esp),%eax 567 movl %esp,%ebp 568 subl $528,%esp 569 andl $-16,%esp 570 leal 48(%edi),%edi 571 shll $24,%eax 572 testl $31,%ecx 573 jz L012even 574 movdqu (%esi),%xmm6 575 leal 16(%esi),%esi 576 movdqa %xmm6,%xmm5 577 pand %xmm7,%xmm6 578 paddd %xmm6,%xmm0 579 movdqa %xmm5,%xmm6 580 psrlq $26,%xmm5 581 psrldq $6,%xmm6 582 pand %xmm7,%xmm5 583 paddd %xmm5,%xmm1 584 movdqa %xmm6,%xmm5 585 psrlq $4,%xmm6 586 pand %xmm7,%xmm6 587 paddd %xmm6,%xmm2 588 movdqa %xmm5,%xmm6 589 psrlq $30,%xmm5 590 pand %xmm7,%xmm5 591 psrldq $7,%xmm6 592 paddd %xmm5,%xmm3 593 movd %eax,%xmm5 594 paddd %xmm6,%xmm4 595 movd 12(%edi),%xmm6 596 paddd %xmm5,%xmm4 597 movdqa %xmm0,(%esp) 598 movdqa %xmm1,16(%esp) 599 movdqa %xmm2,32(%esp) 600 movdqa %xmm3,48(%esp) 601 movdqa %xmm4,64(%esp) 602 pmuludq %xmm6,%xmm0 603 pmuludq %xmm6,%xmm1 604 pmuludq %xmm6,%xmm2 605 movd 28(%edi),%xmm5 606 pmuludq %xmm6,%xmm3 607 pmuludq %xmm6,%xmm4 608 movdqa %xmm5,%xmm6 609 pmuludq 48(%esp),%xmm5 610 movdqa %xmm6,%xmm7 611 pmuludq 32(%esp),%xmm6 612 paddq %xmm5,%xmm4 613 movdqa %xmm7,%xmm5 614 pmuludq 16(%esp),%xmm7 615 paddq %xmm6,%xmm3 616 movd 92(%edi),%xmm6 617 pmuludq (%esp),%xmm5 618 paddq %xmm7,%xmm2 619 pmuludq 64(%esp),%xmm6 620 movd 44(%edi),%xmm7 621 paddq %xmm5,%xmm1 622 movdqa %xmm7,%xmm5 623 pmuludq 32(%esp),%xmm7 624 paddq %xmm6,%xmm0 625 movdqa %xmm5,%xmm6 626 pmuludq 16(%esp),%xmm5 627 paddq %xmm7,%xmm4 628 movd 108(%edi),%xmm7 629 pmuludq (%esp),%xmm6 630 paddq %xmm5,%xmm3 631 movdqa %xmm7,%xmm5 632 pmuludq 64(%esp),%xmm7 633 paddq %xmm6,%xmm2 634 pmuludq 48(%esp),%xmm5 635 movd 60(%edi),%xmm6 636 paddq %xmm7,%xmm1 637 movdqa %xmm6,%xmm7 638 pmuludq 16(%esp),%xmm6 639 paddq %xmm5,%xmm0 640 movd 124(%edi),%xmm5 641 pmuludq (%esp),%xmm7 642 paddq %xmm6,%xmm4 643 movdqa %xmm5,%xmm6 644 pmuludq 64(%esp),%xmm5 645 paddq %xmm7,%xmm3 646 movdqa %xmm6,%xmm7 647 pmuludq 48(%esp),%xmm6 648 paddq %xmm5,%xmm2 649 pmuludq 32(%esp),%xmm7 650 movd 76(%edi),%xmm5 651 paddq %xmm6,%xmm1 652 movd 140(%edi),%xmm6 653 pmuludq (%esp),%xmm5 654 paddq %xmm7,%xmm0 655 movdqa %xmm6,%xmm7 656 pmuludq 64(%esp),%xmm6 657 paddq %xmm5,%xmm4 658 movdqa %xmm7,%xmm5 659 pmuludq 16(%esp),%xmm7 660 paddq %xmm6,%xmm3 661 movdqa %xmm5,%xmm6 662 pmuludq 32(%esp),%xmm5 663 paddq %xmm7,%xmm0 664 pmuludq 48(%esp),%xmm6 665 movdqa 64(%ebx),%xmm7 666 paddq %xmm5,%xmm1 667 paddq %xmm6,%xmm2 668 movdqa %xmm3,%xmm5 669 pand %xmm7,%xmm3 670 psrlq $26,%xmm5 671 paddq %xmm4,%xmm5 672 movdqa %xmm0,%xmm6 673 pand %xmm7,%xmm0 674 psrlq $26,%xmm6 675 movdqa %xmm5,%xmm4 676 paddq %xmm1,%xmm6 677 psrlq $26,%xmm5 678 pand %xmm7,%xmm4 679 movdqa %xmm6,%xmm1 680 psrlq $26,%xmm6 681 paddd %xmm5,%xmm0 682 psllq $2,%xmm5 683 paddq %xmm2,%xmm6 684 paddq %xmm0,%xmm5 685 pand %xmm7,%xmm1 686 movdqa %xmm6,%xmm2 687 psrlq $26,%xmm6 688 pand %xmm7,%xmm2 689 paddd %xmm3,%xmm6 690 movdqa %xmm5,%xmm0 691 psrlq $26,%xmm5 692 movdqa %xmm6,%xmm3 693 psrlq $26,%xmm6 694 pand %xmm7,%xmm0 695 paddd %xmm5,%xmm1 696 pand %xmm7,%xmm3 697 paddd %xmm6,%xmm4 698 subl $16,%ecx 699 jz L013done 700 L012even: 701 leal 384(%esp),%edx 702 leal -32(%esi),%eax 703 subl $64,%ecx 704 movdqu (%edi),%xmm5 705 pshufd $68,%xmm5,%xmm6 706 cmovbl %eax,%esi 707 pshufd $238,%xmm5,%xmm5 708 movdqa %xmm6,(%edx) 709 leal 160(%esp),%eax 710 movdqu 16(%edi),%xmm6 711 movdqa %xmm5,-144(%edx) 712 pshufd $68,%xmm6,%xmm5 713 pshufd $238,%xmm6,%xmm6 714 movdqa %xmm5,16(%edx) 715 movdqu 32(%edi),%xmm5 716 movdqa %xmm6,-128(%edx) 717 pshufd $68,%xmm5,%xmm6 718 pshufd $238,%xmm5,%xmm5 719 movdqa %xmm6,32(%edx) 720 movdqu 48(%edi),%xmm6 721 movdqa %xmm5,-112(%edx) 722 pshufd $68,%xmm6,%xmm5 723 pshufd $238,%xmm6,%xmm6 724 movdqa %xmm5,48(%edx) 725 movdqu 64(%edi),%xmm5 726 movdqa %xmm6,-96(%edx) 727 pshufd $68,%xmm5,%xmm6 728 pshufd $238,%xmm5,%xmm5 729 movdqa %xmm6,64(%edx) 730 movdqu 80(%edi),%xmm6 731 movdqa %xmm5,-80(%edx) 732 pshufd $68,%xmm6,%xmm5 733 pshufd $238,%xmm6,%xmm6 734 movdqa %xmm5,80(%edx) 735 movdqu 96(%edi),%xmm5 736 movdqa %xmm6,-64(%edx) 737 pshufd $68,%xmm5,%xmm6 738 pshufd $238,%xmm5,%xmm5 739 movdqa %xmm6,96(%edx) 740 movdqu 112(%edi),%xmm6 741 movdqa %xmm5,-48(%edx) 742 pshufd $68,%xmm6,%xmm5 743 pshufd $238,%xmm6,%xmm6 744 movdqa %xmm5,112(%edx) 745 movdqu 128(%edi),%xmm5 746 movdqa %xmm6,-32(%edx) 747 pshufd $68,%xmm5,%xmm6 748 pshufd $238,%xmm5,%xmm5 749 movdqa %xmm6,128(%edx) 750 movdqa %xmm5,-16(%edx) 751 movdqu 32(%esi),%xmm5 752 movdqu 48(%esi),%xmm6 753 leal 32(%esi),%esi 754 movdqa %xmm2,112(%esp) 755 movdqa %xmm3,128(%esp) 756 movdqa %xmm4,144(%esp) 757 movdqa %xmm5,%xmm2 758 movdqa %xmm6,%xmm3 759 psrldq $6,%xmm2 760 psrldq $6,%xmm3 761 movdqa %xmm5,%xmm4 762 punpcklqdq %xmm3,%xmm2 763 punpckhqdq %xmm6,%xmm4 764 punpcklqdq %xmm6,%xmm5 765 movdqa %xmm2,%xmm3 766 psrlq $4,%xmm2 767 psrlq $30,%xmm3 768 movdqa %xmm5,%xmm6 769 psrlq $40,%xmm4 770 psrlq $26,%xmm6 771 pand %xmm7,%xmm5 772 pand %xmm7,%xmm6 773 pand %xmm7,%xmm2 774 pand %xmm7,%xmm3 775 por (%ebx),%xmm4 776 movdqa %xmm0,80(%esp) 777 movdqa %xmm1,96(%esp) 778 jbe L014skip_loop 779 jmp L015loop 780 .align 5,0x90 781 L015loop: 782 movdqa -144(%edx),%xmm7 783 movdqa %xmm6,16(%eax) 784 movdqa %xmm2,32(%eax) 785 movdqa %xmm3,48(%eax) 786 movdqa %xmm4,64(%eax) 787 movdqa %xmm5,%xmm1 788 pmuludq %xmm7,%xmm5 789 movdqa %xmm6,%xmm0 790 pmuludq %xmm7,%xmm6 791 pmuludq %xmm7,%xmm2 792 pmuludq %xmm7,%xmm3 793 pmuludq %xmm7,%xmm4 794 pmuludq -16(%edx),%xmm0 795 movdqa %xmm1,%xmm7 796 pmuludq -128(%edx),%xmm1 797 paddq %xmm5,%xmm0 798 movdqa %xmm7,%xmm5 799 pmuludq -112(%edx),%xmm7 800 paddq %xmm6,%xmm1 801 movdqa %xmm5,%xmm6 802 pmuludq -96(%edx),%xmm5 803 paddq %xmm7,%xmm2 804 movdqa 16(%eax),%xmm7 805 pmuludq -80(%edx),%xmm6 806 paddq %xmm5,%xmm3 807 movdqa %xmm7,%xmm5 808 pmuludq -128(%edx),%xmm7 809 paddq %xmm6,%xmm4 810 movdqa %xmm5,%xmm6 811 pmuludq -112(%edx),%xmm5 812 paddq %xmm7,%xmm2 813 movdqa 32(%eax),%xmm7 814 pmuludq -96(%edx),%xmm6 815 paddq %xmm5,%xmm3 816 movdqa %xmm7,%xmm5 817 pmuludq -32(%edx),%xmm7 818 paddq %xmm6,%xmm4 819 movdqa %xmm5,%xmm6 820 pmuludq -16(%edx),%xmm5 821 paddq %xmm7,%xmm0 822 movdqa %xmm6,%xmm7 823 pmuludq -128(%edx),%xmm6 824 paddq %xmm5,%xmm1 825 movdqa 48(%eax),%xmm5 826 pmuludq -112(%edx),%xmm7 827 paddq %xmm6,%xmm3 828 movdqa %xmm5,%xmm6 829 pmuludq -48(%edx),%xmm5 830 paddq %xmm7,%xmm4 831 movdqa %xmm6,%xmm7 832 pmuludq -32(%edx),%xmm6 833 paddq %xmm5,%xmm0 834 movdqa %xmm7,%xmm5 835 pmuludq -16(%edx),%xmm7 836 paddq %xmm6,%xmm1 837 movdqa 64(%eax),%xmm6 838 pmuludq -128(%edx),%xmm5 839 paddq %xmm7,%xmm2 840 movdqa %xmm6,%xmm7 841 pmuludq -16(%edx),%xmm6 842 paddq %xmm5,%xmm4 843 movdqa %xmm7,%xmm5 844 pmuludq -64(%edx),%xmm7 845 paddq %xmm6,%xmm3 846 movdqa %xmm5,%xmm6 847 pmuludq -48(%edx),%xmm5 848 paddq %xmm7,%xmm0 849 movdqa 64(%ebx),%xmm7 850 pmuludq -32(%edx),%xmm6 851 paddq %xmm5,%xmm1 852 paddq %xmm6,%xmm2 853 movdqu -32(%esi),%xmm5 854 movdqu -16(%esi),%xmm6 855 leal 32(%esi),%esi 856 movdqa %xmm2,32(%esp) 857 movdqa %xmm3,48(%esp) 858 movdqa %xmm4,64(%esp) 859 movdqa %xmm5,%xmm2 860 movdqa %xmm6,%xmm3 861 psrldq $6,%xmm2 862 psrldq $6,%xmm3 863 movdqa %xmm5,%xmm4 864 punpcklqdq %xmm3,%xmm2 865 punpckhqdq %xmm6,%xmm4 866 punpcklqdq %xmm6,%xmm5 867 movdqa %xmm2,%xmm3 868 psrlq $4,%xmm2 869 psrlq $30,%xmm3 870 movdqa %xmm5,%xmm6 871 psrlq $40,%xmm4 872 psrlq $26,%xmm6 873 pand %xmm7,%xmm5 874 pand %xmm7,%xmm6 875 pand %xmm7,%xmm2 876 pand %xmm7,%xmm3 877 por (%ebx),%xmm4 878 leal -32(%esi),%eax 879 subl $64,%ecx 880 paddd 80(%esp),%xmm5 881 paddd 96(%esp),%xmm6 882 paddd 112(%esp),%xmm2 883 paddd 128(%esp),%xmm3 884 paddd 144(%esp),%xmm4 885 cmovbl %eax,%esi 886 leal 160(%esp),%eax 887 movdqa (%edx),%xmm7 888 movdqa %xmm1,16(%esp) 889 movdqa %xmm6,16(%eax) 890 movdqa %xmm2,32(%eax) 891 movdqa %xmm3,48(%eax) 892 movdqa %xmm4,64(%eax) 893 movdqa %xmm5,%xmm1 894 pmuludq %xmm7,%xmm5 895 paddq %xmm0,%xmm5 896 movdqa %xmm6,%xmm0 897 pmuludq %xmm7,%xmm6 898 pmuludq %xmm7,%xmm2 899 pmuludq %xmm7,%xmm3 900 pmuludq %xmm7,%xmm4 901 paddq 16(%esp),%xmm6 902 paddq 32(%esp),%xmm2 903 paddq 48(%esp),%xmm3 904 paddq 64(%esp),%xmm4 905 pmuludq 128(%edx),%xmm0 906 movdqa %xmm1,%xmm7 907 pmuludq 16(%edx),%xmm1 908 paddq %xmm5,%xmm0 909 movdqa %xmm7,%xmm5 910 pmuludq 32(%edx),%xmm7 911 paddq %xmm6,%xmm1 912 movdqa %xmm5,%xmm6 913 pmuludq 48(%edx),%xmm5 914 paddq %xmm7,%xmm2 915 movdqa 16(%eax),%xmm7 916 pmuludq 64(%edx),%xmm6 917 paddq %xmm5,%xmm3 918 movdqa %xmm7,%xmm5 919 pmuludq 16(%edx),%xmm7 920 paddq %xmm6,%xmm4 921 movdqa %xmm5,%xmm6 922 pmuludq 32(%edx),%xmm5 923 paddq %xmm7,%xmm2 924 movdqa 32(%eax),%xmm7 925 pmuludq 48(%edx),%xmm6 926 paddq %xmm5,%xmm3 927 movdqa %xmm7,%xmm5 928 pmuludq 112(%edx),%xmm7 929 paddq %xmm6,%xmm4 930 movdqa %xmm5,%xmm6 931 pmuludq 128(%edx),%xmm5 932 paddq %xmm7,%xmm0 933 movdqa %xmm6,%xmm7 934 pmuludq 16(%edx),%xmm6 935 paddq %xmm5,%xmm1 936 movdqa 48(%eax),%xmm5 937 pmuludq 32(%edx),%xmm7 938 paddq %xmm6,%xmm3 939 movdqa %xmm5,%xmm6 940 pmuludq 96(%edx),%xmm5 941 paddq %xmm7,%xmm4 942 movdqa %xmm6,%xmm7 943 pmuludq 112(%edx),%xmm6 944 paddq %xmm5,%xmm0 945 movdqa %xmm7,%xmm5 946 pmuludq 128(%edx),%xmm7 947 paddq %xmm6,%xmm1 948 movdqa 64(%eax),%xmm6 949 pmuludq 16(%edx),%xmm5 950 paddq %xmm7,%xmm2 951 movdqa %xmm6,%xmm7 952 pmuludq 128(%edx),%xmm6 953 paddq %xmm5,%xmm4 954 movdqa %xmm7,%xmm5 955 pmuludq 80(%edx),%xmm7 956 paddq %xmm6,%xmm3 957 movdqa %xmm5,%xmm6 958 pmuludq 96(%edx),%xmm5 959 paddq %xmm7,%xmm0 960 movdqa 64(%ebx),%xmm7 961 pmuludq 112(%edx),%xmm6 962 paddq %xmm5,%xmm1 963 paddq %xmm6,%xmm2 964 movdqa %xmm3,%xmm5 965 pand %xmm7,%xmm3 966 psrlq $26,%xmm5 967 paddq %xmm4,%xmm5 968 movdqa %xmm0,%xmm6 969 pand %xmm7,%xmm0 970 psrlq $26,%xmm6 971 movdqa %xmm5,%xmm4 972 paddq %xmm1,%xmm6 973 psrlq $26,%xmm5 974 pand %xmm7,%xmm4 975 movdqa %xmm6,%xmm1 976 psrlq $26,%xmm6 977 paddd %xmm5,%xmm0 978 psllq $2,%xmm5 979 paddq %xmm2,%xmm6 980 paddq %xmm0,%xmm5 981 pand %xmm7,%xmm1 982 movdqa %xmm6,%xmm2 983 psrlq $26,%xmm6 984 pand %xmm7,%xmm2 985 paddd %xmm3,%xmm6 986 movdqa %xmm5,%xmm0 987 psrlq $26,%xmm5 988 movdqa %xmm6,%xmm3 989 psrlq $26,%xmm6 990 pand %xmm7,%xmm0 991 paddd %xmm5,%xmm1 992 pand %xmm7,%xmm3 993 paddd %xmm6,%xmm4 994 movdqu 32(%esi),%xmm5 995 movdqu 48(%esi),%xmm6 996 leal 32(%esi),%esi 997 movdqa %xmm2,112(%esp) 998 movdqa %xmm3,128(%esp) 999 movdqa %xmm4,144(%esp) 1000 movdqa %xmm5,%xmm2 1001 movdqa %xmm6,%xmm3 1002 psrldq $6,%xmm2 1003 psrldq $6,%xmm3 1004 movdqa %xmm5,%xmm4 1005 punpcklqdq %xmm3,%xmm2 1006 punpckhqdq %xmm6,%xmm4 1007 punpcklqdq %xmm6,%xmm5 1008 movdqa %xmm2,%xmm3 1009 psrlq $4,%xmm2 1010 psrlq $30,%xmm3 1011 movdqa %xmm5,%xmm6 1012 psrlq $40,%xmm4 1013 psrlq $26,%xmm6 1014 pand %xmm7,%xmm5 1015 pand %xmm7,%xmm6 1016 pand %xmm7,%xmm2 1017 pand %xmm7,%xmm3 1018 por (%ebx),%xmm4 1019 movdqa %xmm0,80(%esp) 1020 movdqa %xmm1,96(%esp) 1021 ja L015loop 1022 L014skip_loop: 1023 pshufd $16,-144(%edx),%xmm7 1024 addl $32,%ecx 1025 jnz L016long_tail 1026 paddd %xmm0,%xmm5 1027 paddd %xmm1,%xmm6 1028 paddd 112(%esp),%xmm2 1029 paddd 128(%esp),%xmm3 1030 paddd 144(%esp),%xmm4 1031 L016long_tail: 1032 movdqa %xmm5,(%eax) 1033 movdqa %xmm6,16(%eax) 1034 movdqa %xmm2,32(%eax) 1035 movdqa %xmm3,48(%eax) 1036 movdqa %xmm4,64(%eax) 1037 pmuludq %xmm7,%xmm5 1038 pmuludq %xmm7,%xmm6 1039 pmuludq %xmm7,%xmm2 1040 movdqa %xmm5,%xmm0 1041 pshufd $16,-128(%edx),%xmm5 1042 pmuludq %xmm7,%xmm3 1043 movdqa %xmm6,%xmm1 1044 pmuludq %xmm7,%xmm4 1045 movdqa %xmm5,%xmm6 1046 pmuludq 48(%eax),%xmm5 1047 movdqa %xmm6,%xmm7 1048 pmuludq 32(%eax),%xmm6 1049 paddq %xmm5,%xmm4 1050 movdqa %xmm7,%xmm5 1051 pmuludq 16(%eax),%xmm7 1052 paddq %xmm6,%xmm3 1053 pshufd $16,-64(%edx),%xmm6 1054 pmuludq (%eax),%xmm5 1055 paddq %xmm7,%xmm2 1056 pmuludq 64(%eax),%xmm6 1057 pshufd $16,-112(%edx),%xmm7 1058 paddq %xmm5,%xmm1 1059 movdqa %xmm7,%xmm5 1060 pmuludq 32(%eax),%xmm7 1061 paddq %xmm6,%xmm0 1062 movdqa %xmm5,%xmm6 1063 pmuludq 16(%eax),%xmm5 1064 paddq %xmm7,%xmm4 1065 pshufd $16,-48(%edx),%xmm7 1066 pmuludq (%eax),%xmm6 1067 paddq %xmm5,%xmm3 1068 movdqa %xmm7,%xmm5 1069 pmuludq 64(%eax),%xmm7 1070 paddq %xmm6,%xmm2 1071 pmuludq 48(%eax),%xmm5 1072 pshufd $16,-96(%edx),%xmm6 1073 paddq %xmm7,%xmm1 1074 movdqa %xmm6,%xmm7 1075 pmuludq 16(%eax),%xmm6 1076 paddq %xmm5,%xmm0 1077 pshufd $16,-32(%edx),%xmm5 1078 pmuludq (%eax),%xmm7 1079 paddq %xmm6,%xmm4 1080 movdqa %xmm5,%xmm6 1081 pmuludq 64(%eax),%xmm5 1082 paddq %xmm7,%xmm3 1083 movdqa %xmm6,%xmm7 1084 pmuludq 48(%eax),%xmm6 1085 paddq %xmm5,%xmm2 1086 pmuludq 32(%eax),%xmm7 1087 pshufd $16,-80(%edx),%xmm5 1088 paddq %xmm6,%xmm1 1089 pshufd $16,-16(%edx),%xmm6 1090 pmuludq (%eax),%xmm5 1091 paddq %xmm7,%xmm0 1092 movdqa %xmm6,%xmm7 1093 pmuludq 64(%eax),%xmm6 1094 paddq %xmm5,%xmm4 1095 movdqa %xmm7,%xmm5 1096 pmuludq 16(%eax),%xmm7 1097 paddq %xmm6,%xmm3 1098 movdqa %xmm5,%xmm6 1099 pmuludq 32(%eax),%xmm5 1100 paddq %xmm7,%xmm0 1101 pmuludq 48(%eax),%xmm6 1102 movdqa 64(%ebx),%xmm7 1103 paddq %xmm5,%xmm1 1104 paddq %xmm6,%xmm2 1105 jz L017short_tail 1106 movdqu -32(%esi),%xmm5 1107 movdqu -16(%esi),%xmm6 1108 leal 32(%esi),%esi 1109 movdqa %xmm2,32(%esp) 1110 movdqa %xmm3,48(%esp) 1111 movdqa %xmm4,64(%esp) 1112 movdqa %xmm5,%xmm2 1113 movdqa %xmm6,%xmm3 1114 psrldq $6,%xmm2 1115 psrldq $6,%xmm3 1116 movdqa %xmm5,%xmm4 1117 punpcklqdq %xmm3,%xmm2 1118 punpckhqdq %xmm6,%xmm4 1119 punpcklqdq %xmm6,%xmm5 1120 movdqa %xmm2,%xmm3 1121 psrlq $4,%xmm2 1122 psrlq $30,%xmm3 1123 movdqa %xmm5,%xmm6 1124 psrlq $40,%xmm4 1125 psrlq $26,%xmm6 1126 pand %xmm7,%xmm5 1127 pand %xmm7,%xmm6 1128 pand %xmm7,%xmm2 1129 pand %xmm7,%xmm3 1130 por (%ebx),%xmm4 1131 pshufd $16,(%edx),%xmm7 1132 paddd 80(%esp),%xmm5 1133 paddd 96(%esp),%xmm6 1134 paddd 112(%esp),%xmm2 1135 paddd 128(%esp),%xmm3 1136 paddd 144(%esp),%xmm4 1137 movdqa %xmm5,(%esp) 1138 pmuludq %xmm7,%xmm5 1139 movdqa %xmm6,16(%esp) 1140 pmuludq %xmm7,%xmm6 1141 paddq %xmm5,%xmm0 1142 movdqa %xmm2,%xmm5 1143 pmuludq %xmm7,%xmm2 1144 paddq %xmm6,%xmm1 1145 movdqa %xmm3,%xmm6 1146 pmuludq %xmm7,%xmm3 1147 paddq 32(%esp),%xmm2 1148 movdqa %xmm5,32(%esp) 1149 pshufd $16,16(%edx),%xmm5 1150 paddq 48(%esp),%xmm3 1151 movdqa %xmm6,48(%esp) 1152 movdqa %xmm4,%xmm6 1153 pmuludq %xmm7,%xmm4 1154 paddq 64(%esp),%xmm4 1155 movdqa %xmm6,64(%esp) 1156 movdqa %xmm5,%xmm6 1157 pmuludq 48(%esp),%xmm5 1158 movdqa %xmm6,%xmm7 1159 pmuludq 32(%esp),%xmm6 1160 paddq %xmm5,%xmm4 1161 movdqa %xmm7,%xmm5 1162 pmuludq 16(%esp),%xmm7 1163 paddq %xmm6,%xmm3 1164 pshufd $16,80(%edx),%xmm6 1165 pmuludq (%esp),%xmm5 1166 paddq %xmm7,%xmm2 1167 pmuludq 64(%esp),%xmm6 1168 pshufd $16,32(%edx),%xmm7 1169 paddq %xmm5,%xmm1 1170 movdqa %xmm7,%xmm5 1171 pmuludq 32(%esp),%xmm7 1172 paddq %xmm6,%xmm0 1173 movdqa %xmm5,%xmm6 1174 pmuludq 16(%esp),%xmm5 1175 paddq %xmm7,%xmm4 1176 pshufd $16,96(%edx),%xmm7 1177 pmuludq (%esp),%xmm6 1178 paddq %xmm5,%xmm3 1179 movdqa %xmm7,%xmm5 1180 pmuludq 64(%esp),%xmm7 1181 paddq %xmm6,%xmm2 1182 pmuludq 48(%esp),%xmm5 1183 pshufd $16,48(%edx),%xmm6 1184 paddq %xmm7,%xmm1 1185 movdqa %xmm6,%xmm7 1186 pmuludq 16(%esp),%xmm6 1187 paddq %xmm5,%xmm0 1188 pshufd $16,112(%edx),%xmm5 1189 pmuludq (%esp),%xmm7 1190 paddq %xmm6,%xmm4 1191 movdqa %xmm5,%xmm6 1192 pmuludq 64(%esp),%xmm5 1193 paddq %xmm7,%xmm3 1194 movdqa %xmm6,%xmm7 1195 pmuludq 48(%esp),%xmm6 1196 paddq %xmm5,%xmm2 1197 pmuludq 32(%esp),%xmm7 1198 pshufd $16,64(%edx),%xmm5 1199 paddq %xmm6,%xmm1 1200 pshufd $16,128(%edx),%xmm6 1201 pmuludq (%esp),%xmm5 1202 paddq %xmm7,%xmm0 1203 movdqa %xmm6,%xmm7 1204 pmuludq 64(%esp),%xmm6 1205 paddq %xmm5,%xmm4 1206 movdqa %xmm7,%xmm5 1207 pmuludq 16(%esp),%xmm7 1208 paddq %xmm6,%xmm3 1209 movdqa %xmm5,%xmm6 1210 pmuludq 32(%esp),%xmm5 1211 paddq %xmm7,%xmm0 1212 pmuludq 48(%esp),%xmm6 1213 movdqa 64(%ebx),%xmm7 1214 paddq %xmm5,%xmm1 1215 paddq %xmm6,%xmm2 1216 L017short_tail: 1217 pshufd $78,%xmm4,%xmm6 1218 pshufd $78,%xmm3,%xmm5 1219 paddq %xmm6,%xmm4 1220 paddq %xmm5,%xmm3 1221 pshufd $78,%xmm0,%xmm6 1222 pshufd $78,%xmm1,%xmm5 1223 paddq %xmm6,%xmm0 1224 paddq %xmm5,%xmm1 1225 pshufd $78,%xmm2,%xmm6 1226 movdqa %xmm3,%xmm5 1227 pand %xmm7,%xmm3 1228 psrlq $26,%xmm5 1229 paddq %xmm6,%xmm2 1230 paddq %xmm4,%xmm5 1231 movdqa %xmm0,%xmm6 1232 pand %xmm7,%xmm0 1233 psrlq $26,%xmm6 1234 movdqa %xmm5,%xmm4 1235 paddq %xmm1,%xmm6 1236 psrlq $26,%xmm5 1237 pand %xmm7,%xmm4 1238 movdqa %xmm6,%xmm1 1239 psrlq $26,%xmm6 1240 paddd %xmm5,%xmm0 1241 psllq $2,%xmm5 1242 paddq %xmm2,%xmm6 1243 paddq %xmm0,%xmm5 1244 pand %xmm7,%xmm1 1245 movdqa %xmm6,%xmm2 1246 psrlq $26,%xmm6 1247 pand %xmm7,%xmm2 1248 paddd %xmm3,%xmm6 1249 movdqa %xmm5,%xmm0 1250 psrlq $26,%xmm5 1251 movdqa %xmm6,%xmm3 1252 psrlq $26,%xmm6 1253 pand %xmm7,%xmm0 1254 paddd %xmm5,%xmm1 1255 pand %xmm7,%xmm3 1256 paddd %xmm6,%xmm4 1257 L013done: 1258 movd %xmm0,-48(%edi) 1259 movd %xmm1,-44(%edi) 1260 movd %xmm2,-40(%edi) 1261 movd %xmm3,-36(%edi) 1262 movd %xmm4,-32(%edi) 1263 movl %ebp,%esp 1264 L007nodata: 1265 popl %edi 1266 popl %esi 1267 popl %ebx 1268 popl %ebp 1269 ret 1270 .align 5,0x90 1271 .align 4 1272 __poly1305_emit_sse2: 1273 %ifdef __CET__ 1274 1275 .byte 243,15,30,251 1276 %endif 1277 1278 pushl %ebp 1279 pushl %ebx 1280 pushl %esi 1281 pushl %edi 1282 movl 20(%esp),%ebp 1283 cmpl $0,20(%ebp) 1284 je Lenter_emit 1285 movl (%ebp),%eax 1286 movl 4(%ebp),%edi 1287 movl 8(%ebp),%ecx 1288 movl 12(%ebp),%edx 1289 movl 16(%ebp),%esi 1290 movl %edi,%ebx 1291 shll $26,%edi 1292 shrl $6,%ebx 1293 addl %edi,%eax 1294 movl %ecx,%edi 1295 adcl $0,%ebx 1296 shll $20,%edi 1297 shrl $12,%ecx 1298 addl %edi,%ebx 1299 movl %edx,%edi 1300 adcl $0,%ecx 1301 shll $14,%edi 1302 shrl $18,%edx 1303 addl %edi,%ecx 1304 movl %esi,%edi 1305 adcl $0,%edx 1306 shll $8,%edi 1307 shrl $24,%esi 1308 addl %edi,%edx 1309 adcl $0,%esi 1310 movl %esi,%edi 1311 andl $3,%esi 1312 shrl $2,%edi 1313 leal (%edi,%edi,4),%ebp 1314 movl 24(%esp),%edi 1315 addl %ebp,%eax 1316 movl 28(%esp),%ebp 1317 adcl $0,%ebx 1318 adcl $0,%ecx 1319 adcl $0,%edx 1320 adcl $0,%esi 1321 movd %eax,%xmm0 1322 addl $5,%eax 1323 movd %ebx,%xmm1 1324 adcl $0,%ebx 1325 movd %ecx,%xmm2 1326 adcl $0,%ecx 1327 movd %edx,%xmm3 1328 adcl $0,%edx 1329 adcl $0,%esi 1330 shrl $2,%esi 1331 negl %esi 1332 andl %esi,%eax 1333 andl %esi,%ebx 1334 andl %esi,%ecx 1335 andl %esi,%edx 1336 movl %eax,(%edi) 1337 movd %xmm0,%eax 1338 movl %ebx,4(%edi) 1339 movd %xmm1,%ebx 1340 movl %ecx,8(%edi) 1341 movd %xmm2,%ecx 1342 movl %edx,12(%edi) 1343 movd %xmm3,%edx 1344 notl %esi 1345 andl %esi,%eax 1346 andl %esi,%ebx 1347 orl (%edi),%eax 1348 andl %esi,%ecx 1349 orl 4(%edi),%ebx 1350 andl %esi,%edx 1351 orl 8(%edi),%ecx 1352 orl 12(%edi),%edx 1353 addl (%ebp),%eax 1354 adcl 4(%ebp),%ebx 1355 movl %eax,(%edi) 1356 adcl 8(%ebp),%ecx 1357 movl %ebx,4(%edi) 1358 adcl 12(%ebp),%edx 1359 movl %ecx,8(%edi) 1360 movl %edx,12(%edi) 1361 popl %edi 1362 popl %esi 1363 popl %ebx 1364 popl %ebp 1365 ret 1366 .align 5,0x90 1367 .align 4 1368 __poly1305_init_avx2: 1369 %ifdef __CET__ 1370 1371 .byte 243,15,30,251 1372 %endif 1373 1374 vmovdqu 24(%edi),%xmm4 1375 leal 48(%edi),%edi 1376 movl %esp,%ebp 1377 subl $224,%esp 1378 andl $-16,%esp 1379 vmovdqa 64(%ebx),%xmm7 1380 vpand %xmm7,%xmm4,%xmm0 1381 vpsrlq $26,%xmm4,%xmm1 1382 vpsrldq $6,%xmm4,%xmm3 1383 vpand %xmm7,%xmm1,%xmm1 1384 vpsrlq $4,%xmm3,%xmm2 1385 vpsrlq $30,%xmm3,%xmm3 1386 vpand %xmm7,%xmm2,%xmm2 1387 vpand %xmm7,%xmm3,%xmm3 1388 vpsrldq $13,%xmm4,%xmm4 1389 leal 144(%esp),%edx 1390 movl $2,%ecx 1391 L018square: 1392 vmovdqa %xmm0,(%esp) 1393 vmovdqa %xmm1,16(%esp) 1394 vmovdqa %xmm2,32(%esp) 1395 vmovdqa %xmm3,48(%esp) 1396 vmovdqa %xmm4,64(%esp) 1397 vpslld $2,%xmm1,%xmm6 1398 vpslld $2,%xmm2,%xmm5 1399 vpaddd %xmm1,%xmm6,%xmm6 1400 vpaddd %xmm2,%xmm5,%xmm5 1401 vmovdqa %xmm6,80(%esp) 1402 vmovdqa %xmm5,96(%esp) 1403 vpslld $2,%xmm3,%xmm6 1404 vpslld $2,%xmm4,%xmm5 1405 vpaddd %xmm3,%xmm6,%xmm6 1406 vpaddd %xmm4,%xmm5,%xmm5 1407 vmovdqa %xmm6,112(%esp) 1408 vmovdqa %xmm5,128(%esp) 1409 vpshufd $68,%xmm0,%xmm5 1410 vmovdqa %xmm1,%xmm6 1411 vpshufd $68,%xmm1,%xmm1 1412 vpshufd $68,%xmm2,%xmm2 1413 vpshufd $68,%xmm3,%xmm3 1414 vpshufd $68,%xmm4,%xmm4 1415 vmovdqa %xmm5,(%edx) 1416 vmovdqa %xmm1,16(%edx) 1417 vmovdqa %xmm2,32(%edx) 1418 vmovdqa %xmm3,48(%edx) 1419 vmovdqa %xmm4,64(%edx) 1420 vpmuludq %xmm0,%xmm4,%xmm4 1421 vpmuludq %xmm0,%xmm3,%xmm3 1422 vpmuludq %xmm0,%xmm2,%xmm2 1423 vpmuludq %xmm0,%xmm1,%xmm1 1424 vpmuludq %xmm0,%xmm5,%xmm0 1425 vpmuludq 48(%edx),%xmm6,%xmm5 1426 vpaddq %xmm5,%xmm4,%xmm4 1427 vpmuludq 32(%edx),%xmm6,%xmm7 1428 vpaddq %xmm7,%xmm3,%xmm3 1429 vpmuludq 16(%edx),%xmm6,%xmm5 1430 vpaddq %xmm5,%xmm2,%xmm2 1431 vmovdqa 80(%esp),%xmm7 1432 vpmuludq (%edx),%xmm6,%xmm6 1433 vpaddq %xmm6,%xmm1,%xmm1 1434 vmovdqa 32(%esp),%xmm5 1435 vpmuludq 64(%edx),%xmm7,%xmm7 1436 vpaddq %xmm7,%xmm0,%xmm0 1437 vpmuludq 32(%edx),%xmm5,%xmm6 1438 vpaddq %xmm6,%xmm4,%xmm4 1439 vpmuludq 16(%edx),%xmm5,%xmm7 1440 vpaddq %xmm7,%xmm3,%xmm3 1441 vmovdqa 96(%esp),%xmm6 1442 vpmuludq (%edx),%xmm5,%xmm5 1443 vpaddq %xmm5,%xmm2,%xmm2 1444 vpmuludq 64(%edx),%xmm6,%xmm7 1445 vpaddq %xmm7,%xmm1,%xmm1 1446 vmovdqa 48(%esp),%xmm5 1447 vpmuludq 48(%edx),%xmm6,%xmm6 1448 vpaddq %xmm6,%xmm0,%xmm0 1449 vpmuludq 16(%edx),%xmm5,%xmm7 1450 vpaddq %xmm7,%xmm4,%xmm4 1451 vmovdqa 112(%esp),%xmm6 1452 vpmuludq (%edx),%xmm5,%xmm5 1453 vpaddq %xmm5,%xmm3,%xmm3 1454 vpmuludq 64(%edx),%xmm6,%xmm7 1455 vpaddq %xmm7,%xmm2,%xmm2 1456 vpmuludq 48(%edx),%xmm6,%xmm5 1457 vpaddq %xmm5,%xmm1,%xmm1 1458 vmovdqa 64(%esp),%xmm7 1459 vpmuludq 32(%edx),%xmm6,%xmm6 1460 vpaddq %xmm6,%xmm0,%xmm0 1461 vmovdqa 128(%esp),%xmm5 1462 vpmuludq (%edx),%xmm7,%xmm7 1463 vpaddq %xmm7,%xmm4,%xmm4 1464 vpmuludq 64(%edx),%xmm5,%xmm6 1465 vpaddq %xmm6,%xmm3,%xmm3 1466 vpmuludq 16(%edx),%xmm5,%xmm7 1467 vpaddq %xmm7,%xmm0,%xmm0 1468 vpmuludq 32(%edx),%xmm5,%xmm6 1469 vpaddq %xmm6,%xmm1,%xmm1 1470 vmovdqa 64(%ebx),%xmm7 1471 vpmuludq 48(%edx),%xmm5,%xmm5 1472 vpaddq %xmm5,%xmm2,%xmm2 1473 vpsrlq $26,%xmm3,%xmm5 1474 vpand %xmm7,%xmm3,%xmm3 1475 vpsrlq $26,%xmm0,%xmm6 1476 vpand %xmm7,%xmm0,%xmm0 1477 vpaddq %xmm5,%xmm4,%xmm4 1478 vpaddq %xmm6,%xmm1,%xmm1 1479 vpsrlq $26,%xmm4,%xmm5 1480 vpand %xmm7,%xmm4,%xmm4 1481 vpsrlq $26,%xmm1,%xmm6 1482 vpand %xmm7,%xmm1,%xmm1 1483 vpaddq %xmm6,%xmm2,%xmm2 1484 vpaddd %xmm5,%xmm0,%xmm0 1485 vpsllq $2,%xmm5,%xmm5 1486 vpsrlq $26,%xmm2,%xmm6 1487 vpand %xmm7,%xmm2,%xmm2 1488 vpaddd %xmm5,%xmm0,%xmm0 1489 vpaddd %xmm6,%xmm3,%xmm3 1490 vpsrlq $26,%xmm3,%xmm6 1491 vpsrlq $26,%xmm0,%xmm5 1492 vpand %xmm7,%xmm0,%xmm0 1493 vpand %xmm7,%xmm3,%xmm3 1494 vpaddd %xmm5,%xmm1,%xmm1 1495 vpaddd %xmm6,%xmm4,%xmm4 1496 decl %ecx 1497 jz L019square_break 1498 vpunpcklqdq (%esp),%xmm0,%xmm0 1499 vpunpcklqdq 16(%esp),%xmm1,%xmm1 1500 vpunpcklqdq 32(%esp),%xmm2,%xmm2 1501 vpunpcklqdq 48(%esp),%xmm3,%xmm3 1502 vpunpcklqdq 64(%esp),%xmm4,%xmm4 1503 jmp L018square 1504 L019square_break: 1505 vpsllq $32,%xmm0,%xmm0 1506 vpsllq $32,%xmm1,%xmm1 1507 vpsllq $32,%xmm2,%xmm2 1508 vpsllq $32,%xmm3,%xmm3 1509 vpsllq $32,%xmm4,%xmm4 1510 vpor (%esp),%xmm0,%xmm0 1511 vpor 16(%esp),%xmm1,%xmm1 1512 vpor 32(%esp),%xmm2,%xmm2 1513 vpor 48(%esp),%xmm3,%xmm3 1514 vpor 64(%esp),%xmm4,%xmm4 1515 vpshufd $141,%xmm0,%xmm0 1516 vpshufd $141,%xmm1,%xmm1 1517 vpshufd $141,%xmm2,%xmm2 1518 vpshufd $141,%xmm3,%xmm3 1519 vpshufd $141,%xmm4,%xmm4 1520 vmovdqu %xmm0,(%edi) 1521 vmovdqu %xmm1,16(%edi) 1522 vmovdqu %xmm2,32(%edi) 1523 vmovdqu %xmm3,48(%edi) 1524 vmovdqu %xmm4,64(%edi) 1525 vpslld $2,%xmm1,%xmm6 1526 vpslld $2,%xmm2,%xmm5 1527 vpaddd %xmm1,%xmm6,%xmm6 1528 vpaddd %xmm2,%xmm5,%xmm5 1529 vmovdqu %xmm6,80(%edi) 1530 vmovdqu %xmm5,96(%edi) 1531 vpslld $2,%xmm3,%xmm6 1532 vpslld $2,%xmm4,%xmm5 1533 vpaddd %xmm3,%xmm6,%xmm6 1534 vpaddd %xmm4,%xmm5,%xmm5 1535 vmovdqu %xmm6,112(%edi) 1536 vmovdqu %xmm5,128(%edi) 1537 movl %ebp,%esp 1538 leal -48(%edi),%edi 1539 ret 1540 .align 5,0x90 1541 .align 4 1542 __poly1305_blocks_avx2: 1543 %ifdef __CET__ 1544 1545 .byte 243,15,30,251 1546 %endif 1547 1548 pushl %ebp 1549 pushl %ebx 1550 pushl %esi 1551 pushl %edi 1552 movl 20(%esp),%edi 1553 movl 24(%esp),%esi 1554 movl 28(%esp),%ecx 1555 movl 20(%edi),%eax 1556 andl $-16,%ecx 1557 jz L020nodata 1558 cmpl $64,%ecx 1559 jae L021enter_avx2 1560 testl %eax,%eax 1561 jz Lenter_blocks 1562 L021enter_avx2: 1563 vzeroupper 1564 call L022pic_point 1565 L022pic_point: 1566 popl %ebx 1567 leal Lconst_sse2-L022pic_point(%ebx),%ebx 1568 testl %eax,%eax 1569 jnz L023base2_26 1570 call __poly1305_init_avx2 1571 movl (%edi),%eax 1572 movl 3(%edi),%ecx 1573 movl 6(%edi),%edx 1574 movl 9(%edi),%esi 1575 movl 13(%edi),%ebp 1576 shrl $2,%ecx 1577 andl $67108863,%eax 1578 shrl $4,%edx 1579 andl $67108863,%ecx 1580 shrl $6,%esi 1581 andl $67108863,%edx 1582 movl %eax,(%edi) 1583 movl %ecx,4(%edi) 1584 movl %edx,8(%edi) 1585 movl %esi,12(%edi) 1586 movl %ebp,16(%edi) 1587 movl $1,20(%edi) 1588 movl 24(%esp),%esi 1589 movl 28(%esp),%ecx 1590 L023base2_26: 1591 movl 32(%esp),%eax 1592 movl %esp,%ebp 1593 subl $448,%esp 1594 andl $-512,%esp 1595 vmovdqu 48(%edi),%xmm0 1596 leal 288(%esp),%edx 1597 vmovdqu 64(%edi),%xmm1 1598 vmovdqu 80(%edi),%xmm2 1599 vmovdqu 96(%edi),%xmm3 1600 vmovdqu 112(%edi),%xmm4 1601 leal 48(%edi),%edi 1602 vpermq $64,%ymm0,%ymm0 1603 vpermq $64,%ymm1,%ymm1 1604 vpermq $64,%ymm2,%ymm2 1605 vpermq $64,%ymm3,%ymm3 1606 vpermq $64,%ymm4,%ymm4 1607 vpshufd $200,%ymm0,%ymm0 1608 vpshufd $200,%ymm1,%ymm1 1609 vpshufd $200,%ymm2,%ymm2 1610 vpshufd $200,%ymm3,%ymm3 1611 vpshufd $200,%ymm4,%ymm4 1612 vmovdqa %ymm0,-128(%edx) 1613 vmovdqu 80(%edi),%xmm0 1614 vmovdqa %ymm1,-96(%edx) 1615 vmovdqu 96(%edi),%xmm1 1616 vmovdqa %ymm2,-64(%edx) 1617 vmovdqu 112(%edi),%xmm2 1618 vmovdqa %ymm3,-32(%edx) 1619 vmovdqu 128(%edi),%xmm3 1620 vmovdqa %ymm4,(%edx) 1621 vpermq $64,%ymm0,%ymm0 1622 vpermq $64,%ymm1,%ymm1 1623 vpermq $64,%ymm2,%ymm2 1624 vpermq $64,%ymm3,%ymm3 1625 vpshufd $200,%ymm0,%ymm0 1626 vpshufd $200,%ymm1,%ymm1 1627 vpshufd $200,%ymm2,%ymm2 1628 vpshufd $200,%ymm3,%ymm3 1629 vmovdqa %ymm0,32(%edx) 1630 vmovd -48(%edi),%xmm0 1631 vmovdqa %ymm1,64(%edx) 1632 vmovd -44(%edi),%xmm1 1633 vmovdqa %ymm2,96(%edx) 1634 vmovd -40(%edi),%xmm2 1635 vmovdqa %ymm3,128(%edx) 1636 vmovd -36(%edi),%xmm3 1637 vmovd -32(%edi),%xmm4 1638 vmovdqa 64(%ebx),%ymm7 1639 negl %eax 1640 testl $63,%ecx 1641 jz L024even 1642 movl %ecx,%edx 1643 andl $-64,%ecx 1644 andl $63,%edx 1645 vmovdqu (%esi),%xmm5 1646 cmpl $32,%edx 1647 jb L025one 1648 vmovdqu 16(%esi),%xmm6 1649 je L026two 1650 vinserti128 $1,32(%esi),%ymm5,%ymm5 1651 leal 48(%esi),%esi 1652 leal 8(%ebx),%ebx 1653 leal 296(%esp),%edx 1654 jmp L027tail 1655 L026two: 1656 leal 32(%esi),%esi 1657 leal 16(%ebx),%ebx 1658 leal 304(%esp),%edx 1659 jmp L027tail 1660 L025one: 1661 leal 16(%esi),%esi 1662 vpxor %ymm6,%ymm6,%ymm6 1663 leal 32(%ebx,%eax,8),%ebx 1664 leal 312(%esp),%edx 1665 jmp L027tail 1666 .align 5,0x90 1667 L024even: 1668 vmovdqu (%esi),%xmm5 1669 vmovdqu 16(%esi),%xmm6 1670 vinserti128 $1,32(%esi),%ymm5,%ymm5 1671 vinserti128 $1,48(%esi),%ymm6,%ymm6 1672 leal 64(%esi),%esi 1673 subl $64,%ecx 1674 jz L027tail 1675 L028loop: 1676 vmovdqa %ymm2,64(%esp) 1677 vpsrldq $6,%ymm5,%ymm2 1678 vmovdqa %ymm0,(%esp) 1679 vpsrldq $6,%ymm6,%ymm0 1680 vmovdqa %ymm1,32(%esp) 1681 vpunpckhqdq %ymm6,%ymm5,%ymm1 1682 vpunpcklqdq %ymm6,%ymm5,%ymm5 1683 vpunpcklqdq %ymm0,%ymm2,%ymm2 1684 vpsrlq $30,%ymm2,%ymm0 1685 vpsrlq $4,%ymm2,%ymm2 1686 vpsrlq $26,%ymm5,%ymm6 1687 vpsrlq $40,%ymm1,%ymm1 1688 vpand %ymm7,%ymm2,%ymm2 1689 vpand %ymm7,%ymm5,%ymm5 1690 vpand %ymm7,%ymm6,%ymm6 1691 vpand %ymm7,%ymm0,%ymm0 1692 vpor (%ebx),%ymm1,%ymm1 1693 vpaddq 64(%esp),%ymm2,%ymm2 1694 vpaddq (%esp),%ymm5,%ymm5 1695 vpaddq 32(%esp),%ymm6,%ymm6 1696 vpaddq %ymm3,%ymm0,%ymm0 1697 vpaddq %ymm4,%ymm1,%ymm1 1698 vpmuludq -96(%edx),%ymm2,%ymm3 1699 vmovdqa %ymm6,32(%esp) 1700 vpmuludq -64(%edx),%ymm2,%ymm4 1701 vmovdqa %ymm0,96(%esp) 1702 vpmuludq 96(%edx),%ymm2,%ymm0 1703 vmovdqa %ymm1,128(%esp) 1704 vpmuludq 128(%edx),%ymm2,%ymm1 1705 vpmuludq -128(%edx),%ymm2,%ymm2 1706 vpmuludq -32(%edx),%ymm5,%ymm7 1707 vpaddq %ymm7,%ymm3,%ymm3 1708 vpmuludq (%edx),%ymm5,%ymm6 1709 vpaddq %ymm6,%ymm4,%ymm4 1710 vpmuludq -128(%edx),%ymm5,%ymm7 1711 vpaddq %ymm7,%ymm0,%ymm0 1712 vmovdqa 32(%esp),%ymm7 1713 vpmuludq -96(%edx),%ymm5,%ymm6 1714 vpaddq %ymm6,%ymm1,%ymm1 1715 vpmuludq -64(%edx),%ymm5,%ymm5 1716 vpaddq %ymm5,%ymm2,%ymm2 1717 vpmuludq -64(%edx),%ymm7,%ymm6 1718 vpaddq %ymm6,%ymm3,%ymm3 1719 vpmuludq -32(%edx),%ymm7,%ymm5 1720 vpaddq %ymm5,%ymm4,%ymm4 1721 vpmuludq 128(%edx),%ymm7,%ymm6 1722 vpaddq %ymm6,%ymm0,%ymm0 1723 vmovdqa 96(%esp),%ymm6 1724 vpmuludq -128(%edx),%ymm7,%ymm5 1725 vpaddq %ymm5,%ymm1,%ymm1 1726 vpmuludq -96(%edx),%ymm7,%ymm7 1727 vpaddq %ymm7,%ymm2,%ymm2 1728 vpmuludq -128(%edx),%ymm6,%ymm5 1729 vpaddq %ymm5,%ymm3,%ymm3 1730 vpmuludq -96(%edx),%ymm6,%ymm7 1731 vpaddq %ymm7,%ymm4,%ymm4 1732 vpmuludq 64(%edx),%ymm6,%ymm5 1733 vpaddq %ymm5,%ymm0,%ymm0 1734 vmovdqa 128(%esp),%ymm5 1735 vpmuludq 96(%edx),%ymm6,%ymm7 1736 vpaddq %ymm7,%ymm1,%ymm1 1737 vpmuludq 128(%edx),%ymm6,%ymm6 1738 vpaddq %ymm6,%ymm2,%ymm2 1739 vpmuludq 128(%edx),%ymm5,%ymm7 1740 vpaddq %ymm7,%ymm3,%ymm3 1741 vpmuludq 32(%edx),%ymm5,%ymm6 1742 vpaddq %ymm6,%ymm0,%ymm0 1743 vpmuludq -128(%edx),%ymm5,%ymm7 1744 vpaddq %ymm7,%ymm4,%ymm4 1745 vmovdqa 64(%ebx),%ymm7 1746 vpmuludq 64(%edx),%ymm5,%ymm6 1747 vpaddq %ymm6,%ymm1,%ymm1 1748 vpmuludq 96(%edx),%ymm5,%ymm5 1749 vpaddq %ymm5,%ymm2,%ymm2 1750 vpsrlq $26,%ymm3,%ymm5 1751 vpand %ymm7,%ymm3,%ymm3 1752 vpsrlq $26,%ymm0,%ymm6 1753 vpand %ymm7,%ymm0,%ymm0 1754 vpaddq %ymm5,%ymm4,%ymm4 1755 vpaddq %ymm6,%ymm1,%ymm1 1756 vpsrlq $26,%ymm4,%ymm5 1757 vpand %ymm7,%ymm4,%ymm4 1758 vpsrlq $26,%ymm1,%ymm6 1759 vpand %ymm7,%ymm1,%ymm1 1760 vpaddq %ymm6,%ymm2,%ymm2 1761 vpaddq %ymm5,%ymm0,%ymm0 1762 vpsllq $2,%ymm5,%ymm5 1763 vpsrlq $26,%ymm2,%ymm6 1764 vpand %ymm7,%ymm2,%ymm2 1765 vpaddq %ymm5,%ymm0,%ymm0 1766 vpaddq %ymm6,%ymm3,%ymm3 1767 vpsrlq $26,%ymm3,%ymm6 1768 vpsrlq $26,%ymm0,%ymm5 1769 vpand %ymm7,%ymm0,%ymm0 1770 vpand %ymm7,%ymm3,%ymm3 1771 vpaddq %ymm5,%ymm1,%ymm1 1772 vpaddq %ymm6,%ymm4,%ymm4 1773 vmovdqu (%esi),%xmm5 1774 vmovdqu 16(%esi),%xmm6 1775 vinserti128 $1,32(%esi),%ymm5,%ymm5 1776 vinserti128 $1,48(%esi),%ymm6,%ymm6 1777 leal 64(%esi),%esi 1778 subl $64,%ecx 1779 jnz L028loop 1780 L027tail: 1781 vmovdqa %ymm2,64(%esp) 1782 vpsrldq $6,%ymm5,%ymm2 1783 vmovdqa %ymm0,(%esp) 1784 vpsrldq $6,%ymm6,%ymm0 1785 vmovdqa %ymm1,32(%esp) 1786 vpunpckhqdq %ymm6,%ymm5,%ymm1 1787 vpunpcklqdq %ymm6,%ymm5,%ymm5 1788 vpunpcklqdq %ymm0,%ymm2,%ymm2 1789 vpsrlq $30,%ymm2,%ymm0 1790 vpsrlq $4,%ymm2,%ymm2 1791 vpsrlq $26,%ymm5,%ymm6 1792 vpsrlq $40,%ymm1,%ymm1 1793 vpand %ymm7,%ymm2,%ymm2 1794 vpand %ymm7,%ymm5,%ymm5 1795 vpand %ymm7,%ymm6,%ymm6 1796 vpand %ymm7,%ymm0,%ymm0 1797 vpor (%ebx),%ymm1,%ymm1 1798 andl $-64,%ebx 1799 vpaddq 64(%esp),%ymm2,%ymm2 1800 vpaddq (%esp),%ymm5,%ymm5 1801 vpaddq 32(%esp),%ymm6,%ymm6 1802 vpaddq %ymm3,%ymm0,%ymm0 1803 vpaddq %ymm4,%ymm1,%ymm1 1804 vpmuludq -92(%edx),%ymm2,%ymm3 1805 vmovdqa %ymm6,32(%esp) 1806 vpmuludq -60(%edx),%ymm2,%ymm4 1807 vmovdqa %ymm0,96(%esp) 1808 vpmuludq 100(%edx),%ymm2,%ymm0 1809 vmovdqa %ymm1,128(%esp) 1810 vpmuludq 132(%edx),%ymm2,%ymm1 1811 vpmuludq -124(%edx),%ymm2,%ymm2 1812 vpmuludq -28(%edx),%ymm5,%ymm7 1813 vpaddq %ymm7,%ymm3,%ymm3 1814 vpmuludq 4(%edx),%ymm5,%ymm6 1815 vpaddq %ymm6,%ymm4,%ymm4 1816 vpmuludq -124(%edx),%ymm5,%ymm7 1817 vpaddq %ymm7,%ymm0,%ymm0 1818 vmovdqa 32(%esp),%ymm7 1819 vpmuludq -92(%edx),%ymm5,%ymm6 1820 vpaddq %ymm6,%ymm1,%ymm1 1821 vpmuludq -60(%edx),%ymm5,%ymm5 1822 vpaddq %ymm5,%ymm2,%ymm2 1823 vpmuludq -60(%edx),%ymm7,%ymm6 1824 vpaddq %ymm6,%ymm3,%ymm3 1825 vpmuludq -28(%edx),%ymm7,%ymm5 1826 vpaddq %ymm5,%ymm4,%ymm4 1827 vpmuludq 132(%edx),%ymm7,%ymm6 1828 vpaddq %ymm6,%ymm0,%ymm0 1829 vmovdqa 96(%esp),%ymm6 1830 vpmuludq -124(%edx),%ymm7,%ymm5 1831 vpaddq %ymm5,%ymm1,%ymm1 1832 vpmuludq -92(%edx),%ymm7,%ymm7 1833 vpaddq %ymm7,%ymm2,%ymm2 1834 vpmuludq -124(%edx),%ymm6,%ymm5 1835 vpaddq %ymm5,%ymm3,%ymm3 1836 vpmuludq -92(%edx),%ymm6,%ymm7 1837 vpaddq %ymm7,%ymm4,%ymm4 1838 vpmuludq 68(%edx),%ymm6,%ymm5 1839 vpaddq %ymm5,%ymm0,%ymm0 1840 vmovdqa 128(%esp),%ymm5 1841 vpmuludq 100(%edx),%ymm6,%ymm7 1842 vpaddq %ymm7,%ymm1,%ymm1 1843 vpmuludq 132(%edx),%ymm6,%ymm6 1844 vpaddq %ymm6,%ymm2,%ymm2 1845 vpmuludq 132(%edx),%ymm5,%ymm7 1846 vpaddq %ymm7,%ymm3,%ymm3 1847 vpmuludq 36(%edx),%ymm5,%ymm6 1848 vpaddq %ymm6,%ymm0,%ymm0 1849 vpmuludq -124(%edx),%ymm5,%ymm7 1850 vpaddq %ymm7,%ymm4,%ymm4 1851 vmovdqa 64(%ebx),%ymm7 1852 vpmuludq 68(%edx),%ymm5,%ymm6 1853 vpaddq %ymm6,%ymm1,%ymm1 1854 vpmuludq 100(%edx),%ymm5,%ymm5 1855 vpaddq %ymm5,%ymm2,%ymm2 1856 vpsrldq $8,%ymm4,%ymm5 1857 vpsrldq $8,%ymm3,%ymm6 1858 vpaddq %ymm5,%ymm4,%ymm4 1859 vpsrldq $8,%ymm0,%ymm5 1860 vpaddq %ymm6,%ymm3,%ymm3 1861 vpsrldq $8,%ymm1,%ymm6 1862 vpaddq %ymm5,%ymm0,%ymm0 1863 vpsrldq $8,%ymm2,%ymm5 1864 vpaddq %ymm6,%ymm1,%ymm1 1865 vpermq $2,%ymm4,%ymm6 1866 vpaddq %ymm5,%ymm2,%ymm2 1867 vpermq $2,%ymm3,%ymm5 1868 vpaddq %ymm6,%ymm4,%ymm4 1869 vpermq $2,%ymm0,%ymm6 1870 vpaddq %ymm5,%ymm3,%ymm3 1871 vpermq $2,%ymm1,%ymm5 1872 vpaddq %ymm6,%ymm0,%ymm0 1873 vpermq $2,%ymm2,%ymm6 1874 vpaddq %ymm5,%ymm1,%ymm1 1875 vpaddq %ymm6,%ymm2,%ymm2 1876 vpsrlq $26,%ymm3,%ymm5 1877 vpand %ymm7,%ymm3,%ymm3 1878 vpsrlq $26,%ymm0,%ymm6 1879 vpand %ymm7,%ymm0,%ymm0 1880 vpaddq %ymm5,%ymm4,%ymm4 1881 vpaddq %ymm6,%ymm1,%ymm1 1882 vpsrlq $26,%ymm4,%ymm5 1883 vpand %ymm7,%ymm4,%ymm4 1884 vpsrlq $26,%ymm1,%ymm6 1885 vpand %ymm7,%ymm1,%ymm1 1886 vpaddq %ymm6,%ymm2,%ymm2 1887 vpaddq %ymm5,%ymm0,%ymm0 1888 vpsllq $2,%ymm5,%ymm5 1889 vpsrlq $26,%ymm2,%ymm6 1890 vpand %ymm7,%ymm2,%ymm2 1891 vpaddq %ymm5,%ymm0,%ymm0 1892 vpaddq %ymm6,%ymm3,%ymm3 1893 vpsrlq $26,%ymm3,%ymm6 1894 vpsrlq $26,%ymm0,%ymm5 1895 vpand %ymm7,%ymm0,%ymm0 1896 vpand %ymm7,%ymm3,%ymm3 1897 vpaddq %ymm5,%ymm1,%ymm1 1898 vpaddq %ymm6,%ymm4,%ymm4 1899 cmpl $0,%ecx 1900 je L029done 1901 vpshufd $252,%xmm0,%xmm0 1902 leal 288(%esp),%edx 1903 vpshufd $252,%xmm1,%xmm1 1904 vpshufd $252,%xmm2,%xmm2 1905 vpshufd $252,%xmm3,%xmm3 1906 vpshufd $252,%xmm4,%xmm4 1907 jmp L024even 1908 .align 4,0x90 1909 L029done: 1910 vmovd %xmm0,-48(%edi) 1911 vmovd %xmm1,-44(%edi) 1912 vmovd %xmm2,-40(%edi) 1913 vmovd %xmm3,-36(%edi) 1914 vmovd %xmm4,-32(%edi) 1915 vzeroupper 1916 movl %ebp,%esp 1917 L020nodata: 1918 popl %edi 1919 popl %esi 1920 popl %ebx 1921 popl %ebp 1922 ret 1923 .align 6,0x90 1924 Lconst_sse2: 1925 .long 16777216,0,16777216,0,16777216,0,16777216,0 1926 .long 0,0,0,0,0,0,0,0 1927 .long 67108863,0,67108863,0,67108863,0,67108863,0 1928 .long 268435455,268435452,268435452,268435452 1929 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1930 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1931 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1932 .byte 114,103,62,0 1933 .align 2,0x90 1934 .section __IMPORT,__pointers,non_lazy_symbol_pointers 1935 L_OPENSSL_ia32cap_P$non_lazy_ptr: 1936 .indirect_symbol _OPENSSL_ia32cap_P 1937 .long 0 1938 .comm _OPENSSL_ia32cap_P,16,2 1939