;
;   (C) Frank Klemm 1995,99,2000
;   Dedicated to the LAME project
;
;   Scalar (dot) products of float32 vectors for i387, 3DNow! and SSE.
;
;   Syntax : NASM, Intel operand order, 32-bit x86.
;   Helpers: proc / endproc / arg / sp() / pmov / segment_code are not defined
;            in this file; presumably they come from nasm.h.
;   Result : every routine leaves its result in st0.
;   Scratch: eax (p), edx (q), ecx (len), mm0-mm3 or xmm0-xmm3; the SSE
;            reductions that spill to memory adjust esp temporarily.
;

        %include "nasm.h"

;-----------------------------------------------------------------------
; Code-generation macros.
;
; All of them expect eax = p and edx = q and only emit straight-line code.
; SCALAR_OFS is a preprocessor variable used as the running byte offset.
;-----------------------------------------------------------------------

; I387_DOT n
;   Emits the fully unrolled sum of p[i]*q[i] for i = 0 .. n-1 (n >= 1).
;   Pushes one value on the x87 stack (the sum); needs one more slot
;   temporarily.
%macro  I387_DOT 1
        fld     dword [eax]
        fmul    dword [edx]
%assign SCALAR_OFS 4
%rep    (%1) - 1
        fld     dword [eax + SCALAR_OFS]
        fmul    dword [edx + SCALAR_OFS]
        faddp   st1, st0
%assign SCALAR_OFS SCALAR_OFS + 4
%endrep
%endmacro

; I387_MAC4
;   st0 += p[0]*q[0] + p[1]*q[1] + p[2]*q[2] + p[3]*q[3], added one product
;   at a time.  st0 must already hold the running sum.
%macro  I387_MAC4 0
        fld     dword [eax]
        fmul    dword [edx]
        faddp   st1, st0
        fld     dword [eax + 4]
        fmul    dword [edx + 4]
        faddp   st1, st0
        fld     dword [eax + 8]
        fmul    dword [edx + 8]
        faddp   st1, st0
        fld     dword [eax + 12]
        fmul    dword [edx + 12]
        faddp   st1, st0
%endmacro

; DNOW_DOT n
;   n must be a multiple of 4, n >= 4.
;   Leaves two independent 2-lane partial sums in mm0 and mm1;
;   mm2 and mm3 are used as temporaries when n > 4.
%macro  DNOW_DOT 1
        pmov    mm0, qword [eax]
        pmov    mm1, qword [eax + 8]
        pfmul   mm0, qword [edx]
        pfmul   mm1, qword [edx + 8]
%assign SCALAR_OFS 16
%rep    ((%1) / 4) - 1
        pmov    mm2, qword [eax + SCALAR_OFS]
        pmov    mm3, qword [eax + SCALAR_OFS + 8]
        pfmul   mm2, qword [edx + SCALAR_OFS]
        pfmul   mm3, qword [edx + SCALAR_OFS + 8]
        pfadd   mm0, mm2
        pfadd   mm1, mm3
%assign SCALAR_OFS SCALAR_OFS + 16
%endrep
%endmacro

; SSE_DOT n
;   n must be a multiple of 8, n >= 8.
;   Leaves two independent 4-lane partial sums in xmm0 and xmm1;
;   xmm2 and xmm3 are used as temporaries when n > 8.
;   p is read with movups, so it may be unaligned.  q is a direct memory
;   operand of mulps and therefore has to be 16-byte aligned.
%macro  SSE_DOT 1
        movups  xmm0, [eax]
        movups  xmm1, [eax + 16]
        mulps   xmm0, [edx]
        mulps   xmm1, [edx + 16]
%assign SCALAR_OFS 32
%rep    ((%1) / 8) - 1
        movups  xmm2, [eax + SCALAR_OFS]
        movups  xmm3, [eax + SCALAR_OFS + 16]
        mulps   xmm2, [edx + SCALAR_OFS]
        mulps   xmm3, [edx + SCALAR_OFS + 16]
        addps   xmm0, xmm2
        addps   xmm1, xmm3
%assign SCALAR_OFS SCALAR_OFS + 32
%endrep
%endmacro

; SSE_SUM_TO_ST0
;   Combines the partial sums in xmm0/xmm1, spills the four lanes to a
;   16-byte scratch area below esp and adds them up on the x87 stack.
;   esp is restored before the macro ends.
%macro  SSE_SUM_TO_ST0 0
        addps   xmm0, xmm1
        sub     esp, 16
        movups  [esp], xmm0
        fld     dword [esp +  0]
        fadd    dword [esp +  4]
        fadd    dword [esp +  8]
        fadd    dword [esp + 12]
        add     esp, 16
%endmacro


        segment_code

;=======================================================================
; i387 versions
;=======================================================================

; float_t  scalarNN_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q );
;
; Returns in st0 the sum of p[i]*q[i] for i = 0 .. NN-1.
; Products are accumulated strictly in index order.

proc    scalar04_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 4
endproc


proc    scalar08_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 8
endproc


proc    scalar12_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 12
endproc


proc    scalar16_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 16
endproc


proc    scalar20_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 20
endproc


proc    scalar24_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 24
endproc


proc    scalar32_float32_i387
%$p     arg     4
%$q     arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        I387_DOT 32
endproc


; float_t  scalar4n_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q,
;         const size_t            len );
;
; len counts groups of four elements, so 4*len products are summed.
; len must be at least 1: the first group is processed unconditionally and
; the counter is decremented before it is tested, so len = 0 would wrap.

proc    scalar4n_float32_i387
%$p     arg     4
%$q     arg     4
%$len   arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        mov     ecx, [sp(%$len)]
        I387_DOT 4                      ; first group initialises the sum
        dec     ecx
        jz      .i387_4n_done
        add     eax, byte 16
        add     edx, byte 16
.i387_4n_loop:
        I387_MAC4
        add     eax, byte 16
        add     edx, byte 16
        dec     ecx
        jnz     .i387_4n_loop
.i387_4n_done:
endproc


; float_t  scalar1n_float32_i387 (
;         const float32_t* const  p,
;         const float32_t* const  q,
;         const size_t            len );
;
; len counts single elements and may be any value including 0 (result 0.0).
; Bit 0 of len selects one leading element, bit 1 two more, and the
; remaining len/4 groups of four are handled by the loop.

proc    scalar1n_float32_i387
%$p     arg     4
%$q     arg     4
%$len   arg     4
;;;     alloc

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        mov     ecx, [sp(%$len)]
        fldz                            ; sum = 0.0 (was the non-existent mnemonic "fld0")
        shr     ecx, 1                  ; CF = bit 0 of len
        jnc     .i387_1n_pair
        fld     dword [eax]
        fmul    dword [edx]
        faddp   st1, st0
        add     eax, byte 4
        add     edx, byte 4
.i387_1n_pair:
        shr     ecx, 1                  ; CF = bit 1 of len, ZF = (len/4 == 0)
        jnc     .i387_1n_quads
        fld     dword [eax]
        fmul    dword [edx]
        faddp   st1, st0
        fld     dword [eax + 4]
        fmul    dword [edx + 4]
        faddp   st1, st0
        add     eax, byte 8
        add     edx, byte 8
        test    ecx, ecx                ; the adds destroyed ZF from shr; recompute it
.i387_1n_quads:
        jz      .i387_1n_done
.i387_1n_loop:
        I387_MAC4
        add     eax, byte 16
        add     edx, byte 16
        dec     ecx
        jnz     .i387_1n_loop
.i387_1n_done:
endproc


;=======================================================================
; 3DNow! versions
;=======================================================================
;
; Same signatures as the i387 routines.  Each routine keeps two 2-lane
; partial sums in mm0/mm1, adds them, stores the resulting pair of floats
; over the 8 bytes starting at the stack slot of p (presumably the p and q
; argument slots, which are no longer needed), executes femms and adds the
; two floats on the x87 stack.

proc    scalar04_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 4

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar08_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 8

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar12_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 12

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar16_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 16

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar20_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 20

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar24_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 24

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar32_float32_3DNow
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        DNOW_DOT 32

        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


; len counts groups of four elements and must be at least 1 (the counter is
; decremented before it is tested, exactly as in scalar4n_float32_i387).

proc    scalar4n_float32_3DNow
%$p     arg     4
%$q     arg     4
%$len   arg     4

        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]
        mov     ecx, [sp(%$len)]

        DNOW_DOT 4                      ; first group initialises mm0/mm1
        dec     ecx
        jz      .dnow_4n_done

        add     eax, byte 16
        add     edx, byte 16
.dnow_4n_loop:
        pmov    mm2, qword [eax]
        pmov    mm3, qword [eax + 8]
        pfmul   mm2, qword [edx]
        pfmul   mm3, qword [edx + 8]
        add     eax, byte 16
        add     edx, byte 16
        pfadd   mm0, mm2
        pfadd   mm1, mm3
        dec     ecx
        jnz     .dnow_4n_loop

.dnow_4n_done:
        pfadd   mm0, mm1
        pmov    qword [sp(%$p)], mm0
        femms
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


; No 3DNow! implementation for arbitrary lengths: forward to the i387 one.
; This used to jump to scalar24_float32_i387, which ignores len and always
; sums exactly 24 elements.
; NOTE(review): the tail jumps in this file rely on proc emitting no
; prologue code before the jmp - confirm in nasm.h.

proc    scalar1n_float32_3DNow
        jmp     scalar1n_float32_i387
endproc


;=======================================================================
; SSE versions
;=======================================================================
;
; Same signatures as the i387 routines.  Lengths that are not a multiple
; of 8, and the variable-length routines, are forwarded to the i387 code.
; For the native routines q must be 16-byte aligned (memory operand of
; mulps); p may be unaligned (movups).

proc    scalar04_float32_SIMD
        jmp     scalar04_float32_i387
endproc


proc    scalar08_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        SSE_DOT 8
        SSE_SUM_TO_ST0
endproc


proc    scalar12_float32_SIMD
        jmp     scalar12_float32_i387
endproc


proc    scalar16_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        SSE_DOT 16
        SSE_SUM_TO_ST0
endproc


proc    scalar20_float32_SIMD
        jmp     scalar20_float32_i387
endproc


proc    scalar24_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        SSE_DOT 24
        SSE_SUM_TO_ST0
endproc


; Unlike the 8/16/24 variants, this routine reduces the four lanes with
; movhlps/addps and stores only the low two floats, reusing the 8 bytes
; starting at the stack slot of p as scratch instead of moving esp.
; The spill-based reduction used by the siblings was present here only as
; commented-out code; the two reductions add the lanes in a different
; order, so the sibling routines are deliberately left as they were.

proc    scalar32_float32_SIMD
%$p     arg     4
%$q     arg     4
        mov     eax, [sp(%$p)]
        mov     edx, [sp(%$q)]

        SSE_DOT 32

        addps   xmm0, xmm1
        movhlps xmm1, xmm0              ; low half of xmm1 = lanes 2,3 of xmm0
        addps   xmm0, xmm1              ; lanes 0,1 = (x0 + x2), (x1 + x3)
        movlps  [sp(%$p)], xmm0
        fld     dword [sp(%$p)]
        fadd    dword [sp(%$p)+4]
endproc


proc    scalar4n_float32_SIMD
        jmp     scalar4n_float32_i387
endproc


proc    scalar1n_float32_SIMD
        jmp     scalar1n_float32_i387
endproc

; end of scalar.nas