;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Subpel filter coefficient tables.  The "_hw"/"_v" tables hold the taps as
; words (consumed by pmaddwd/pmullw), the "_hb" tables as interleaved signed
; bytes (consumed by pmaddubsw in the SSSE3 paths).  Each group of rows is
; one fractional-pel position; the loaders below index them by mx/my.

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; Bilinear weights: word form (pmullw paths) and interleaved-byte form
; (pmaddubsw paths).  Row k holds the pair (8-k, k) used for position k/8.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

; In PIC builds, table addresses cannot be encoded as absolute displacements;
; functions take an extra GPR (picregq) which is pointed at the "_m" label,
; and all table references go through it.  Non-PIC builds address the tables
; directly and need no extra register.
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb control masks that interleave neighboring source bytes so a single
; pmaddubsw applies two filter taps at once.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

; IDCT multiplier constants; m6/m7 get these in vp8_idct_add below.
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64
cextern pw_256

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
;                                                 uint8_t *src, ptrdiff_t srcstride,
;                                                 int height,   int mx, int my);
;-------------------------------------------------------------------------------

; SSSE3 horizontal/vertical subpel filters; %1 = block width (4 -> MMX regs,
; 8 -> XMM regs via INIT_MMX/INIT_XMM before instantiation).
%macro FILTER_SSSE3 1
; H-only 6-tap filter: each output row filters 6 horizontal neighbors with
; byte coefficients via pmaddubsw, then rounds with pmulhrsw([pw_256]).
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]               ; 3 coefficient rows (48 bytes) per mx
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]              ; 6-tap window starts 2 px left of dst
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5                    ; apply taps 0/1, 2/3, 4/5 pairwise
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]              ; round and shift down by 7
    packuswb  m0, m0                    ; clip to u8
    movh  [dstq], m0                    ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET

; H-only 4-tap filter, same rounding scheme as h6 above.
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4                     ; 2 coefficient rows (32 bytes) per mx
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]              ; 4-tap window starts 1 px left of dst
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2                    ; round and shift down by 7
    packuswb  m0, m0
    movh  [dstq], m0                    ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET

; V-only 4-tap filter: keeps a 3-row pipeline in m0..m2 and loads one new
; row per iteration; pairs of rows are interleaved for pmaddubsw.
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]   ; read new row
    mova      m4, m0
    mova      m0, m1                    ; shift row pipeline down by one
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7                    ; round and shift down by 7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET

; V-only 6-tap filter: 5-row pipeline in m0..m4, one new row per iteration.
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8] ; keep filter base in myq (all regs busy)

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]   ; read new row
    mova      m6, m0
    punpcklbw m6, m5                    ; pair rows 0/5, 1/2, 3/4 for pmaddubsw
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                    ; shift row pipeline down by one
    mova      m2, m3
    pmulhrsw  m6, [pw_256]              ; round and shift down by 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
; Pre-SSSE3 path: widens bytes to words and uses pmaddwd with word
; coefficients, producing 2 pixels per multiply-accumulate group.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]             ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                  ; byte ABCD..
    punpcklbw mm1, mm6                  ; byte->word ABCD
    pshufw    mm0, mm2, 9               ; byte CDEF..
    punpcklbw mm0, mm6                  ; byte->word CDEF
    pshufw    mm3, mm1, 0x94            ; word ABBC
    pshufw    mm1, mm0, 0x94            ; word CDDE
    pmaddwd   mm3, mm4                  ; multiply 2px with F0/F1
    movq      mm0, mm1                  ; backup for second set of pixels
    pmaddwd   mm1, mm5                  ; multiply 2px with F2/F3
    paddd     mm3, mm1                  ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                  ; byte->word EFGH
    pmaddwd   mm0, mm4                  ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94            ; word EFFG
    pmaddwd   mm1, mm5                  ; multiply 2px with F2/F3
    paddd     mm0, mm1                  ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                  ; merge dword->word (4px)
    paddsw    mm3, mm7                  ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                  ; clip and word->bytes
    movd   [dstq], mm3                  ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                       ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]             ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                  ; byte ABCD..
    punpcklbw mm1, mm3                  ; byte->word ABCD
    pshufw    mm0, mm2, 0x9             ; byte CDEF..
    punpckhbw mm2, mm3                  ; byte->word EFGH
    punpcklbw mm0, mm3                  ; byte->word CDEF
    pshufw    mm1, mm1, 0x94            ; word ABBC
    pshufw    mm2, mm2, 0x94            ; word EFFG
    pmaddwd   mm1, mm4                  ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94            ; word CDDE
    movq      mm0, mm3                  ; backup for second set of pixels
    pmaddwd   mm3, mm5                  ; multiply 2px with F2/F3
    paddd     mm1, mm3                  ; add to 1st 2px cache
    movq      mm3, mm2                  ; backup for second set of pixels
    pmaddwd   mm2, mm6                  ; multiply 2px with F4/F5
    paddd     mm1, mm2                  ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]             ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                  ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                  ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                  ; add to 2nd 2px cache
    pxor      mm3, mm3                  ; re-zero; mm3 was used as a temp above
    punpcklbw mm2, mm3                  ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9            ; word GHHI
    pmaddwd   mm2, mm6                  ; multiply 2px with F4/F5
    paddd     mm0, mm2                  ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                  ; merge dword->word (4px)
    paddsw    mm1, mm7                  ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                  ; clip and word->bytes
    movd   [dstq], mm1                  ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                       ; next row
    jg .nextrow
    REP_RET

; 8-wide H-only 4-tap filter, SSE2: byte->word widening + pmullw per tap.
; With >=10 XMM regs (x86-64) all four coefficient rows stay in registers.
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5                     ; 4 coefficient rows (64 bytes) per mx
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]              ; 4 shifted copies of the source row
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5                    ; one tap per shifted copy
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4                    ; + 64 rounding bias
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                    ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET

; 8-wide H-only 6-tap filter, SSE2; same scheme as h4 with six shifted copies.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4                     ; 6 coefficient rows (96 bytes) per mx
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4                    ; sum in a tree to limit saturation loss
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                    ; + 64 rounding bias
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0                    ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET

; Pre-SSSE3 vertical subpel filters (word coefficients, pmullw);
; %1 = block width (4 -> MMXEXT regs, 8 -> SSE2 regs).
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]   ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                    ; shift row pipeline down by one
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6                    ; + 64 rounding bias
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]               ; total: my * 48 -> 6 rows of 16 bytes
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]   ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1                    ; shift row pipeline down by one
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]               ; + 64 rounding bias
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                        ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8

; Bilinear MC, vertical then horizontal variants; two rows per loop iteration.
; SSSE3 uses byte weights + pmaddubsw; pre-SSSE3 widens to words + pmullw.
; Result is (a*(8-f) + b*f + 4) >> 3, done as >>2 then pavgw-with-zero (>>1
; with rounding).
%macro FILTER_BILINEAR 1
%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1                    ; pair adjacent rows for pmaddubsw
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                    ; (x+1)>>1 completes the /8 rounding
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                    ; middle row feeds both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                    ; (x+1)>>1 completes the /8 rounding
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2                     ; two rows per iteration
    jg .nextrow
    REP_RET

%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]      ; interleave px with its right neighbor
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                    ; (x+1)>>1 completes the /8 rounding
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                    ; (x+1)>>1 completes the /8 rounding
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2                     ; two rows per iteration
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
INIT_MMX ssse3
FILTER_BILINEAR 4
INIT_XMM ssse3
FILTER_BILINEAR 8

; Plain 8-wide copy, two rows per iteration.
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

; Plain 16-wide copy; unaligned loads, aligned stores (dst assumed aligned).
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; Add a splatted DC to 4 rows of pixels with unsigned saturation.
; %1 = positive DC (bytes), %2 = negative DC (bytes), %3 = byte offset,
; %4 = load/store instruction (mova/movh); uses dst1q/dst2q/strideq.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2                    ; adding +DC and subtracting -DC
    psubusb   m3, %2                    ;  handles both signs with saturation
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

; DC-only IDCT add for one 4x4 block; clears block[0] on the way.
%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC: (dc + 4) >> 3, splatted across all 8 words
    paddw      m0, [pw_4]
    movd [blockq], m1                   ; zero the coefficient in-place
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                   ; clip; rows end up in 4-byte lanes
    movd  [dst1q], m2
%if cpuflag(sse4)
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
%else
    psrldq     m2, 4
    movd [dst1q+strideq], m2
    psrldq     m2, 4
    movd  [dst2q], m2
    psrldq     m2, 4
    movd [dst2q+strideq], m2
%endif
    RET
%endmacro

INIT_XMM sse2
VP8_IDCT_DC_ADD
INIT_XMM sse4
VP8_IDCT_DC_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; DC-only IDCT add for four horizontally adjacent luma 4x4 blocks at once.
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data: gather the four DCs into one register
    movd      m0, [blockq+32*0]         ; A
    movd      m1, [blockq+32*2]         ; C
    punpcklwd m0, [blockq+32*1]         ; A B
    punpcklwd m1, [blockq+32*3]         ; C D
    punpckldq m0, m1                    ; A B C D
    pxor      m1, m1

    ; calculate DC: (dc + 4) >> 3 per block, split into +/- byte halves,
    ; each expanded to 4 bytes per block (AAAABBBBCCCCDDDD)
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1              ; zero the coefficients in-place
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0                    ; m1 = -DC (for the negative path)
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; DC-only IDCT add for four chroma 4x4 blocks arranged 2x2.
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data: gather the four DCs into one register
    movd      m0, [blockq+32*0]         ; A
    movd      m1, [blockq+32*2]         ; C
    punpcklwd m0, [blockq+32*1]         ; A B
    punpcklwd m1, [blockq+32*3]         ; C D
    punpckldq m0, m1                    ; A B C D
    pxor      m6, m6

    ; calculate DC: (dc + 4) >> 3, +/- halves, widened to 4 bytes per block
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6              ; zero the coefficients in-place
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0                    ; m6 = -DC
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0                    ; AABBCCDD
    punpcklbw m6, m6                    ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0                    ; AAAABBBB (top two blocks)
    punpckhbw m1, m1                    ; CCCCDDDD (bottom two blocks)
    punpcklbw m6, m6                    ; AAAABBBB
    punpckhbw m7, m7                    ; CCCCDDDD

    ; add DC: top 2x1 block pair, then advance 4 rows for the bottom pair
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
; (pmulhw by 17734 on doubled input gives the 35468 product)
%macro VP8_MULTIPLY_SUMSUB 4
    mova       %3, %1
    mova       %4, %2
    pmulhw     %3, m6                   ;20091(1)
    pmulhw     %4, m6                   ;20091(2)
    paddw      %3, %1                   ; pmulhw drops the low word; add the
    paddw      %4, %2                   ;  input back to get x*20091>>16 + x
    paddw      %1, %1
    paddw      %2, %2
    pmulhw     %1, m7                   ;35468(1)
    pmulhw     %2, m7                   ;35468(2)
    psubw      %1, %4
    paddw      %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6   ;t2, t3
    SUMSUB_BA            w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA            w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

; Full 4x4 IDCT + add; clears the coefficient block.  MMX registers for the
; transform, one XMM register just to zero 32 bytes of coefficients quickly.
INIT_MMX sse
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0            ; zero all 16 coefficients in-place
    movaps [blockq+16], xmm0

    ; actual IDCT: 1D transform, transpose, round, 1D transform, transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]             ; rounding bias for the final >>3
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------

; Scatter two registers of 4 words each into the DC slot (coefficient 0,
; stride 2*16 shorts) of 8 sub-blocks; %3 = starting sub-block column (0 or 2).
; Uses dc1/dc2 GPRs as scratch.
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

; One 4-point Hadamard butterfly pass over registers %1..%4 (in-place).
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

; Inverse Walsh-Hadamard transform of the luma DC plane; the 16 results are
; scattered into the DC positions of the 16 luma sub-blocks, and the dc[]
; input is cleared.
INIT_MMX sse
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
    xorps       xmm0, xmm0
    movaps [dc1q+ 0], xmm0              ; zero the dc[] input in-place
    movaps [dc1q+16], xmm0
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]            ; rounding bias for the final >>3
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET