;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pmulhrsw multiplier: (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5, i.e. the
; same "add 16, shift right by 5" that the non-SSSE3 weight path does by hand.
pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024

; Six-tap coefficients as signed byte pairs, consumed by pmaddubsw in the
; SSSE3 functions. Each filter is two 16-byte rows: the outer/near tap pair
; (read at offset 0) and the two centre taps (read at offset 16).
sixtap_filter_hb_m:    times 8 db   1, -5
                       times 8 db  52, 20
                       ; multiplied by 2 to have the same shift
                       times 8 db   2, -10
                       times 8 db  40,  40
                       ; back to normal
                       times 8 db   1, -5
                       times 8 db  20, 52

; Same coefficients as 16-bit words, consumed by pmullw in the SSE2
; functions. Each filter is four 16-byte rows, read at offsets 0/16/32/48 as
; COEFF05, COEFF14, COEFF2 and COEFF3.
sixtap_filter_v_m:     times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  52
                       times 8 dw  20
                       ; multiplied by 2 to have the same shift
                       times 8 dw   2
                       times 8 dw -10
                       times 8 dw  40
                       times 8 dw  40
                       ; back to normal
                       times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  20
                       times 8 dw  52

; Under PIC the table base lives in an extra GPR (picreg) loaded with lea at
; function entry; otherwise the table symbol is used directly as the base.
; npicregs is added to the GPR count requested from cglobal.
; NOTE(review): sixtap_filter_hw_m is not defined in the visible part of this
; file and sixtap_filter_hw is not referenced here - looks like a leftover;
; kept as-is in case another part of the tree relies on it.
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hw   sixtap_filter_hw_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter. Applied to 16 bytes loaded
; from src-2 they build the byte pairs (x-2,x-1), (x,x+1) and (x+3,x+2) for
; 8 consecutive output pixels x.
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern pw_32
cextern pw_16
cextern pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;
; Each call filters an 8-pixel-wide column of `len` rows.
; `m` is added as-is to the coefficient table base (see LOAD), so it has to
; be a byte offset into that table.
; NOTE(review): both strides are used as full-width registers (dststrideq /
; srcstrideq) without sign extension, so the C prototypes presumably declare
; them as ptrdiff_t rather than int - confirm against the init code.
;-----------------------------------------------------------------------------

; LOAD idx, table
; Turn the filter offset held in register <idx> into a pointer into the
; coefficient table.
;   %1 = register name (without size suffix) holding the offset
;   %2 = table base, used only when not building PIC; under PIC the base is
;        taken from picregq instead
; On WIN64 the 32-bit offset is sign-extended to 64 bits first.
%macro LOAD  2
%if WIN64
   movsxd   %1q, %1d
%endif
%ifdef PIC
   add      %1q, picregq
%else
   add      %1q, %2
%endif
%endmacro

; STORE res, tmp, op
; Pack the 16-bit results in <res> to unsigned saturated bytes and write the
; low half of the register to [dstq].
;   %1 = register with word results (clobbered)
;   %2 = scratch register, only touched when op is avg
;   %3 = put | avg; avg averages the result with the bytes already at [dstq]
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro

; FILTER_V op
; Emit <op>_rv40_qpel_v: vertical six-tap filter using 16-bit multiplies.
;   %1 = put | avg (forwarded to STORE)
; For every output row y:
;   dst[y] = clip8((COEFF05 * (s[y-2] + s[y+3]) + COEFF14 * (s[y-1] + s[y+2])
;                 + COEFF2  *  s[y]             + COEFF3  *  s[y+1] + 32) >> 6)
; m0..m4 hold the five most recent source rows as words and are rotated by
; one row per iteration; m7 stays zero for byte->word unpacking.
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      my, sixtap_filter_v

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; keep the coefficients in registers when m8+ exist, else read them from
    ; the table on every use
%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.nextrow:
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    paddw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1                       ; rotate rows: copy before scaling
    paddw     m6, [pw_32]
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5                       ; m5 is free for STORE after this
    STORE     m6, m5, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

; FILTER_H op
; Emit <op>_rv40_qpel_h: horizontal six-tap filter using 16-bit multiplies.
;   %1 = put | avg (forwarded to STORE)
; Same formula as FILTER_V with the taps taken from src[x-2] .. src[x+3].
; m6 holds the rounding constant pw_32, m7 stays zero for unpacking.
; Uses the word coefficient table (sixtap_filter_v), not a separate one.
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5
    paddw     m1, m4
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

; FILTER_SSSE3 op
; Emit <op>_rv40_qpel_v and <op>_rv40_qpel_h built on pmaddubsw with the
; byte-pair table sixtap_filter_hb.
;   %1 = put | avg (forwarded to STORE)
; Each pmaddubsw multiplies a pair of neighbouring samples by a pair of taps
; and adds the two products; three such partial sums make the six-tap result.
; pmulhrsw by pw_512 then performs the (x + 32) >> 6 rounding shift.
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines
    sub     srcq, srcstrideq
    LOAD      my, sixtap_filter_hb
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    mova      m5, [myq]                    ; outer tap pair, used twice per row
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea     srcq, [srcq+2*srcstrideq]

.nextrow:
    mova      m6, m2
    punpcklbw m0, m1                       ; rows (y-2, y-1)
    punpcklbw m6, m3                       ; rows (y,   y+1)
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh      m7, [srcq]      ; read new row
    paddw     m6, m0
    mova      m0, m1
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3                       ; rows (y+3, y+2)
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq] ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
    movu      m0, [srcq-2]                 ; 16 unaligned bytes starting at x-2
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg

; RV40_WCORE mode, dst, src1, src2 [, stride]
; Weight one register's worth of pixels from two sources and store them.
;   %1 = 0 for 14-bit weights, anything else for 5-bit weights
;   %2 = dst  %3 = src1  %4 = src2, all addressed relative to r6
;   %5 = optional; its mere presence (argument count 5) selects the
;        "8x8 block on SSE2" layout, the value itself is not read - the
;        macro always steps by r5
; Expected on entry: m0 = 0, m1 = rounding constant, m2/m3 = weights as set up
; by the calling function.
; Without %5 the two halves of the register come from r6 and r6 + mmsize/2 on
; the same line. With %5 the second half comes from the next line, and r6 is
; left pointing at that second line on exit.
; Clobbers m4-m7.
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and SSE2, stride was provided
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    ; m3 holds both weights as a byte pair, so one pmaddubsw does
    ; src1 * w2 + src2 * w1
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and SSE2
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro


; MAIN_LOOP size, mode
; One iteration of the weighting loop.
;   %1 = block width (8 or 16)
;   %2 = weight mode, forwarded as %1 of RV40_WCORE
; The closing "add r6, r5" is the last flag-setting instruction emitted, so
; the caller can branch on ZF to detect that r6 has counted up to zero.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
%else
%ifidn %1, 8
    ; two lines per iteration: RV40_WCORE already advanced r6 by one line
    RV40_WCORE %2, r0, r1, r2, r5
%else
    RV40_WCORE %2, r0, r1, r2
%endif
%endif

    ; Step to the next unprocessed line (common to every configuration)
    add        r6, r5

%endmacro

; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                int w1, int w2, int stride)
; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2 of the block size
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, whether they are multiples of 2^9 decides if those
; simplifications can occur: the nornd variants implement the simplified
; 5-bit path, the rnd variants the full 14-bit one. The choice between the
; two is not made in this file.
%macro RV40_WEIGHT   3
    ; RV40_WEIGHT variant, size, shift
    ;   %1 = rnd | nornd : rnd runs the weighting core in mode 0 (14-bit
    ;        weights), nornd in mode 1 (5-bit weights)
    ;   %2 = block size, forwarded to MAIN_LOOP
    ;   %3 = left shift applied to the stride to get the total byte span
    ;        (3 for size 8, 4 for size 16 in the instantiations below)
    ; Arguments: r0 = dst, r1 = src1, r2 = src2, r3d = w1, r4d = w2,
    ;            r5 = stride; r6 is the only extra GPR.
    ; NOTE(review): r5 is consumed at full register width with no sign
    ; extension - confirm the C prototype passes a pointer-sized stride.
cglobal rv40_weight_func_%1_%2, 6, 7, 8

    ; Mode flag handed to MAIN_LOOP
%ifidn %1, rnd
%define  RND   0
%else
%define  RND   1
%endif

    ; m1 = rounding term: a pmulhrsw multiplier on SSSE3, otherwise the bias
    ; that is added before the shift
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0                      ; m0 = 0, used for byte->word unpack

    ; Point dst/src1/src2 at the end of the block and run r6 from
    ; -(stride << shift) up to zero, so a single register serves as both
    ; row offset and loop counter
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    ; m2 = w1, m3 = w2, broadcast to every word. The 5-bit SSSE3 path instead
    ; packs both weights into m3 as a byte pair and leaves m2 alone.
    movd       m2, r3d
    movd       m3, r4d
%if RND && cpuflag(ssse3)
    punpcklbw  m3, m2
%else
    SPLATW     m2, m2
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    jnz        .loop                       ; flags come from MAIN_LOOP's last add
    REP_RET
%endmacro

INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4