;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Multiplier for pmulhrsw: ((x * 1024) + 0x4000) >> 15 == (x + 16) >> 5,
; i.e. the "bias and shift down" step of the weighting functions.
pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024

; Byte-pair coefficient table for the SSSE3 (pmaddubsw) filters.
; Each filter set is two 16-byte rows (32 bytes): the outer-tap pair followed
; by the centre-tap pair. The my/mx argument is added to the table base by
; LOAD, so it acts as a byte offset selecting one of the three sets.
sixtap_filter_hb_m:    times 8 db   1, -5
                       times 8 db  52, 20
                       ; multiplied by 2 to have the same shift
                       times 8 db   2, -10
                       times 8 db  40,  40
                       ; back to normal
                       times 8 db   1, -5
                       times 8 db  20, 52

; Word coefficient table for the SSE2 (pmullw) filters.
; Each filter set is four 16-byte rows (64 bytes), read by the filters as
; COEFF05, COEFF14, COEFF2 and COEFF3 at offsets 0, 16, 32 and 48.
sixtap_filter_v_m:     times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  52
                       times 8 dw  20
                       ; multiplied by 2 to have the same shift
                       times 8 dw   2
                       times 8 dw -10
                       times 8 dw  40
                       times 8 dw  40
                       ; back to normal
                       times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  20
                       times 8 dw  52

; With PIC the table address is materialised into an extra GPR (picreg) by
; each function, so the table "names" resolve to that register and one more
; register is requested from cglobal (npicregs).
; NOTE(review): sixtap_filter_hw / sixtap_filter_hw_m are not referenced or
; defined anywhere else in this file; the define looks like a leftover.
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hw   sixtap_filter_hw_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter. Applied to 16 bytes loaded
; from src-2 they build the byte pairs (x[-2],x[-1]), (x[0],x[1]) and
; (x[3],x[2]) for eight consecutive output pixels.
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

; Word constants defined elsewhere (values presumably match their names).
cextern  pw_32
cextern  pw_16
cextern  pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;
; Each call filters `len` rows of 8 pixels. `m` is a byte offset into the
; coefficient table (see LOAD).
; NOTE(review): the strides are consumed as full-width registers
; (dststrideq/srcstrideq) without sign extension in this file, so callers
; presumably pass pointer-sized values - confirm against the C prototypes.
;----------------------------------------------------------------------

; LOAD index, table
; Turns the filter offset held in register %1 into an absolute pointer to the
; selected coefficient rows. On WIN64 the 32-bit offset is sign-extended
; first. With PIC the table base comes from picregq (set up by the caller of
; this macro); otherwise the symbol %2 is added directly.
%macro LOAD  2
%if WIN64
   movsxd   %1q, %1d
%endif
%ifdef PIC
   add      %1q, picregq
%else
   add      %1q, %2
%endif
%endmacro

; STORE result, tmp, put|avg
; Packs the eight words in %1 to unsigned-saturated bytes and writes them to
; [dstq]. For "avg" the existing destination bytes are loaded into %2 and
; averaged with the result (PAVGB) before the store.
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro

; FILTER_V put|avg
; SSE2-style vertical 6-tap filter using word multiplies:
;   out = (c05*(r[-2]+r[3]) + c14*(r[-1]+r[2]) + c2*r[0] + c3*r[1] + pw_32) >> 6
; m0..m4 hold rows -2..+2 (unpacked to words), m5 receives the new row +3,
; m7 is zero for unpacking, m6 is the accumulator.
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      my, sixtap_filter_v

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq               ; srcq now points at row +1
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; Keep the coefficients in registers when m8..m11 exist, otherwise use
    ; them as memory operands.
%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
    ; The row registers are rotated (m0<-m1<-m2<-m3<-m4<-m5) in between the
    ; multiplies, so the statement order below matters.
.nextrow:
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    paddw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1
    paddw     m6, [pw_32]
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5
    STORE     m6, m5, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

; FILTER_H put|avg
; SSE2-style horizontal 6-tap filter using word multiplies; same formula as
; FILTER_V with x[-2..3] taken from six overlapping 8-byte loads.
; It deliberately uses the word table (sixtap_filter_v_m): the coefficients
; are fed to pmullw. m6 holds the pw_32 rounding bias, m7 is zero.
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
    ; Keep the coefficients in registers when m8..m11 exist, otherwise use
    ; them as memory operands.
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5                       ; x[-2] + x[3]
    paddw     m1, m4                       ; x[-1] + x[2]
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

; FILTER_SSSE3 put|avg
; Emits both the vertical and the horizontal 6-tap filter for SSSE3.
; Pixels are interleaved as byte pairs and multiplied against the byte-pair
; table with pmaddubsw; [m+0] holds the outer-tap pair, [m+16] the centre-tap
; pair. Rounding uses pmulhrsw with pw_512, which equals (x + 32) >> 6
; (assuming pw_512 holds words of 512).
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines
    sub     srcq, srcstrideq
    LOAD      my, sixtap_filter_hb
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    mova      m5, [myq]                    ; outer-tap pair, reused twice per row
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea     srcq, [srcq+2*srcstrideq]      ; srcq now points at row +3

    ; m0..m4 hold rows -2..+2 as packed bytes; m7 receives the new row +3.
.nextrow:
    mova      m6, m2
    punpcklbw m0, m1                       ; (row-2, row-1) pairs
    punpcklbw m6, m3                       ; (row 0, row+1) pairs
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh      m7, [srcq]      ; read new row
    paddw     m6, m0
    mova      m0, m1
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3                       ; (row+3, row+2) pairs
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec       heightd                          ; next row
    jg       .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq] ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
    movu      m0, [srcq-2]                 ; 16 unaligned bytes starting at x[-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7                       ; (x[-2], x[-1]) pairs
    pshufb    m1, m3                       ; (x[ 0], x[ 1]) pairs
    pshufb    m2, m4                       ; (x[ 3], x[ 2]) pairs
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                            ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg

; RV40_WCORE: weight and store one register's worth of pixels.
; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride if the two 8-byte
; halves come from two consecutive rows (8-wide blocks with XMM registers)
;
; Register contract (set up by RV40_WEIGHT):
;   m0 = zero, m1 = rounding constant, m2 = splatted w1, m3 = splatted w2
;   (for SSSE3 with 5-bit weights m3 holds the interleaved byte pair w2,w1),
;   r6 = negative byte offset from the end of the block, r5 = stride.
; Result: dst = (src1*w2 + src2*w1 + bias) >> 5, packed with unsigned
; saturation. With 14-bit weights each product is reduced by >> 9 first
; (psllw 7 followed by pmulhw).
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and XMM registers, stride was provided:
    ; the second half is the next row, so r6 is advanced by one stride
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Two-row variant (8-wide blocks): r6 currently addresses the second
    ; row, so step back for the first store and forward again for the second
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro


; MAIN_LOOP size, rnd
; Processes one loop iteration and advances r6. The final "add r6, r5" in
; every branch sets the flags tested by the caller's jnz, so nothing that
; modifies flags may be appended after it.
; The mmsize == 8 branch is only assembled for 8-byte registers; every
; instantiation in this file uses INIT_XMM.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif

    ; Prepare for next loop
    add        r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare 2 next lines
    add        r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare single next line
    add        r6, r5
%endif
%endif

%endmacro

; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=rnd|nornd %2=block size (8 or 16) %3=log2(block size)
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
;
; "rnd" selects the 14-bit weight path of RV40_WCORE (RND == 0), "nornd" the
; 5-bit path (RND == 1).
; NOTE(review): r5 (stride) is used as a full-width register without sign
; extension, so the caller presumably passes a pointer-sized stride - confirm
; against the C prototype.
%macro RV40_WEIGHT   3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0
    ; Set loop counter and increments
    ; r6 = stride << log2(size): byte span of the whole block. The pointers
    ; are moved to the end of the block and r6 is negated so that it counts
    ; up towards zero.
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    movd       m2, r3d
    movd       m3, r4d
%ifidn %1,rnd
%define  RND   0
    SPLATW     m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    punpcklbw  m3, m2                      ; byte pair (w2, w1) for pmaddubsw
%else
    SPLATW     m2, m2
%endif
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    jnz        .loop                       ; flags come from MAIN_LOOP's last add
    REP_RET
%endmacro

INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4