;******************************************************************************
;* VC1 motion compensation optimizations
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; Constants defined elsewhere; used below as the multiplier of the two centre
; taps (pw_9) and as the value that undoes the rounding bias (pw_128).
cextern pw_9
cextern pw_128

SECTION .text

%if HAVE_MMX_INLINE

; XXX some of these macros are not used right now, but they will in the future
;     when more functions are ported.

; Store operators. They are passed by name to the TRANSFER_* macros and are
; invoked as "op reg, [dst]" right before reg is written to [dst].

; Plain store: nothing to combine with the destination.
%macro OP_PUT 2 ; dst, src
%endmacro

; Averaging store: byte-wise average of %1 with %2 (pavgb).
%macro OP_AVG 2 ; dst, src
    pavgb           %1, %2
%endmacro

; Add the rounder held in m7 to the two word accumulators m3/m4, then shift
; both right arithmetically by %1.
%macro NORMALIZE_MMX 1 ; shift
    paddw           m3, m7 ; +bias-r
    paddw           m4, m7 ; +bias-r
    psraw           m3, %1
    psraw           m4, %1
%endmacro

; Pack the words of m3 (low half) and m4 (high half) into unsigned-saturated
; bytes in m3, apply the store operator %1 against [%2] and write m3 to [%2].
%macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb        m3, m4
    %1              m3, [%2]
    mova          [%2], m3
%endmacro

; Store m3 and m4 unpacked to [%2] and [%2 + mmsize], applying the store
; operator %1 to each register against the memory it is about to overwrite.
%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1              m3, [%2]
    ; The second half lives in m4: it is m4, not m3, that must be combined
    ; with [%2 + mmsize] since m4 is what gets stored there below.
    %1              m4, [%2 + mmsize]
    mova          [%2], m3
    mova [mmsize + %2], m4
%endmacro

; see MSPEL_FILTER13_CORE for use as UNPACK macro
; Interleave the low bytes of %1 with those of m0; with m0 zeroed by the
; caller this widens the low bytes to words.
%macro DO_UNPACK 1 ; reg
    punpcklbw       %1, m0
%endmacro
; No-op counterpart of DO_UNPACK.
%macro DONT_UNPACK 1 ; reg
%endmacro

; Load the rounder %1 (the callers pass it already computed) and replicate its
; low word into every word of m7.
%macro LOAD_ROUNDER_MMX 1 ; round
    movd      m7, %1
    punpcklwd m7, m7
    punpckldq m7, m7
%endmacro

; One output row of the vertical (-1, 9, 9, -1) filter.
; On entry m%3 and m%4 hold the two centre rows (already widened to words) and
; srcq points at the first of them. The outer rows are loaded into m%2
; (srcq - 2 * stride) and m%5 (srcq + stride). The result
;     (9 * (m%3 + m%4) - m%2 - m%5 + m7) >> shift
; is stored at [dstq + %1] and srcq is advanced by one row. m%4 is left
; untouched and m%5 holds the next row, so the caller rotates the register
; numbers from one invocation to the next.
; Expects m0 = 0, m6 = pw_9, m7 = rounder and the stride_neg2/shift defines.
%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw          m%3, m%4
    movh           m%2, [srcq + stride_neg2]
    pmullw         m%3, m6
    punpcklbw      m%2, m0
    movh           m%5, [srcq + strideq]
    psubw          m%3, m%2
    punpcklbw      m%5, m0
    paddw          m%3, m7
    psubw          m%3, m%5
    psraw          m%3, shift
    movu   [dstq + %1], m%3
    add           srcq, strideq
%endmacro

INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Sacrificing m6 makes it possible to pipeline loads from src
;
; Vertical pass producing 16-bit intermediates. The outer loop runs 3 times,
; each time filtering one column group over 8 rows; rows are written 24 bytes
; apart in dst. Between column groups src advances by 4 bytes and dst by
; 8 bytes.
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP     3, 4, 5
    %define rnd r3mp
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP     4, 5, 6
    %define rnd r3d
    ; We need shift either in memory or in a mm reg as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    mova shift, r4q
%endif ; WIN64
%endif ; X86_32
%define stride_neg2 t0q
%define stride_9minus4 t1q
%define i t2q
    ; stride_neg2 = -2 * stride: offset from the current row to the top tap
    mov       stride_neg2, strideq
    neg       stride_neg2
    add       stride_neg2, stride_neg2
    ; Each pass of .loop moves srcq down 9 rows (1 explicit add + 8 from
    ; SHIFT2_LINE); subtracting 9 * stride - 4 rewinds to the starting row,
    ; 4 bytes to the right.
    lea    stride_9minus4, [strideq * 9 - 4]
    mov                 i, 3
    LOAD_ROUNDER_MMX  rnd
    mova               m6, [pw_9]
    pxor               m0, m0
.loop:
    ; Prime the pipeline with the first two centre rows
    movh               m2, [srcq]
    add              srcq, strideq
    movh               m3, [srcq]
    punpcklbw          m2, m0
    punpcklbw          m3, m0
    ; 8 output rows, 24 bytes apart, with rotating register assignment
    SHIFT2_LINE         0, 1, 2, 3, 4
    SHIFT2_LINE        24, 2, 3, 4, 1
    SHIFT2_LINE        48, 3, 4, 1, 2
    SHIFT2_LINE        72, 4, 1, 2, 3
    SHIFT2_LINE        96, 1, 2, 3, 4
    SHIFT2_LINE       120, 2, 3, 4, 1
    SHIFT2_LINE       144, 3, 4, 1, 2
    SHIFT2_LINE       168, 4, 1, 2, 3
    sub              srcq, stride_9minus4
    add              dstq, 8
    dec                 i
    jnz             .loop
    REP_RET
%undef rnd
%undef shift
%undef stride_neg2
%undef stride_9minus4
%undef i

; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
;                                  const int16_t *src, int rnd);
; Data is already unpacked, so some operations can directly be made from
; memory.
;
; Horizontal (-1, 9, 9, -1) pass over 16-bit input. Each of the 8 iterations
; filters 2 * mmsize bytes worth of words starting one element before src,
; shifts right by 7, packs to bytes and stores through the operator %1.
; src rows are 24 bytes apart, dst rows are stride bytes apart.
%macro HOR_16B_SHIFT2 2 ; op, opname
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    mov                hq, 8
    sub              srcq, 2           ; first tap is one int16_t to the left
    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    LOAD_ROUNDER_MMX rndd
    mova               m5, [pw_9]
    mova               m6, [pw_128]
    pxor               m0, m0

.loop:
    ; m1/m2 accumulate the outer taps (0 and 3), m3/m4 the centre taps (1 and 2)
    mova               m1, [srcq + 2 * 0]
    mova               m2, [srcq + 2 * 0 + mmsize]
    mova               m3, [srcq + 2 * 1]
    mova               m4, [srcq + 2 * 1 + mmsize]
    paddw              m3, [srcq + 2 * 2]
    paddw              m4, [srcq + 2 * 2 + mmsize]
    paddw              m1, [srcq + 2 * 3]
    paddw              m2, [srcq + 2 * 3 + mmsize]
    pmullw             m3, m5
    pmullw             m4, m5
    psubw              m3, m1
    psubw              m4, m2
    NORMALIZE_MMX      7
    ; remove bias
    paddw              m3, m6
    paddw              m4, m6
    TRANSFER_DO_PACK   %1, dstq
    add              srcq, 24
    add              dstq, strideq
    dec                hq
    jnz             .loop

    RET
%endmacro

INIT_MMX mmx
HOR_16B_SHIFT2 OP_PUT, put

INIT_MMX mmxext
HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE

; Common setup for the DC-only inverse transforms.
; Expects the final DC value in blockd. Produces
;   m0 = DC broadcast to all words, packed to bytes with unsigned saturation
;   m1 = -DC treated the same way
; so that a saturating add of m0 followed by a saturating subtract of m1 adds
; the signed DC to unsigned pixels. The block argument is then dropped and its
; register reused as linesize3 = 3 * linesize.
%macro INV_TRANS_INIT 0
    movsxdifnidn linesizeq, linesized
    movd       m0, blockd
    SPLATW     m0, m0
    pxor       m1, m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1

    DEFINE_ARGS dest, linesize, linesize3
    lea    linesize3q, [linesizeq*3]
%endmacro

; Add the DC prepared by INV_TRANS_INIT to 4 consecutive rows at destq.
; %1 selects the move suffix (mov%1) and thereby the width of each row access.
%macro INV_TRANS_PROCESS 1
    mov%1                  m2, [destq+linesizeq*0]
    mov%1                  m3, [destq+linesizeq*1]
    mov%1                  m4, [destq+linesizeq*2]
    mov%1                  m5, [destq+linesize3q]
    paddusb                m2, m0
    paddusb                m3, m0
    paddusb                m4, m0
    paddusb                m5, m0
    psubusb                m2, m1
    psubusb                m3, m1
    psubusb                m4, m1
    psubusb                m5, m1
    mov%1 [linesizeq*0+destq], m2
    mov%1 [linesizeq*1+destq], m3
    mov%1 [linesizeq*2+destq], m4
    mov%1 [linesize3q +destq], m5
%endmacro

; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
;
; DC-only inverse transforms: the first coefficient of block is scaled in two
; stages into a single offset, which INV_TRANS_INIT/INV_TRANS_PROCESS then add
; (with unsigned saturation) to every pixel of the destination rows.

; blockd = (17 * blockd + bias) >> shift, arithmetic shift. Clobbers r3.
%macro VC1_DC_MUL17 2 ; bias, shift
    mov           r3d, blockd           ; keep 1 * dc
    shl        blockd, 4                ; 16 * dc
    lea        blockd, [blockq+r3+%1]   ; 17 * dc + bias
    sar        blockd, %2
%endmacro

; blockd = (3 * blockd + bias) >> shift, arithmetic shift.
%macro VC1_DC_MUL3 2 ; bias, shift
    lea        blockd, [blockq*3+%1]    ; 3 * dc + bias
    sar        blockd, %2
%endmacro

INIT_MMX mmxext

; 4x4: dc = (17 * dc + 4) >> 3, then dc = (17 * dc + 64) >> 7
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    movsx      blockd, WORD [blockq]    ; dc, sign-extended
    VC1_DC_MUL17    4, 3
    VC1_DC_MUL17   64, 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    RET

; 4x8: dc = (17 * dc + 4) >> 3, then dc = (12 * dc + 64) >> 7
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    movsx      blockd, WORD [blockq]    ; dc, sign-extended
    VC1_DC_MUL17    4, 3
    shl        blockd, 2                ; 4 * dc, so the next step yields 12 * dc
    VC1_DC_MUL3    64, 7

    INV_TRANS_INIT

    ; two groups of four rows
    INV_TRANS_PROCESS h
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET

; 8x4: dc = (3 * dc + 1) >> 1, then dc = (17 * dc + 64) >> 7
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    movsx      blockd, WORD [blockq]    ; dc, sign-extended
    VC1_DC_MUL3     1, 1
    VC1_DC_MUL17   64, 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    RET

; 8x8: dc = (3 * dc + 1) >> 1, then dc = (3 * dc + 16) >> 5
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    movsx      blockd, WORD [blockq]    ; dc, sign-extended
    VC1_DC_MUL3     1, 1
    VC1_DC_MUL3    16, 5

    INV_TRANS_INIT

    ; two groups of four rows
    INV_TRANS_PROCESS a
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS a
    RET