; XVID MPEG-4 VIDEO CODEC
;
; Conversion from gcc syntax to x264asm syntax with modifications
; by Christophe Gisquet <christophe.gisquet@gmail.com>
;
; ===========     SSE2 inverse discrete cosine transform     ===========
;
; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
;
; Conversion to gcc syntax with modifications
; by Alexander Strange <astrange@ithinksw.com>
;
; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication,
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
;
; More details at http://skal.planet-d.net/coding/dct.html
;
; =======     MMX and XMM forward discrete cosine transform     =======
;
; Copyright(C) 2001 Peter Ross <pross@xvid.org>
;
; Originally provided by Intel at AP-922
; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
; but in a limited edition.
; New macro implements a column part for precise iDCT
; The routine precision now satisfies IEEE standard 1180-1990.
;
; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
;
; http://www.elecard.com/peter/idct.html
; http://www.linuxvideo.org/mpeg2dec/
;
; These examples contain code fragments for first stage iDCT 8x8
; (for rows) and first stage DCT 8x8 (for columns)
;
; conversion to gcc syntax by Michael Niedermayer
;
; ======================================================================
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

; Syntax: NASM with the x264asm abstraction layer (see file header).
; cglobal, RET, movifnidn, SECTION_RODATA, ARCH_X86_32/ARCH_X86_64 and the
; rN/rNd/rNq register names are not defined in this file; presumably they
; come from the include below.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Similar to tg_1_16 in MMX code
; Column-pass multipliers, one word replicated across a whole xmm register.
tan1:   times 8 dw 13036
tan2:   times 8 dw 27146
tan3:   times 8 dw 43790
sqrt2:  times 8 dw 23170

; SSE2 tables
; Each iTabN is four 16-byte rows of pmaddwd coefficients, read by iMTX_MULT
; at offsets +0, +16, +32 and +48.
; IDCT_SSE2 uses: iTab1 for rows 0/4, iTab2 for rows 1/7,
;                 iTab3 for rows 2/6, iTab4 for rows 3/5.
iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e

; Similar to rounder_0 in MMX code
; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
; Indexed by byte offset (N*16) through the optional 5th argument of
; iMTX_MULT. IDCT_SSE2 passes 0*16..3*16 for rows 0..3, 4*16 for row 5 and
; 5*16 for rows 6 and 7; row 4 is transformed without a rounder.
walkenIdctRounders: times 4 dd 65536
                    times 4 dd  3597
                    times 4 dd  2260
                    times 4 dd  1203
                    times 4 dd   120
                    times 4 dd   512
                    times 2 dd     0

; Added with unsigned saturation to every byte of a row: the top bit of a
; byte ends up set exactly when that byte was non-zero, so pmovmskb yields a
; non-zero mask iff the row has any non-zero coefficient.
pb_127: times 8 db 127

SECTION .text

; Temporary storage before the column pass
%define ROW1 xmm6
%define ROW3 xmm4
%define ROW5 xmm5
%define ROW7 xmm7

; CLEAR_ODD reg: zero the register that holds an odd row.
%macro CLEAR_ODD 1
    pxor        %1, %1
%endmacro

; PUT_ODD reg: copy the row result from xmm2 into reg, reversing the order of
; its four high words (pshufhw 0x1B); the low four words are copied as-is.
%macro PUT_ODD 1
    pshufhw     %1, xmm2, 0x1B
%endmacro

; MOV32 src, dst: x86-32 only, copies src into dst (note the operand order);
; expands to nothing on x86-64.
%macro MOV32 2
%if ARCH_X86_32
    movdqa      %2, %1
%endif
%endmacro

; CLEAR_EVEN reg: zero an even-row register on x86-64. Expands to nothing on
; x86-32, where even rows are memory operands inside BLOCK (presumably an
; all-zero input row is then already zero in place).
%macro CLEAR_EVEN 1
%if ARCH_X86_64
    CLEAR_ODD   %1
%endif
%endmacro

; PUT_EVEN dst: same as PUT_ODD on x86-64. On x86-32 dst is a memory operand,
; so the shuffle is done in xmm2 first and then stored.
%macro PUT_EVEN 1
%if ARCH_X86_64
    PUT_ODD     %1
%else
    pshufhw     xmm2, xmm2, 0x1B
    movdqa      %1, xmm2
%endif
%endmacro

; Register / memory allocation for the even rows and the column-pass scratch.
;   ROWn  : where the row pass leaves row n
;   REGn  : register the column pass works on for row n
;   XMMS  : scratch register for the column-pass butterflies
;   SREG2 : register used for row 2 by the sparse column pass
;   TAN3/TAN1 : registers loaded with [tan3]/[tan1] by iLLM_HEAD
; On x86-32 REG0/REG2 share xmm4, REG4/REG6 share xmm6 and XMMS aliases TAN1.
%if ARCH_X86_64
%define ROW0  xmm8
%define REG0  ROW0
%define ROW2  xmm9
%define REG2  ROW2
%define ROW4  xmm10
%define REG4  ROW4
%define ROW6  xmm11
%define REG6  ROW6
%define XMMS  xmm12
%define SREG2 REG2
%define TAN3  xmm13
%define TAN1  xmm14
%else
%define ROW0  [BLOCK + 0*16]
%define REG0  xmm4
%define ROW2  [BLOCK + 2*16]
%define REG2  xmm4
%define ROW4  [BLOCK + 4*16]
%define REG4  xmm6
%define ROW6  [BLOCK + 6*16]
%define REG6  xmm6
%define XMMS  xmm2
%define SREG2 xmm7
%define TAN3  xmm0
%define TAN1  xmm2
%endif

; JZ reg, label: jump to local label .<label> when reg is zero.
%macro JZ 2
    test        %1, %1
    jz          .%2
%endmacro

; JNZ reg, label: jump to local label .<label> when reg is non-zero.
%macro JNZ 2
    test        %1, %1
    jnz         .%2
%endmacro

; Run "<clear> <arg>", then set reg to a byte mask that is non-zero iff the
; 16-byte row at src contains a non-zero byte.
; Expects mm0 = [pb_127]. Clobbers mm1.
%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
    %3          %4
    movq        mm1, [%1]
    por         mm1, [%1 + 8]
    paddusb     mm1, mm0
    pmovmskb    %2, mm1
%endmacro

; Two-row variant of TEST_ONE_ROW. Expects mm0 = [pb_127]. Clobbers mm1, mm2.
;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro TEST_TWO_ROWS 8
    %5          %6
    %7          %8
    movq        mm1, [%1 + 0]
    por         mm1, [%1 + 8]
    movq        mm2, [%2 + 0]
    por         mm2, [%2 + 8]
    paddusb     mm1, mm0
    paddusb     mm2, mm0
    pmovmskb    %3, mm1
    pmovmskb    %4, mm2
%endmacro

; IDCT pass on rows.
; Transforms the 8-word row at [src] with the four coefficient rows of
; <table>, optionally adds [walkenIdctRounders + rounder] to the first
; partial sum, forms sum and difference, shifts both right by 11 and packs
; them into xmm2 (sums in the low four words, differences in the high four).
; Finally "<put> <arg>" is invoked to place the result.
; Clobbers xmm0-xmm3.
%macro iMTX_MULT 4-5 ; src, table, put, arg, rounder
    movdqa      xmm3, [%1]
    movdqa      xmm0, xmm3
    pshufd      xmm1, xmm3, 0x11    ; 4602
    punpcklqdq  xmm0, xmm0          ; 0246
    pmaddwd     xmm0, [%2]
    pmaddwd     xmm1, [%2+16]
    pshufd      xmm2, xmm3, 0xBB    ; 5713
    punpckhqdq  xmm3, xmm3          ; 1357
    pmaddwd     xmm2, [%2+32]
    pmaddwd     xmm3, [%2+48]
    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
%if %0 == 5
    paddd       xmm0, [walkenIdctRounders+%5]
%endif
    movdqa      xmm3, xmm2
    paddd       xmm2, xmm0          ; xmm2 = xmm0 + xmm2
    psubd       xmm0, xmm3          ; xmm0 = xmm0 - (old xmm2)
    psrad       xmm2, 11
    psrad       xmm0, 11
    packssdw    xmm2, xmm0
    %3          %4
%endmacro

; Load the column-pass constants tan3 and tan1 into TAN3 and TAN1.
%macro iLLM_HEAD 0
    movdqa      TAN3, [tan3]
    movdqa      TAN1, [tan1]
%endmacro

; First half of the column-pass output stage.
; Shifts the four finished rows right by 6; on entry they are held in
;   TAN3 = row 1, xmm3 = row 2, REG0 = row 5, xmm5 = row 6.
; type 0 (normal): store the four rows back into the coefficient block.
; type 1 (put)   : set up DEST / strideq / r3q = 3*stride, pack to unsigned
;                  bytes and write rows 1 and 2; rows 6 (low half) and 5
;                  (high half) stay packed in xmm5 for SECOND_HALF.
; type 2 (add)   : set up DEST / strideq / r3q; on x86-32 also spill the four
;                  rows to the block because there are not enough registers.
%macro FIRST_HALF 2 ; %1=dct  %2=type (0=normal, 1=put, 2=add)
    psraw       xmm5, 6
    psraw       REG0, 6
    psraw       TAN3, 6
    psraw       xmm3, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa      [%1+1*16], TAN3
    movdqa      [%1+2*16], xmm3
    movdqa      [%1+5*16], REG0
    movdqa      [%1+6*16], xmm5
%else
    ; Must now load args as gprs are no longer used for masks
    ; DEST is set to where address of dest was loaded
    %if ARCH_X86_32
        %if %2 == 2 ; Not enough xmms, store
    movdqa      [%1+1*16], TAN3
    movdqa      [%1+2*16], xmm3
    movdqa      [%1+5*16], REG0
    movdqa      [%1+6*16], xmm5
        %endif
        %xdefine DEST r2q ; BLOCK is r0, stride r1
    movifnidn   DEST, destm
    movifnidn   strideq, stridem
    %else
        %xdefine DEST r0q
    %endif
    lea         r3q, [3*strideq]
    %if %2 == 1
    packuswb    TAN3, xmm3
    packuswb    xmm5, REG0
    movq        [DEST + strideq], TAN3
    movhps      [DEST + 2*strideq], TAN3
    ; REG0 and TAN3 are now available (and likely used in second half)
    %endif
%endif
%endmacro

; Second half of the column-pass output stage.
; %3..%6 are the registers holding rows 0, 7, 3 and 4 respectively (see the
; type-0 stores below); each is shifted right by 6 first.
; type 0 (normal): store rows 0, 3, 4, 7 into the coefficient block.
; type 1 (put)   : pack to unsigned bytes, write rows 0, 3, 4, 7 and then
;                  rows 6 and 5 left in xmm5 by FIRST_HALF.
; type 2 (add)   : add all eight rows to the destination pixels with signed
;                  word saturation, pack with unsigned saturation and store.
;                  Rows 1, 2, 5, 6 come from the block on x86-32 and from
;                  TAN3, xmm3, REG0, xmm5 on x86-64.
; DEST is advanced by 4*stride for types 1 and 2. Relies on DEST, strideq
; and r3q as set up by FIRST_HALF.
%macro SECOND_HALF 6 ; %1=dct  %2=type (0=normal, 1=put, 2=add)  3-6: xmms
    psraw       %3, 6
    psraw       %4, 6
    psraw       %5, 6
    psraw       %6, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa      [%1+0*16], %3
    movdqa      [%1+3*16], %5
    movdqa      [%1+4*16], %6
    movdqa      [%1+7*16], %4
%elif %2 == 1
    packuswb    %3, %5
    packuswb    %6, %4
    ; address of dest may have been loaded
    movq        [DEST], %3
    movhps      [DEST + r3q], %3
    lea         DEST, [DEST + 4*strideq]
    movq        [DEST], %6
    movhps      [DEST + r3q], %6
    ; and now write remainder of first half
    movq        [DEST + 2*strideq], xmm5
    movhps      [DEST + strideq], xmm5
%elif %2 == 2
    pxor        xmm0, xmm0              ; zero, for byte -> word unpacking
    %if ARCH_X86_32
    ; free: m3 REG0=m4 m5
    ; input: m1, m7, m2, m6
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    paddsw      xmm3, %3
    paddsw      xmm4, [%1 + 1*16]
    movq        %3, [DEST+2*strideq]
    movq        xmm5, [DEST+r3q]
    punpcklbw   %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw      %3, [%1 + 2*16]
    paddsw      xmm5, %5
    packuswb    xmm3, xmm4
    packuswb    %3, xmm5
    movq        [DEST+0*strideq], xmm3
    movhps      [DEST+1*strideq], xmm3
    movq        [DEST+2*strideq], %3
    movhps      [DEST+r3q], %3
    lea         DEST, [DEST+4*strideq]
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq        %3, [DEST+2*strideq]
    movq        xmm5, [DEST+r3q]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw   %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw      xmm3, %6
    paddsw      xmm4, [%1 + 5*16]
    paddsw      %3, [%1 + 6*16]
    paddsw      xmm5, %4
    packuswb    xmm3, xmm4
    packuswb    %3, xmm5
    movq        [DEST+0*strideq], xmm3
    movhps      [DEST+1*strideq], xmm3
    movq        [DEST+2*strideq], %3
    movhps      [DEST+r3q], %3
    %else
    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq        xmm12, [DEST+2*strideq]
    movq        xmm11, [DEST+r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw   xmm12, xmm0
    punpcklbw   xmm11, xmm0
    paddsw      xmm2, %3
    paddsw      xmm4, TAN3
    paddsw      xmm12, xmm3
    paddsw      xmm11, %5
    packuswb    xmm2, xmm4
    packuswb    xmm12, xmm11
    movq        [DEST+0*strideq], xmm2
    movhps      [DEST+1*strideq], xmm2
    movq        [DEST+2*strideq], xmm12
    movhps      [DEST+r3q], xmm12
    lea         DEST, [DEST+4*strideq]
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq        xmm12, [DEST+2*strideq]
    movq        xmm11, [DEST+r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw   xmm12, xmm0
    punpcklbw   xmm11, xmm0
    paddsw      xmm2, %6
    paddsw      xmm4, REG0
    paddsw      xmm12, xmm5
    paddsw      xmm11, %4
    packuswb    xmm2, xmm4
    packuswb    xmm12, xmm11
    movq        [DEST+0*strideq], xmm2
    movhps      [DEST+1*strideq], xmm2
    movq        [DEST+2*strideq], xmm12
    movhps      [DEST+r3q], xmm12
    %endif
%endif
%endmacro


; IDCT pass on columns.
; iLLM_PASS dct, type -- full column pass over all eight rows.
; Inputs : odd rows in ROW1/ROW3/ROW5/ROW7 (xmm6/xmm4/xmm5/xmm7), even rows
;          in ROW0/ROW2/ROW4/ROW6, TAN3/TAN1 preloaded by iLLM_HEAD.
; Output : handed to FIRST_HALF / SECOND_HALF, which store to the block
;          (type 0), write pixels (type 1) or add to pixels (type 2).
; On x86-32 XMMS aliases TAN1 (both xmm2), so TAN1 is parked in [BLOCK]
; (row 0, already loaded into REG0 by then) while XMMS is in use.
%macro iLLM_PASS 2 ; %1=dct  %2=type (0=normal, 1=put, 2=add)
    ; --- odd rows: tan3 / tan1 products, butterflies, sqrt2 scaling ---
    movdqa      xmm1, TAN3
    movdqa      xmm3, TAN1
    pmulhw      TAN3, xmm4
    pmulhw      xmm1, xmm5
    paddsw      TAN3, xmm4
    paddsw      xmm1, xmm5
    psubsw      TAN3, xmm5
    paddsw      xmm1, xmm4
    pmulhw      xmm3, xmm7
    pmulhw      TAN1, xmm6
    paddsw      xmm3, xmm6
    psubsw      TAN1, xmm7
    movdqa      xmm7, xmm3
    movdqa      xmm6, TAN1
    psubsw      xmm3, xmm1
    psubsw      TAN1, TAN3
    paddsw      xmm1, xmm7
    paddsw      TAN3, xmm6
    movdqa      xmm6, xmm3
    psubsw      xmm3, TAN3
    paddsw      TAN3, xmm6
    movdqa      xmm4, [sqrt2]
    pmulhw      xmm3, xmm4
    pmulhw      TAN3, xmm4
    paddsw      TAN3, TAN3
    paddsw      xmm3, xmm3
    ; --- even rows: tan2 products on rows 2/6, butterflies with rows 0/4 ---
    movdqa      xmm7, [tan2]
    MOV32       ROW2, REG2              ; x86-32: load row 2 from the block
    MOV32       ROW6, REG6              ; x86-32: load row 6 from the block
    movdqa      xmm5, xmm7
    pmulhw      xmm7, REG6
    pmulhw      xmm5, REG2
    paddsw      xmm7, REG2
    psubsw      xmm5, REG6
    MOV32       ROW0, REG0              ; x86-32: load row 0 from the block
    MOV32       ROW4, REG4              ; x86-32: load row 4 from the block
    MOV32       TAN1, [BLOCK]           ; x86-32: spill TAN1, XMMS reuses xmm2
    movdqa      XMMS, REG0
    psubsw      REG0, REG4
    paddsw      REG4, XMMS
    movdqa      XMMS, REG4
    psubsw      REG4, xmm7
    paddsw      xmm7, XMMS
    movdqa      XMMS, REG0
    psubsw      REG0, xmm5
    paddsw      xmm5, XMMS
    ; --- combine even and odd parts for rows 1, 2, 5, 6 ---
    movdqa      XMMS, xmm5
    psubsw      xmm5, TAN3
    paddsw      TAN3, XMMS
    movdqa      XMMS, REG0
    psubsw      REG0, xmm3
    paddsw      xmm3, XMMS
    MOV32       [BLOCK], TAN1           ; x86-32: reload the spilled TAN1

    FIRST_HALF  %1, %2

    ; --- combine even and odd parts for rows 0, 7, 3, 4 ---
    movdqa      xmm0, xmm7
    movdqa      xmm4, REG4
    psubsw      xmm7, xmm1
    psubsw      REG4, TAN1
    paddsw      xmm1, xmm0
    paddsw      TAN1, xmm4

    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro

; IDCT pass on columns, assuming rows 4-7 are zero
; Only rows 0-3 are read: ROW1 (xmm6), ROW3 (xmm4), ROW0 via REG0 and ROW2
; via SREG2. The output stage is shared with iLLM_PASS.
%macro iLLM_PASS_SPARSE 2 ; %1=dct  %2=type (0=normal, 1=put, 2=add)
    ; --- odd rows 1 and 3 ---
    pmulhw      TAN3, xmm4
    paddsw      TAN3, xmm4
    movdqa      xmm3, xmm6
    pmulhw      TAN1, xmm6
    movdqa      xmm1, xmm4
    psubsw      xmm3, xmm1
    paddsw      xmm1, xmm6
    movdqa      xmm6, TAN1
    psubsw      TAN1, TAN3
    paddsw      TAN3, xmm6
    movdqa      xmm6, xmm3
    psubsw      xmm3, TAN3
    paddsw      TAN3, xmm6
    movdqa      xmm4, [sqrt2]
    pmulhw      xmm3, xmm4
    pmulhw      TAN3, xmm4
    paddsw      TAN3, TAN3
    paddsw      xmm3, xmm3
    ; --- even rows 0 and 2 ---
    movdqa      xmm5, [tan2]
    MOV32       ROW2, SREG2             ; x86-32: load row 2 from the block
    pmulhw      xmm5, SREG2
    MOV32       ROW0, REG0              ; x86-32: load row 0 from the block
    movdqa      xmm6, REG0
    psubsw      xmm6, SREG2
    paddsw      SREG2, REG0
    MOV32       TAN1, [BLOCK]           ; x86-32: spill TAN1, XMMS reuses xmm2
    movdqa      XMMS, REG0
    psubsw      REG0, xmm5
    paddsw      xmm5, XMMS
    ; --- combine even and odd parts for rows 1, 2, 5, 6 ---
    movdqa      XMMS, xmm5
    psubsw      xmm5, TAN3
    paddsw      TAN3, XMMS
    movdqa      XMMS, REG0
    psubsw      REG0, xmm3
    paddsw      xmm3, XMMS
    MOV32       [BLOCK], TAN1           ; x86-32: reload the spilled TAN1

    FIRST_HALF  %1, %2

    ; --- combine even and odd parts for rows 0, 7, 3, 4 ---
    movdqa      xmm0, SREG2
    movdqa      xmm4, xmm6
    psubsw      SREG2, xmm1
    psubsw      xmm6, TAN1
    paddsw      xmm1, xmm0
    paddsw      TAN1, xmm4

    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro

;-----------------------------------------------------------------------
; IDCT_SSE2 type -- emit one entry point.
;   type 0: xvid_idct     (block)               result written to block
;   type 1: xvid_idct_put (dest, stride, block) result written to dest
;   type 2: xvid_idct_add (dest, stride, block) result added to dest
; Accesses: block with movdqa in 16-byte rows (rows 0..7 at block + n*16).
; Flow:
;   1. Rows 0-2 are always row-transformed.
;   2. Rows 3-7 are each tested for being all zero; GPR0..GPR3 receive the
;      masks. A zero row is skipped (its register was cleared beforehand),
;      except row 5, which is also transformed when execution falls through
;      from .2 (row 4 non-zero).
;   3. If rows 4-7 are all zero, the sparse column pass is used; otherwise
;      the remaining rows are transformed and the full column pass runs.
; GPR0..GPR3 are r1d..r4d when no argument registers need preserving
; (type 0, or x86-32 where arguments are loaded later by FIRST_HALF), and
; r3d..r6d on x86-64 put/add, where r0-r2 hold dest, stride and block.
; NOTE(review): the row tests use mm0-mm2 and no emms is issued in this
; file; confirm that callers take care of the MMX/x87 state.
;-----------------------------------------------------------------------
%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
%if %1 == 0 || ARCH_X86_32
    %define GPR0  r1d
    %define GPR1  r2d
    %define GPR2  r3d
    %define GPR3  r4d
    %define NUM_GPRS 5
%else
    %define GPR0  r3d
    %define GPR1  r4d
    %define GPR2  r5d
    %define GPR3  r6d
    %define NUM_GPRS 7
%endif
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
    %if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %endif
    %if ARCH_X86_64
    %xdefine BLOCK blockq
    %else
    mov         r0q, blockm
    %xdefine BLOCK r0q
    %endif
%endif
    movq        mm0, [pb_127]           ; constant for the zero-row tests
    iMTX_MULT   BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
    iMTX_MULT   BLOCK + 1*16, iTab2, PUT_ODD,  ROW1, 1*16
    iMTX_MULT   BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16

    ; GPR0 = row 3 mask, GPR1 = row 4 mask
    TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
    JZ          GPR0, col1
    iMTX_MULT   BLOCK + 3*16, iTab4, PUT_ODD,  ROW3, 3*16
.col1:
    ; GPR0 = row 5 mask, GPR2 = row 6 mask, GPR3 = row 7 mask
    TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
    TEST_ONE_ROW  BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi

    iLLM_HEAD
    JNZ         GPR1, 2                 ; row 4 non-zero
    JNZ         GPR0, 3                 ; row 5 non-zero
    JNZ         GPR2, 4                 ; row 6 non-zero
    JNZ         GPR3, 5                 ; row 7 non-zero
    iLLM_PASS_SPARSE BLOCK, %1          ; rows 4-7 are all zero
    jmp         .6
.2:
    iMTX_MULT   BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
    iMTX_MULT   BLOCK + 5*16, iTab4, PUT_ODD,  ROW5, 4*16
    JZ          GPR2, col2
.4:
    iMTX_MULT   BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
    JZ          GPR3, col3
.5:
    iMTX_MULT   BLOCK + 7*16, iTab2, PUT_ODD,  ROW7, 5*16
.col3:
%if ARCH_X86_32
    ; TAN3/TAN1 are xmm0/xmm2 here and were overwritten by iMTX_MULT
    iLLM_HEAD
%endif
    iLLM_PASS   BLOCK, %1
.6:
    RET
%endmacro

INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
IDCT_SSE2 2