; XVID MPEG-4 VIDEO CODEC
;
; Conversion from gcc syntax to x264asm syntax with modifications
; by Christophe Gisquet <christophe.gisquet@gmail.com>
;
; ===========     SSE2 inverse discrete cosine transform     ===========
;
; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
;
; Conversion to gcc syntax with modifications
; by Alexander Strange <astrange@ithinksw.com>
;
; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication,
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
;
; More details at http://skal.planet-d.net/coding/dct.html
;
; =======     MMX and XMM forward discrete cosine transform     =======
;
; Copyright(C) 2001 Peter Ross <pross@xvid.org>
;
; Originally provided by Intel at AP-922
; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
; but in a limited edition.
; New macro implements a column part for precise iDCT
; The routine precision now satisfies IEEE standard 1180-1990.
;
; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
;
; http://www.elecard.com/peter/idct.html
; http://www.linuxvideo.org/mpeg2dec/
;
; These examples contain code fragments for first stage iDCT 8x8
; (for rows) and first stage DCT 8x8 (for columns)
;
; conversion to gcc syntax by Michael Niedermayer
;
; ======================================================================
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Column-pass multipliers. Each constant is one 16-bit word replicated across
; all eight lanes and is combined with the rows through pmulhw (signed
; high-half multiply, i.e. a multiplication by value/65536).
;   tan1  13036 ~= tan(1*pi/16) * 65536
;   tan2  27146 ~= tan(2*pi/16) * 65536
;   tan3  43790 ~= tan(3*pi/16) * 65536; this does not fit a signed word, so
;         pmulhw sees it as 43790-65536 and the column pass adds the
;         multiplicand back (pmulhw followed by paddsw with the same row).
;   sqrt2 23170 ~= (sqrt(2)/2) * 32768; the column pass doubles the pmulhw
;         result (paddsw x, x) to get a multiplication by sqrt(2)/2.
; Similar to tg_1_16 in MMX code
tan1:   times 8 dw 13036
tan2:   times 8 dw 27146
tan3:   times 8 dw 43790
sqrt2:  times 8 dw 23170

; SSE2 tables
; Row-pass coefficient tables for iMTX_MULT. Each iTabN holds 4 x 16 bytes of
; 16-bit words that are used as pmaddwd operands at byte offsets 0, 16, 32
; and 48 from the table label.
; IDCT_SSE2 pairs the tables with the rows as follows:
;   iTab1: rows 0 and 4      iTab2: rows 1 and 7
;   iTab3: rows 2 and 6      iTab4: rows 3 and 5
iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e

; Similar to rounder_0 in MMX code
; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
; Each full entry is one dword replicated four times (16 bytes). iMTX_MULT
; adds the entry selected by its optional 5th argument (a byte offset) to the
; first accumulator before the >>11 shift.
; Offsets passed by IDCT_SSE2: row 0 -> 0*16, row 1 -> 1*16, row 2 -> 2*16,
; row 3 -> 3*16, row 5 -> 4*16, rows 6 and 7 -> 5*16. Row 4 is transformed
; without a rounder, so the trailing zero entry (only 8 bytes long) is not
; referenced by the SSE2 code that uses this table here.
walkenIdctRounders: times 4 dd 65536
                    times 4 dd  3597
                    times 4 dd  2260
                    times 4 dd  1203
                    times 4 dd   120
                    times 4 dd   512
                    times 2 dd     0

; Eight bytes of 127, loaded into mm0 by IDCT_SSE2. Adding it with unsigned
; saturation (paddusb) sets the top bit of every nonzero byte, so pmovmskb
; returns a nonzero mask exactly when the tested row has a nonzero byte.
pb_127: times 8 db 127

SECTION .text

; Temporary storage before the column pass
; The odd rows are kept in these xmm registers on both x86_32 and x86_64;
; the column-pass macros read them under their raw names (xmm6/xmm4/xmm5/xmm7).
%define ROW1 xmm6
%define ROW3 xmm4
%define ROW5 xmm5
%define ROW7 xmm7

; CLEAR_ODD reg
; Zero the given xmm register. Used as the "clear" action of the row tests so
; that a row whose transform is skipped still reads as zero in the column pass.
%macro CLEAR_ODD 1
    pxor      %1, %1
%endmacro
; PUT_ODD reg
; Copy the row-pass result from xmm2 into reg. Immediate 0x1B reverses the
; order of the four high words; the four low words are copied unchanged.
%macro PUT_ODD 1
    pshufhw   %1, xmm2, 0x1B
%endmacro

; MOV32 src, dst
; x86_32 only: emits "movdqa dst, src". Note the operand order is SOURCE
; first, the reverse of the instruction it expands to. Expands to nothing on
; x86_64, where the even rows and temporaries have dedicated registers.
%macro MOV32 2
%if ARCH_X86_32
    movdqa    %2, %1
%endif
%endmacro

; CLEAR_EVEN row
; x86_64: zero the register that holds an even row.
; x86_32: expands to nothing; there the even ROWn names are memory operands
; inside the block, which is left as it is.
%macro CLEAR_EVEN 1
%if ARCH_X86_64
    CLEAR_ODD %1
%endif
%endmacro

; PUT_EVEN row
; Store the row-pass result held in xmm2 into an even row, with the same
; high-word reversal as PUT_ODD.
; x86_64: row is a register, so this is PUT_ODD.
; x86_32: row is a memory operand; pshufhw cannot write to memory, so xmm2 is
;         shuffled in place and then stored with an aligned move.
%macro PUT_EVEN 1
%if ARCH_X86_64
    PUT_ODD   %1
%else
    pshufhw xmm2, xmm2, 0x1B
    movdqa    %1, xmm2
%endif
%endmacro

; Assignment of the even rows and of the column-pass temporaries.
; x86_64: each even row has its own register (xmm8..xmm11) and ROWn is the
;         same thing as REGn; XMMS, TAN3 and TAN1 get xmm12..xmm14.
; x86_32: only xmm0..xmm7 are available, so the even rows stay in the block
;         ([BLOCK + n*16]) and are loaded on demand into shared registers:
;         REG0 and REG2 are both xmm4, REG4 and REG6 are both xmm6, and
;         XMMS and TAN1 are both xmm2. The MOV32 invocations in the column
;         pass perform the loads and spills this sharing requires.
%if ARCH_X86_64
%define ROW0  xmm8
%define REG0  ROW0
%define ROW2  xmm9
%define REG2  ROW2
%define ROW4  xmm10
%define REG4  ROW4
%define ROW6  xmm11
%define REG6  ROW6
%define XMMS  xmm12
%define SREG2 REG2
%define TAN3  xmm13
%define TAN1  xmm14
%else
%define ROW0  [BLOCK + 0*16]
%define REG0  xmm4
%define ROW2  [BLOCK + 2*16]
%define REG2  xmm4
%define ROW4  [BLOCK + 4*16]
%define REG4  xmm6
%define ROW6  [BLOCK + 6*16]
%define REG6  xmm6
%define XMMS  xmm2
%define SREG2 xmm7
%define TAN3  xmm0
%define TAN1  xmm2
%endif

; JZ reg, label
; Jump to the function-local label ".label" when reg is zero. Clobbers flags.
%macro JZ  2
    test      %1, %1
    jz       .%2
%endmacro

; JNZ reg, label
; Jump to the function-local label ".label" when reg is nonzero.
; Clobbers flags.
%macro JNZ  2
    test      %1, %1
    jnz      .%2
%endmacro

; TEST_ONE_ROW src, reg, clear, arg
; Invoke "clear arg" (CLEAR_ODD/CLEAR_EVEN on the row storage), then set the
; 32-bit register reg to a nonzero value iff the 16-byte row at src contains
; a nonzero byte: both 8-byte halves are ORed together, mm0 is added with
; unsigned saturation and the byte sign bits are collected with pmovmskb.
; Requires mm0 to hold pb_127 (set up by IDCT_SSE2). Clobbers mm1.
%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
    %3        %4
    movq     mm1, [%1]
    por      mm1, [%1 + 8]
    paddusb  mm1, mm0
    pmovmskb  %2, mm1
%endmacro

; TEST_TWO_ROWS row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
; Two-row variant of TEST_ONE_ROW: runs "clear1 arg1" and "clear2 arg2", then
; sets reg1 (for the 16 bytes at row1) and reg2 (for the 16 bytes at row2) to
; a nonzero value iff the corresponding row contains a nonzero byte.
; Requires mm0 to hold pb_127. Clobbers mm1 and mm2.
;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro  TEST_TWO_ROWS  8
    %5         %6
    %7         %8
    movq      mm1, [%1 + 0]
    por       mm1, [%1 + 8]
    movq      mm2, [%2 + 0]
    por       mm2, [%2 + 8]
    paddusb   mm1, mm0
    paddusb   mm2, mm0
    pmovmskb   %3, mm1
    pmovmskb   %4, mm2
%endmacro

; IDCT pass on rows.
; iMTX_MULT src, table, put, arg [, rounder]
;   src      address expression of one 16-byte row; read with movdqa, so it
;            has to be 16-byte aligned
;   table    coefficient table (iTab1..iTab4), read at offsets 0/16/32/48
;   put, arg store macro and its operand, invoked last to move the result out
;            of xmm2 (PUT_ODD/PUT_EVEN with a ROWn)
;   rounder  optional byte offset into walkenIdctRounders; when given, that
;            entry is added to the first accumulator
; With a = the two pmaddwd products against table+0/+16 (plus the rounder)
; and b = the two products against table+32/+48, xmm2 ends up holding
; (a + b) >> 11 in its four low words and (a - b) >> 11 in its four high
; words, packed with signed saturation.
; Clobbers xmm0-xmm3.
%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder
    movdqa       xmm3, [%1]
    movdqa       xmm0, xmm3
    pshufd       xmm1, xmm3, 0x11 ; 4602
    punpcklqdq   xmm0, xmm0       ; 0246
    pmaddwd      xmm0, [%2]
    pmaddwd      xmm1, [%2+16]
    pshufd       xmm2, xmm3, 0xBB ; 5713
    punpckhqdq   xmm3, xmm3       ; 1357
    pmaddwd      xmm2, [%2+32]
    pmaddwd      xmm3, [%2+48]
    paddd        xmm0, xmm1
    paddd        xmm2, xmm3
%if %0 == 5
    paddd        xmm0, [walkenIdctRounders+%5]
%endif
    movdqa       xmm3, xmm2
    paddd        xmm2, xmm0
    psubd        xmm0, xmm3
    psrad        xmm2, 11
    psrad        xmm0, 11
    packssdw     xmm2, xmm0
    %3           %4
%endmacro

; iLLM_HEAD
; Load the tan3 and tan1 constants into TAN3 and TAN1 for the column pass.
; On x86_32 these are xmm0 and xmm2, both of which iMTX_MULT clobbers, so the
; load has to happen after the last row transform.
%macro iLLM_HEAD 0
    movdqa   TAN3, [tan3]
    movdqa   TAN1, [tan1]
%endmacro

; FIRST_HALF dct, type
; Finish output rows 1, 2, 5 and 6 of the column pass, which are held in
; TAN3, xmm3, REG0 and xmm5 respectively: shift them right by 6 and then
;   type 0 (normal): store them back into the coefficient block at dct;
;   type 1 (put):    pack to unsigned bytes and write rows 1 and 2 to DEST;
;                    rows 6 (low half) and 5 (high half) stay packed in xmm5
;                    and are written by SECOND_HALF;
;   type 2 (add):    x86_32 spills the four rows to the block because there
;                    are not enough registers; x86_64 keeps them in registers
;                    for SECOND_HALF.
; For types 1 and 2 this macro also (re)defines DEST (r2q on x86_32, after
; loading dest and stride from their stack slots; r0q on x86_64) and sets
; r3q = 3*stride. SECOND_HALF relies on both.
%macro FIRST_HALF 2  ; %1=dct  %2=type (0=normal, 1=put, 2=add)
    psraw    xmm5, 6
    psraw    REG0, 6
    psraw    TAN3, 6
    psraw    xmm3, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa   [%1+1*16], TAN3
    movdqa   [%1+2*16], xmm3
    movdqa   [%1+5*16], REG0
    movdqa   [%1+6*16], xmm5
%else
    ; Must now load args as gprs are no longer used for masks
    ; DEST is set to where address of dest was loaded
    %if ARCH_X86_32
        %if %2 == 2 ; Not enough xmms, store
    movdqa   [%1+1*16], TAN3
    movdqa   [%1+2*16], xmm3
    movdqa   [%1+5*16], REG0
    movdqa   [%1+6*16], xmm5
        %endif
    %xdefine DEST r2q ; BLOCK is r0, stride r1
    movifnidn DEST, destm
    movifnidn strideq, stridem
    %else
    %xdefine DEST r0q
    %endif
    lea      r3q, [3*strideq]
    %if %2 == 1
    packuswb TAN3, xmm3
    packuswb xmm5, REG0
    movq     [DEST + strideq], TAN3
    movhps   [DEST + 2*strideq], TAN3
    ; REG0 and TAN3 are now available (and likely used in second half)
    %endif
%endif
%endmacro

; SECOND_HALF dct, type, r0, r7, r3, r4
; Finish the column pass. Arguments 3..6 are the xmm registers holding output
; rows 0, 7, 3 and 4 (in that argument order); they are shifted right by 6
; and then
;   type 0 (normal): stored into the coefficient block at dct;
;   type 1 (put):    packed to unsigned bytes and written to DEST together
;                    with rows 5 and 6 left packed in xmm5 by FIRST_HALF;
;   type 2 (add):    all eight rows are added with signed saturation to the
;                    8 destination pixels of each line (zero-extended to
;                    words), packed back to unsigned bytes and stored.
;                    Rows 1, 2, 5, 6 come from the block on x86_32 and from
;                    TAN3, xmm3, REG0, xmm5 on x86_64.
; Requires DEST, strideq and r3q (= 3*stride) as set up by FIRST_HALF for
; types 1 and 2. DEST is advanced by 4*stride.
%macro SECOND_HALF 6 ; %1=dct  %2=type (0=normal, 1=put, 2=add) 3-6: xmms
    psraw    %3, 6
    psraw    %4, 6
    psraw    %5, 6
    psraw    %6, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa   [%1+0*16], %3
    movdqa   [%1+3*16], %5
    movdqa   [%1+4*16], %6
    movdqa   [%1+7*16], %4
%elif %2 == 1
    packuswb %3, %5
    packuswb %6, %4
    ; address of dest may have been loaded
    movq     [DEST], %3
    movhps   [DEST + r3q], %3
    lea      DEST, [DEST + 4*strideq]
    movq     [DEST], %6
    movhps   [DEST + r3q], %6
    ; and now write remainder of first half
    movq     [DEST + 2*strideq], xmm5
    movhps   [DEST + strideq], xmm5
%elif %2 == 2
    pxor        xmm0, xmm0
    %if ARCH_X86_32
    ; free: m3 REG0=m4 m5
    ; input: m1, m7, m2, m6
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    paddsw      xmm3, %3
    paddsw      xmm4, [%1 + 1*16]
    movq          %3, [DEST+2*strideq]
    movq        xmm5, [DEST+      r3q]
    punpcklbw     %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw        %3, [%1 + 2*16]
    paddsw      xmm5, %5
    packuswb    xmm3, xmm4
    packuswb      %3, xmm5
    movq    [DEST+0*strideq], xmm3
    movhps  [DEST+1*strideq], xmm3
    movq    [DEST+2*strideq], %3
    movhps  [DEST+      r3q], %3
    lea         DEST, [DEST+4*strideq]
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq          %3, [DEST+2*strideq]
    movq        xmm5, [DEST+      r3q]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw     %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw      xmm3, %6
    paddsw      xmm4, [%1 + 5*16]
    paddsw        %3, [%1 + 6*16]
    paddsw      xmm5, %4
    packuswb    xmm3, xmm4
    packuswb      %3, xmm5
    movq    [DEST+0*strideq], xmm3
    movhps  [DEST+1*strideq], xmm3
    movq    [DEST+2*strideq], %3
    movhps  [DEST+      r3q], %3
    %else
    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq       xmm12, [DEST+2*strideq]
    movq       xmm11, [DEST+      r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw  xmm12, xmm0
    punpcklbw  xmm11, xmm0
    paddsw      xmm2, %3
    paddsw      xmm4, TAN3
    paddsw     xmm12, xmm3
    paddsw     xmm11, %5
    packuswb    xmm2, xmm4
    packuswb   xmm12, xmm11
    movq    [DEST+0*strideq], xmm2
    movhps  [DEST+1*strideq], xmm2
    movq    [DEST+2*strideq], xmm12
    movhps  [DEST+      r3q], xmm12
    lea         DEST, [DEST+4*strideq]
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq       xmm12, [DEST+2*strideq]
    movq       xmm11, [DEST+      r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw  xmm12, xmm0
    punpcklbw  xmm11, xmm0
    paddsw      xmm2, %6
    paddsw      xmm4, REG0
    paddsw     xmm12, xmm5
    paddsw     xmm11, %4
    packuswb    xmm2, xmm4
    packuswb   xmm12, xmm11
    movq    [DEST+0*strideq], xmm2
    movhps  [DEST+1*strideq], xmm2
    movq    [DEST+2*strideq], xmm12
    movhps  [DEST+      r3q], xmm12
    %endif
%endif
%endmacro


; IDCT pass on columns.
; iLLM_PASS dct, type
; Expects on entry: TAN3/TAN1 loaded by iLLM_HEAD, odd rows 1, 3, 5, 7 in
; xmm6, xmm4, xmm5, xmm7 (ROW1/ROW3/ROW5/ROW7) and even rows in ROW0..ROW6
; (registers on x86_64, block memory on x86_32). Output goes through
; FIRST_HALF (rows 1, 2, 5, 6) and SECOND_HALF (rows 0, 3, 4, 7) according to
; type. Uses xmm0-xmm7, plus XMMS/TAN1/TAN3 and the REGn registers.
%macro iLLM_PASS  2  ; %1=dct  %2=type (0=normal, 1=put, 2=add)
    ; odd part: rows 3/5 combined through tan3, rows 1/7 through tan1
    movdqa   xmm1, TAN3
    movdqa   xmm3, TAN1
    pmulhw   TAN3, xmm4
    pmulhw   xmm1, xmm5
    paddsw   TAN3, xmm4
    paddsw   xmm1, xmm5
    psubsw   TAN3, xmm5
    paddsw   xmm1, xmm4
    pmulhw   xmm3, xmm7
    pmulhw   TAN1, xmm6
    paddsw   xmm3, xmm6
    psubsw   TAN1, xmm7
    movdqa   xmm7, xmm3
    movdqa   xmm6, TAN1
    psubsw   xmm3, xmm1
    psubsw   TAN1, TAN3
    paddsw   xmm1, xmm7
    paddsw   TAN3, xmm6
    movdqa   xmm6, xmm3
    psubsw   xmm3, TAN3
    paddsw   TAN3, xmm6
    ; scale by the sqrt2 constant: pmulhw followed by doubling
    movdqa   xmm4, [sqrt2]
    pmulhw   xmm3, xmm4
    pmulhw   TAN3, xmm4
    paddsw   TAN3, TAN3
    paddsw   xmm3, xmm3
    ; even part: rows 2/6 combined through tan2 (x86_32 loads them first)
    movdqa   xmm7, [tan2]
    MOV32    ROW2, REG2
    MOV32    ROW6, REG6
    movdqa   xmm5, xmm7
    pmulhw   xmm7, REG6
    pmulhw   xmm5, REG2
    paddsw   xmm7, REG2
    psubsw   xmm5, REG6
    MOV32    ROW0, REG0
    MOV32    ROW4, REG4
    ; x86_32: XMMS and TAN1 are both xmm2, so TAN1 is parked in the first
    ; 16 bytes of the block (row 0 is already in REG0) while xmm2 is scratch
    MOV32    TAN1, [BLOCK]
    movdqa   XMMS, REG0
    psubsw   REG0, REG4
    paddsw   REG4, XMMS
    movdqa   XMMS, REG4
    psubsw   REG4, xmm7
    paddsw   xmm7, XMMS
    movdqa   XMMS, REG0
    psubsw   REG0, xmm5
    paddsw   xmm5, XMMS
    movdqa   XMMS, xmm5
    psubsw   xmm5, TAN3
    paddsw   TAN3, XMMS
    movdqa   XMMS, REG0
    psubsw   REG0, xmm3
    paddsw   xmm3, XMMS
    ; x86_32: reload the parked TAN1
    MOV32    [BLOCK], TAN1

    FIRST_HALF %1, %2

    movdqa   xmm0, xmm7
    movdqa   xmm4, REG4
    psubsw   xmm7, xmm1
    psubsw   REG4, TAN1
    paddsw   xmm1, xmm0
    paddsw   TAN1, xmm4

    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro

; IDCT pass on columns, assuming rows 4-7 are zero
; iLLM_PASS_SPARSE dct, type
; Same contract as iLLM_PASS, with every term that involves rows 4, 5, 6 or 7
; dropped: only xmm6 (row 1), xmm4 (row 3), ROW0 and ROW2 are read. Row 2 is
; handled in SREG2 (xmm7 on x86_32, the ROW2 register on x86_64).
%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type (0=normal, 1=put, 2=add)
    ; odd part with rows 5 and 7 equal to zero
    pmulhw   TAN3, xmm4
    paddsw   TAN3, xmm4
    movdqa   xmm3, xmm6
    pmulhw   TAN1, xmm6
    movdqa   xmm1, xmm4
    psubsw   xmm3, xmm1
    paddsw   xmm1, xmm6
    movdqa   xmm6, TAN1
    psubsw   TAN1, TAN3
    paddsw   TAN3, xmm6
    movdqa   xmm6, xmm3
    psubsw   xmm3, TAN3
    paddsw   TAN3, xmm6
    ; scale by the sqrt2 constant: pmulhw followed by doubling
    movdqa   xmm4, [sqrt2]
    pmulhw   xmm3, xmm4
    pmulhw   TAN3, xmm4
    paddsw   TAN3, TAN3
    paddsw   xmm3, xmm3
    ; even part with rows 4 and 6 equal to zero
    movdqa   xmm5, [tan2]
    MOV32    ROW2, SREG2
    pmulhw   xmm5, SREG2
    MOV32    ROW0, REG0
    movdqa   xmm6, REG0
    psubsw   xmm6, SREG2
    paddsw  SREG2, REG0
    ; x86_32: XMMS and TAN1 are both xmm2, so TAN1 is parked in the first
    ; 16 bytes of the block (row 0 is already in REG0) while xmm2 is scratch
    MOV32    TAN1, [BLOCK]
    movdqa   XMMS, REG0
    psubsw   REG0, xmm5
    paddsw   xmm5, XMMS
    movdqa   XMMS, xmm5
    psubsw   xmm5, TAN3
    paddsw   TAN3, XMMS
    movdqa   XMMS, REG0
    psubsw   REG0, xmm3
    paddsw   xmm3, XMMS
    ; x86_32: reload the parked TAN1
    MOV32    [BLOCK], TAN1

    FIRST_HALF %1, %2

    movdqa   xmm0, SREG2
    movdqa   xmm4, xmm6
    psubsw  SREG2, xmm1
    psubsw   xmm6, TAN1
    paddsw   xmm1, xmm0
    paddsw   TAN1, xmm4

    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro

; IDCT_SSE2 type
; Emit one complete 8x8 IDCT entry point:
;   type 0: xvid_idct(block)                    result written back to block
;   type 1: xvid_idct_put(dest, stride, block)  result stored as bytes
;   type 2: xvid_idct_add(dest, stride, block)  result added to dest
; GPR0..GPR3 are scratch 32-bit registers for the row zero-test masks. They
; start after the argument registers: r1d.. when only block is in a register
; (type 0, or x86_32 where BLOCK is loaded into r0), r3d.. on x86_64 put/add.
; 8+7*ARCH_X86_64 declares 15 xmm registers on x86_64 (xmm0..xmm14, the
; highest one used being TAN1) and 8 on x86_32.
;
; Flow: rows 0-2 are always transformed; row 3 only when it is nonzero. Rows
; 4-7 are tested next. When all four are zero the sparse column pass runs;
; otherwise execution enters the chain .2/.3/.4/.5 at the first nonzero row
; among 4, 5, 6, 7. Entry at .2 also transforms row 5 unconditionally; rows 6
; and 7 are skipped when their mask is zero. Then the full column pass runs.
;
; NOTE(review): the zero tests use mm0-mm2 and nothing in this macro issues
; emms; presumably the callers reset the MMX state - confirm.
%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
%if %1 == 0 || ARCH_X86_32
    %define GPR0  r1d
    %define GPR1  r2d
    %define GPR2  r3d
    %define GPR3  r4d
    %define NUM_GPRS 5
%else
    %define GPR0  r3d
    %define GPR1  r4d
    %define GPR2  r5d
    %define GPR3  r6d
    %define NUM_GPRS 7
%endif
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
    %if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %endif
    %if ARCH_X86_64
    %xdefine BLOCK blockq
    %else
    mov    r0q, blockm
    %xdefine BLOCK r0q
    %endif
%endif
    ; mm0 = 127 in every byte, required by the TEST_*_ROW* macros
    movq           mm0, [pb_127]
    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16

    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
    JZ   GPR0, col1
    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
.col1:
    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi

    ; masks: GPR1 = row 4, GPR0 = row 5, GPR2 = row 6, GPR3 = row 7
    iLLM_HEAD
    JNZ  GPR1, 2
    JNZ  GPR0, 3
    JNZ  GPR2, 4
    JNZ  GPR3, 5
    iLLM_PASS_SPARSE BLOCK, %1
    jmp .6
.2:
    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
    JZ   GPR2, col2
.4:
    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
    JZ   GPR3, col3
.5:
    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
.col3:
    ; x86_32: TAN3/TAN1 are xmm0/xmm2, which the row transforms above clobber
%if ARCH_X86_32
    iLLM_HEAD
%endif
    iLLM_PASS     BLOCK, %1
.6:
    RET
%endmacro

; Instantiate the three SSE2 entry points declared by IDCT_SSE2:
; xvid_idct (type 0), xvid_idct_put (type 1) and xvid_idct_add (type 2).
INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
IDCT_SSE2 2
573