;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

; IDCT multiplier table: seven 16-bit constants, each replicated into eight
; words (16 bytes per entry).  Entry x (1..7) is addressed through the C(x)
; macro as [vp3_idct_data+16*(x-1)].  The values equal
; round(65536*cos(x*pi/16)).  Entries 1..5 exceed 32767, so the signed pmulhw
; sees them as (c - 1.0); that is why the IDCT code adds the input back after
; multiplying by them ("c3*i3 - i3" followed by "+ i3").
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

; Byte constants (8 bytes each) used only by VP3_LOOP_FILTER.
pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

; Constants defined outside this file.
cextern pb_1
cextern pb_3
cextern pb_80
cextern pb_FE

cextern pw_8

SECTION .text

; VP3_LOOP_FILTER: filters eight pixels (one byte lane each) across an edge.
;
; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
;      r2 = base pointer; 8 bytes are read from [r2+516] as the per-byte
;           filter limit ("flim")
; out: p1 in mm4, p2 in mm3
; clobbers: m0-m7 (every MMX register is used as input, output or scratch)
; note: uses pavgb/pminub, so the surrounding function must be built for an
;       instruction set that has them (the callers here use INIT_MMX mmxext)
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3 (mask removes bits shifted in from the neighbouring byte)
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0        ; m0 = all ones
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7        ; m6 = magnitude of negative d (0 where d >= 0)
    psubusb       m7, [pb_81]   ; m7 = magnitude of positive d (0 where d <= 0)

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5        ; a = min(|d|, flim)
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6        ; 2*a
    paddb         m7, m7
    pminub        m6, m5        ; min(2*a, flim)
    pminub        m7, m5
    psubb         m6, m0        ; min(2*a, flim) - a
    psubb         m7, m1
    paddusb       m4, m7        ; p1 += positive part, saturating
    psubusb       m4, m6        ; p1 -= negative part, saturating
    psubusb       m3, m7        ; p2 gets the opposite adjustment
    paddusb       m3, m6
%endmacro

; STORE_4_WORDS reg
; Writes the four 16-bit words held in MMX register %1 (lowest word first) to
; [r0-1], [r0+r1-1], [r0+r1*2-1] and [r0+r3-1].
; in:  r0 = base address, r1 = row step, r3 = third row offset (the caller
;      sets r3 = 3*r1)
; clobbers: r2 (scratch) and %1 (shifted right by 32 bits)
%macro STORE_4_WORDS 1
    movd         r2d, %1        ; r2 = words 1:0
    mov  [r0     -1], r2w
    psrlq         %1, 32        ; bring words 3:2 into the low dword
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1        ; r2 = words 3:2
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

; vp3_v_loop_filter: applies VP3_LOOP_FILTER across a horizontal edge,
; eight pixels wide.
; cglobal declares 3 arguments and 4 general-purpose registers:
;   r0 = pointer to the first row below the edge
;   r1 = row stride in bytes
;   r2 = base pointer read by VP3_LOOP_FILTER at [r2+516]
;   r3 = scratch (copy of the positive stride)
; Reads rows r0-2*stride .. r0+stride, rewrites rows r0-stride and r0.
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
    mov           r3, r1        ; r3 = +stride
    neg           r1            ; r1 = -stride
    movq          m6, [r0+r1*2] ; p0
    movq          m4, [r0+r1  ] ; p1
    movq          m2, [r0     ] ; p2
    movq          m1, [r0+r3  ] ; p3

    VP3_LOOP_FILTER

    movq     [r0+r1], m4        ; filtered p1
    movq     [r0   ], m3        ; filtered p2
    RET

; vp3_h_loop_filter: applies VP3_LOOP_FILTER across a vertical edge,
; eight rows tall.
; cglobal declares 3 arguments and 4 general-purpose registers:
;   r0 = pointer to the first pixel right of the edge in the top row
;   r1 = row stride in bytes
;   r2 = base pointer read by VP3_LOOP_FILTER at [r2+516]; reused as scratch
;        by STORE_4_WORDS once the filter has run
;   r3 = 3*stride
; Four bytes (columns -2..+1) are gathered from each of the eight rows,
; transposed so each register holds one pixel column, filtered, and the two
; middle columns are written back as one 16-bit word per row at column -1.
; Relies on the mmxext instruction set selected by the INIT_MMX line that
; precedes vp3_v_loop_filter.
cglobal vp3_h_loop_filter, 3, 4
    lea           r3, [r1*3]

    movd          m6, [r0     -2] ; rows 0..3, columns -2..+1
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ] ; advance to row 4
    punpcklbw     m6, [r0     -2] ; interleave row n with row n+4
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1          ; r0 is back at row 0 (4 - 3 - 1 strides)

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5     ; pair filtered p1/p2 bytes into per-row words

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

; PAVGB_NO_RND: byte-wise average of two register pairs, computed as
;     (a & b) + (((a ^ b) & m6) >> 1)
; in:  m0/m1 = first pair, m2/m3 = second pair,
;      m6    = byte mask applied before the shift (the caller loads pb_FE so
;              the 64-bit psrlq cannot carry a bit into the byte below)
; out: m4 = result for (m0, m1), m5 = result for (m2, m3)
; clobbers: m1, m3
%macro PAVGB_NO_RND 0
    mova   m4, m0
    mova   m5, m2
    pand   m4, m1               ; a & b
    pand   m5, m3
    pxor   m1, m0               ; a ^ b
    pxor   m3, m2
    pand   m1, m6               ; masked so the shift stays inside each byte
    pand   m3, m6
    psrlq  m1, 1
    psrlq  m3, 1
    paddb  m4, m1
    paddb  m5, m3
%endmacro

; put_vp_no_rnd_pixels8_l2(dst, src1, src2, stride, h)
; Writes to dst the PAVGB_NO_RND average of src1 and src2, eight bytes per
; row, four rows per loop iteration.  The same stride is applied to all three
; pointers.
; h must be a positive multiple of 4: the loop subtracts 4 and exits only
; when the counter reaches exactly zero.
; stride3 is an extra scratch register holding 3*stride.
INIT_MMX mmx
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    mova   m6, [pb_FE]          ; mask consumed by PAVGB_NO_RND
    lea    stride3q,[strideq+strideq*2]
.loop:
    ; rows 0 and 1
    mova   m0, [src1q]
    mova   m1, [src2q]
    mova   m2, [src1q+strideq]
    mova   m3, [src2q+strideq]
    PAVGB_NO_RND
    mova   [dstq], m4
    mova   [dstq+strideq], m5

    ; rows 2 and 3
    mova   m0, [src1q+strideq*2]
    mova   m1, [src2q+strideq*2]
    mova   m2, [src1q+stride3q]
    mova   m3, [src2q+stride3q]
    PAVGB_NO_RND
    mova   [dstq+strideq*2], m4
    mova   [dstq+stride3q],  m5

    lea    src1q, [src1q+strideq*4]
    lea    src2q, [src2q+strideq*4]
    lea    dstq,  [dstq+strideq*4]
    sub    hd, 4
    jnz .loop
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
;
; Shared first half of RowIDCT and ColumnIDCT.  The expanding context must
; define I(x), J(x) (coefficient rows) and C(x) (cosine constants).
; Multiplications by C(1)..C(5) yield (c-1)*x because of the signed pmulhw,
; hence the paddw that follows each of them.
;
; On exit:
;   m0 = C.   m1 = H.   m2 = R2   m3 = scratch (i0 + i4)
;   m4 = E    m5 = B..  m6 = F.   m7 = G
;   I(1) holds C. and I(2) holds D. (both input rows are overwritten)
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c6*i6 + c2*i2
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B. + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
;
; Runs BeginIDCT and completes the butterfly without rounding or shifting.
; On exit the eight result rows are laid out as the Transpose macro expects:
;   m0 = R0, I(1) = R1, m2 = R2, m3 = R3, m4 = R4, m5 = R5, m6 = R6, m7 = R7
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3        ; r3 = D. + D.
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0        ; r0 = C. + C.
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
;
; Runs BeginIDCT, then adds the rounding constant OC_8 (defined by the
; expanding context) and shifts every result right by 4 arithmetically.  Each
; butterfly pair shares one OC_8 addition: it is added to the difference term
; before the doubled operand is added back to form the sum.
; Results NR0..NR7 are stored to I(0), I(1), I(2), I(3), J(4), J(5), J(6), J(7).
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; In the register names below, rN denotes MMX register mN; each register
; holds four 16-bit words, written highest word first.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
;  I(0) doubles as a spill slot for r0 while the Js are computed.
;  All of m0-m7 are overwritten.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

; VP3_1D_IDCT_SSE2: one 1-D IDCT pass over eight rows of eight 16-bit
; coefficients, using 128-bit registers.
;
; The expanding context must define:
;   I(x)     - memory operand for input row x (I(1) and I(2) are also used as
;              temporaries and get overwritten with C. and D.)
;   C(x)     - memory operand for cosine constant x
;   ADD(r)   - rounding step applied to a result before shifting (may be empty)
;   SHIFT(r) - normalisation shift applied to a result (may be empty)
; All memory operands are accessed with movdqa and therefore must be 16-byte
; aligned.
;
; On exit m0..m7 hold output rows op0..op7 in order; nothing is stored here.
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = i2 * c2 -i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = i2 * c2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; Save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = C4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 -1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; Load        C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; Adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; Load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; Adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B..= R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; Adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; Adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C.
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

; PUT_BLOCK n0, n1, n2, n3, n4, n5, n6, n7
; Stores registers m<n0>..m<n7> to output rows O(0)..O(7), in that order.
; O(x) must be defined by the expanding context and must name 16-byte-aligned
; memory, since the stores use movdqa.  No register is modified.
%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

; VP3_IDCT base
; In-place 8x8 inverse DCT on the 16-bit coefficient block addressed by %1.
;   %1 - register holding the block base address; row x lives at %1+16*x
;
; The helper macros invoked here (VP3_1D_IDCT_SSE2, RowIDCT, ColumnIDCT,
; Transpose, TRANSPOSE8x8W, PUT_BLOCK) reach their operands through the
; single-line macros I(), J(), O(), C(), ADD() and SHIFT(), which are
; (re)defined before each pass. C(x) indexes the 16-byte rows of
; vp3_idct_data, starting at x = 1.
%macro VP3_IDCT 1
%if mmsize == 16
    ; Input and output rows alias the same memory: the transform is in place.
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
    ; First 1-D pass: ADD/SHIFT expand to nothing, so the results are neither
    ; rounded nor scaled.
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
    ; Transpose the eight result registers so the second pass works on the
    ; other dimension.
%if ARCH_X86_64
    ; Extra argument 8 names m8 (presumably the transpose's scratch register).
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    ; Two memory operands inside the block are passed instead of a ninth
    ; register (presumably used as spill space - see x86util.asm).
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

    ; Second 1-D pass: ADD adds [pw_8] with signed saturation and SHIFT is an
    ; arithmetic right shift by 4 (pw_8 is defined elsewhere; by its name it
    ; holds packed words of 8, giving a rounded divide by 16).
%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; NOTE(review): this branch is only assembled when the macro is expanded
    ; with 8-byte registers; the instantiation visible in this part of the
    ; file uses INIT_XMM and therefore never takes it.
    ;
    ; The register names below look inherited from the original VP3 source;
    ; the code in this branch addresses everything relative to %1.
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself

    ; Row pass over the first 8 bytes of each 16-byte row.
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

    ; Row pass over the second 8 bytes of each 16-byte row.
%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

    ; Column pass, first half: I() starts at byte 0 of the block, J(x) maps
    ; x-4 onto the second 8 bytes of the rows.
%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

    ; Column pass, second half: same layout, 64 bytes (four rows) further on.
%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
561cabdff1aSopenharmony_ci
; vp3_idct_funcs
; Emits the two full-IDCT entry points, vp3_idct_put and vp3_idct_add, for
; the instruction set selected by the preceding INIT_* macro.
;
; Both functions are declared with 3 arguments, 4 general-purpose registers
; and 9 vector registers, and use them as follows:
;   r0 = destination (8 bytes are written per row)
;   r1 = destination stride in bytes
;   r2 = block of 64 16-bit coefficients (128 bytes); transformed in place and
;        zeroed before returning
;   r3 = scratch, set to 3*stride
; NOTE(review): the C prototype is presumably
;   (uint8_t *dest, ptrdiff_t stride, int16_t *block) - confirm against the
;   C-side declaration.
; The store sequences (movhps, rows paired at mmsize*0/mmsize*1) are written
; for 16-byte registers.
%macro vp3_idct_funcs 0
;-----------------------------------------------------------------------------
; vp3_idct_put: IDCT the block and overwrite the destination with the result.
;-----------------------------------------------------------------------------
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    ; pb_80 is defined elsewhere (by its name, bytes of 0x80): adding it to the
    ; signed-saturated bytes below shifts them into the unsigned pixel range.
    mova          m4, [pb_80]
    lea           r3, [r1*3]        ; r3 = 3 * stride
%assign %%i 0
%rep 16/mmsize
    ; Load the even rows, then pack each with the following odd row so that
    ; m0 = rows 0|1, m1 = rows 2|3, m2 = rows 4|5, m3 = rows 6|7
    ; (low half | high half, words saturated to signed bytes).
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    ; Rows 0-3: low half via movq, high half via movhps.
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%if %%i == 0
    lea           r0, [r0+r1*4]     ; advance destination by four rows
%endif
    ; Rows 4-7.
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%assign %%i %%i+8
%endrep

    ; Zero the whole 128-byte coefficient block.
    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

;-----------------------------------------------------------------------------
; vp3_idct_add: IDCT the block and add the result to the destination pixels,
; saturating to the unsigned byte range.
;-----------------------------------------------------------------------------
cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    lea           r3, [r1*3]        ; r3 = 3 * stride
    pxor          m4, m4            ; m4 = 0: unpack operand now, clear value later
%assign %%i 0
%rep 2
    ; Each iteration handles four rows; %%i is the byte offset of the first of
    ; them inside the coefficient block (0, then 64).
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    ; Zero-extend the 8 destination bytes of each row to words.
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    ; Add the IDCT output with signed saturation.
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    ; Pack back to bytes with unsigned saturation, two rows per register.
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]     ; advance destination by four rows
%endif
%assign %%i %%i+64
%endrep
    ; Zero the whole 128-byte coefficient block (m4 is still zero here).
%assign %%offset 0
%rep 128/mmsize
    mova    [r2+%%offset], m4
%assign %%offset %%offset+mmsize
%endrep
    RET
%endmacro
641cabdff1aSopenharmony_ci
; Instantiate vp3_idct_put / vp3_idct_add with 16-byte XMM registers (SSE2).
INIT_XMM sse2
vp3_idct_funcs
644cabdff1aSopenharmony_ci
; DC_ADD
; Adjust four 8-byte rows in place: add m0, then subtract m1, both with
; unsigned byte saturation.
; In:    r0 = address of the first row
;        r1 = stride in bytes
;        r2 = 3 * stride
;        m0 = per-byte amount to add
;        m1 = per-byte amount to subtract
; Clobb: m2, m3, m4, m5
; Loads, arithmetic and stores of the four rows are interleaved.
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro
663cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; vp3_idct_dc_add: DC-only shortcut. Adds (block[0] + 15) >> 5 to an 8x8 area
; of the destination with unsigned saturation, and clears block[0].
; In:    r0 = destination
;        r1 = destination stride in bytes
;        r2 = pointer to the 16-bit coefficient block (only the first word is
;             read); the register is then reused to hold 3 * stride
; Uses:  r3 (DC value), m0-m5
; NOTE(review): the C prototype is presumably
;   (uint8_t *dest, ptrdiff_t stride, int16_t *block) - confirm against the
;   C-side declaration.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
    movsx         r3, word [r2]     ; r3 = sign-extended block[0]
    mov    word [r2], 0             ; clear block[0]
    lea           r2, [r1*3]        ; r2 = 3 * stride, as DC_ADD expects
    add           r3, 15
    sar           r3, 5             ; r3 = (block[0] + 15) >> 5 (arithmetic)
    movd          m0, r3d
    pshufw        m0, m0, 0x0       ; broadcast the low word to all four words
    pxor          m1, m1
    psubw         m1, m0            ; m1 = -dc in every word
    ; Unsigned saturation clamps negative words to 0, so m0 keeps only a
    ; positive dc and m1 only the magnitude of a negative dc.
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD                          ; rows 0-3
    lea           r0, [r0+r1*4]
    DC_ADD                          ; rows 4-7
    RET
681