1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* MMX/SSE2-optimized H.264 iDCT
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
5cabdff1aSopenharmony_ci;* Copyright (C) 2003-2008 x264 project
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8cabdff1aSopenharmony_ci;*          Loren Merritt <lorenm@u.washington.edu>
9cabdff1aSopenharmony_ci;*          Holger Lubitz <hal@duncan.ol.sub.de>
10cabdff1aSopenharmony_ci;*          Min Chen <chenm001@163.com>
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
15cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
16cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
17cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
20cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
23cabdff1aSopenharmony_ci;*
24cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
25cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
26cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27cabdff1aSopenharmony_ci;*****************************************************************************
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_ciSECTION_RODATA
32cabdff1aSopenharmony_ci
; scan8_mem: byte lookup table. The add4/add8 loops in this file read it as
; [scan8+index] and use the fetched byte as an offset into the nnzc array.
; Every entry is written in the form x + y*8.
33cabdff1aSopenharmony_ciscan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
34cabdff1aSopenharmony_ci           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
35cabdff1aSopenharmony_ci           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
36cabdff1aSopenharmony_ci           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
37cabdff1aSopenharmony_ci           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
38cabdff1aSopenharmony_ci           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
39cabdff1aSopenharmony_ci           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
40cabdff1aSopenharmony_ci           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
41cabdff1aSopenharmony_ci           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
42cabdff1aSopenharmony_ci           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
43cabdff1aSopenharmony_ci           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
44cabdff1aSopenharmony_ci           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
; "scan8" is the name the code uses to address the table.
; With PIC it is a register (picregq) that each user loads with
; "lea picregq, [scan8_mem]", and npicregs reserves one extra GPR for it in
; the cglobal declarations. Without PIC the symbol is referenced directly.
45cabdff1aSopenharmony_ci%ifdef PIC
46cabdff1aSopenharmony_ci%define npicregs 1
47cabdff1aSopenharmony_ci%define scan8 picregq
48cabdff1aSopenharmony_ci%else
49cabdff1aSopenharmony_ci%define npicregs 0
50cabdff1aSopenharmony_ci%define scan8 scan8_mem
51cabdff1aSopenharmony_ci%endif
52cabdff1aSopenharmony_ci
53cabdff1aSopenharmony_cicextern pw_32
54cabdff1aSopenharmony_cicextern pw_1
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ciSECTION .text
57cabdff1aSopenharmony_ci
; IDCT4_ADD: transform one 4x4 coefficient block and add the result to dst.
58cabdff1aSopenharmony_ci; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Sequence: load 4 rows of 8 bytes, IDCT4_1D, transpose, add pw_32 to m0,
; IDCT4_1D again, zero the 32 bytes at %2, then write four rows through
; STORE_DIFFx2. IDCT4_1D, TRANSPOSE4x4W, SBUTTERFLY, MOVHL and STORE_DIFFx2
; are defined outside this file (x86util.asm is included above).
; Clobbers m0-m7. %1 is left advanced by 2*%3.
59cabdff1aSopenharmony_ci%macro IDCT4_ADD 3
60cabdff1aSopenharmony_ci    ; Load dct coeffs
61cabdff1aSopenharmony_ci    movq         m0, [%2]
62cabdff1aSopenharmony_ci    movq         m1, [%2+8]
63cabdff1aSopenharmony_ci    movq         m2, [%2+16]
64cabdff1aSopenharmony_ci    movq         m3, [%2+24]
65cabdff1aSopenharmony_ci
    ; First 1-D pass; m4 and m5 are passed as temporaries.
66cabdff1aSopenharmony_ci    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
67cabdff1aSopenharmony_ci    mova         m6, [pw_32]
    ; Transpose between the passes. The non-MMX branch interleaves by hand
    ; because each 4-word row occupies only the low half of a wider register.
68cabdff1aSopenharmony_ci    %if mmsize == 8
69cabdff1aSopenharmony_ci        TRANSPOSE4x4W 0, 1, 2, 3, 4
70cabdff1aSopenharmony_ci    %else
71cabdff1aSopenharmony_ci        punpcklwd m0, m1
72cabdff1aSopenharmony_ci        punpcklwd m2, m3
73cabdff1aSopenharmony_ci        SBUTTERFLY dq, 0, 2, 4
74cabdff1aSopenharmony_ci        MOVHL m1, m0
75cabdff1aSopenharmony_ci        MOVHL m3, m2
76cabdff1aSopenharmony_ci    %endif
    ; pw_32 is added to m0 only, ahead of the second pass; presumably the
    ; rounding term for the shift by 6 requested from STORE_DIFFx2 below.
77cabdff1aSopenharmony_ci    paddw        m0, m6
78cabdff1aSopenharmony_ci    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    ; m7 = 0: used to clear the coefficient block and then handed to
    ; STORE_DIFFx2.
79cabdff1aSopenharmony_ci    pxor         m7, m7
80cabdff1aSopenharmony_ci    movq    [%2+ 0], m7
81cabdff1aSopenharmony_ci    movq    [%2+ 8], m7
82cabdff1aSopenharmony_ci    movq    [%2+16], m7
83cabdff1aSopenharmony_ci    movq    [%2+24], m7
84cabdff1aSopenharmony_ci
    ; Rows 0-1, then advance dst by two rows and write rows 2-3.
85cabdff1aSopenharmony_ci    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
86cabdff1aSopenharmony_ci    lea          %1, [%1+%3*2]
87cabdff1aSopenharmony_ci    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
88cabdff1aSopenharmony_ci%endmacro
89cabdff1aSopenharmony_ci
; IDCT8_1D: one 1-D pass of the 8-point inverse transform, done in registers.
; In:  m1, m2, m3, m5, m6, m7 are read as inputs. m0 and m4 are overwritten
;      before they are read, so they are not inputs.
;      %1 and %2 are operands loaded late into m2 and m5. IDCT8_1D_FULL passes
;      [base] and [base+64] here, i.e. the two rows it does not preload.
; Out: m0-m7, reordered by the final SWAP (see its comment).
; The instruction order is tuned around register reuse; do not reorder.
90cabdff1aSopenharmony_ci%macro IDCT8_1D 2
    ; --- terms built from m1, m3, m5, m7 ---
91cabdff1aSopenharmony_ci    psraw        m0, m1, 1
92cabdff1aSopenharmony_ci    SWAP 0, 1
93cabdff1aSopenharmony_ci    psraw        m4, m5, 1
94cabdff1aSopenharmony_ci    paddw        m4, m5
95cabdff1aSopenharmony_ci    paddw        m1, m0
96cabdff1aSopenharmony_ci    paddw        m4, m7
97cabdff1aSopenharmony_ci    paddw        m1, m5
98cabdff1aSopenharmony_ci    psubw        m4, m0
99cabdff1aSopenharmony_ci    paddw        m1, m3
100cabdff1aSopenharmony_ci
101cabdff1aSopenharmony_ci    psubw        m0, m3
102cabdff1aSopenharmony_ci    psubw        m5, m3
103cabdff1aSopenharmony_ci    psraw        m3, 1
104cabdff1aSopenharmony_ci    paddw        m0, m7
105cabdff1aSopenharmony_ci    psubw        m5, m7
106cabdff1aSopenharmony_ci    psraw        m7, 1
107cabdff1aSopenharmony_ci    psubw        m0, m3
108cabdff1aSopenharmony_ci    psubw        m5, m7
109cabdff1aSopenharmony_ci
    ; --- combine those terms using >>2 cross terms ---
110cabdff1aSopenharmony_ci    psraw        m7, m1, 2
111cabdff1aSopenharmony_ci    SWAP 7,1
112cabdff1aSopenharmony_ci    psraw        m3, m4, 2
113cabdff1aSopenharmony_ci    paddw        m3, m0
114cabdff1aSopenharmony_ci    psraw        m0, 2
115cabdff1aSopenharmony_ci    paddw        m1, m5
116cabdff1aSopenharmony_ci    psraw        m5, 2
117cabdff1aSopenharmony_ci    psubw        m0, m4
118cabdff1aSopenharmony_ci    psubw        m7, m5
119cabdff1aSopenharmony_ci
    ; --- terms built from m2 and m6 ---
120cabdff1aSopenharmony_ci    psraw        m5, m6, 1
121cabdff1aSopenharmony_ci    SWAP 5,6
122cabdff1aSopenharmony_ci    psraw        m4, m2, 1
123cabdff1aSopenharmony_ci    paddw        m6, m2
124cabdff1aSopenharmony_ci    psubw        m4, m5
125cabdff1aSopenharmony_ci
    ; m2 and m5 are free now, so the two remaining operands are loaded into
    ; them and the sum/difference butterflies produce the eight outputs.
126cabdff1aSopenharmony_ci    mova         m2, %1
127cabdff1aSopenharmony_ci    mova         m5, %2
128cabdff1aSopenharmony_ci    SUMSUB_BA    w, 5, 2
129cabdff1aSopenharmony_ci    SUMSUB_BA    w, 6, 5
130cabdff1aSopenharmony_ci    SUMSUB_BA    w, 4, 2
131cabdff1aSopenharmony_ci    SUMSUB_BA    w, 7, 6
132cabdff1aSopenharmony_ci    SUMSUB_BA    w, 0, 4
133cabdff1aSopenharmony_ci    SUMSUB_BA    w, 3, 2
134cabdff1aSopenharmony_ci    SUMSUB_BA    w, 1, 5
135cabdff1aSopenharmony_ci    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
136cabdff1aSopenharmony_ci%endmacro
137cabdff1aSopenharmony_ci
; IDCT8_1D_FULL: load the register inputs of IDCT8_1D from memory and run it.
; %1 = base address of the coefficients, rows spaced 16 bytes apart.
; The rows at +16, +32, +48, +80, +96 and +112 are preloaded into m1, m2, m3,
; m5, m6 and m7. The rows at +0 and +64 are passed as memory operands, which
; IDCT8_1D loads itself once registers become free.
138cabdff1aSopenharmony_ci%macro IDCT8_1D_FULL 1
139cabdff1aSopenharmony_ci    mova         m7, [%1+112]
140cabdff1aSopenharmony_ci    mova         m6, [%1+ 96]
141cabdff1aSopenharmony_ci    mova         m5, [%1+ 80]
142cabdff1aSopenharmony_ci    mova         m3, [%1+ 48]
143cabdff1aSopenharmony_ci    mova         m2, [%1+ 32]
144cabdff1aSopenharmony_ci    mova         m1, [%1+ 16]
145cabdff1aSopenharmony_ci    IDCT8_1D   [%1], [%1+ 64]
146cabdff1aSopenharmony_ci%endmacro
147cabdff1aSopenharmony_ci
; IDCT8_ADD_MMX_START: first pass of the 8x8 transform with MMX registers.
; Runs IDCT8_1D_FULL on %1, transposes the eight result registers as two
; 4x4 groups and stores them to %2.
148cabdff1aSopenharmony_ci; %1=int16_t *block, %2=int16_t *dstblock
149cabdff1aSopenharmony_ci%macro IDCT8_ADD_MMX_START 2
150cabdff1aSopenharmony_ci    IDCT8_1D_FULL %1
    ; All eight registers are live, so m7 is spilled to [%1] to serve as the
    ; scratch register of the first transpose and reloaded afterwards.
151cabdff1aSopenharmony_ci    mova       [%1], m7
152cabdff1aSopenharmony_ci    TRANSPOSE4x4W 0, 1, 2, 3, 7
153cabdff1aSopenharmony_ci    mova         m7, [%1]
    ; First transposed group goes to byte offsets 0/16/32/48 of %2.
154cabdff1aSopenharmony_ci    mova    [%2   ], m0
155cabdff1aSopenharmony_ci    mova    [%2+16], m1
156cabdff1aSopenharmony_ci    mova    [%2+32], m2
157cabdff1aSopenharmony_ci    mova    [%2+48], m3
    ; m3 has been stored, so it is the scratch register for the second group,
    ; which goes to byte offsets 8/24/40/56 of %2.
158cabdff1aSopenharmony_ci    TRANSPOSE4x4W 4, 5, 6, 7, 3
159cabdff1aSopenharmony_ci    mova    [%2+ 8], m4
160cabdff1aSopenharmony_ci    mova    [%2+24], m5
161cabdff1aSopenharmony_ci    mova    [%2+40], m6
162cabdff1aSopenharmony_ci    mova    [%2+56], m7
163cabdff1aSopenharmony_ci%endmacro
164cabdff1aSopenharmony_ci
; IDCT8_ADD_MMX_END: second pass of the 8x8 transform with MMX registers,
; followed by the add-to-destination for eight rows.
165cabdff1aSopenharmony_ci; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %4 (optional) = address of a 128-byte region that is zero-filled.
; %1 is left advanced by 6*%3. [%2], [%2+16] and [%2+32] are overwritten
; because they are used as spill slots.
166cabdff1aSopenharmony_ci%macro IDCT8_ADD_MMX_END 3-4
167cabdff1aSopenharmony_ci    IDCT8_1D_FULL %2
    ; Spill m5-m7: the STORE_DIFFx2 calls below are given m5/m6 as
    ; temporaries and m7 as the zero register.
168cabdff1aSopenharmony_ci    mova    [%2   ], m5
169cabdff1aSopenharmony_ci    mova    [%2+16], m6
170cabdff1aSopenharmony_ci    mova    [%2+32], m7
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ci    pxor         m7, m7
    ; Only when a fourth argument is supplied: clear 16 quadwords at %4.
173cabdff1aSopenharmony_ci%if %0 == 4
174cabdff1aSopenharmony_ci    movq   [%4+  0], m7
175cabdff1aSopenharmony_ci    movq   [%4+  8], m7
176cabdff1aSopenharmony_ci    movq   [%4+ 16], m7
177cabdff1aSopenharmony_ci    movq   [%4+ 24], m7
178cabdff1aSopenharmony_ci    movq   [%4+ 32], m7
179cabdff1aSopenharmony_ci    movq   [%4+ 40], m7
180cabdff1aSopenharmony_ci    movq   [%4+ 48], m7
181cabdff1aSopenharmony_ci    movq   [%4+ 56], m7
182cabdff1aSopenharmony_ci    movq   [%4+ 64], m7
183cabdff1aSopenharmony_ci    movq   [%4+ 72], m7
184cabdff1aSopenharmony_ci    movq   [%4+ 80], m7
185cabdff1aSopenharmony_ci    movq   [%4+ 88], m7
186cabdff1aSopenharmony_ci    movq   [%4+ 96], m7
187cabdff1aSopenharmony_ci    movq   [%4+104], m7
188cabdff1aSopenharmony_ci    movq   [%4+112], m7
189cabdff1aSopenharmony_ci    movq   [%4+120], m7
190cabdff1aSopenharmony_ci%endif
    ; Rows 0-1 and 2-3 come straight from m0-m3.
191cabdff1aSopenharmony_ci    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
192cabdff1aSopenharmony_ci    lea          %1, [%1+%3*2]
193cabdff1aSopenharmony_ci    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    ; Reload the spilled rows 5, 6 and 7 into the now-free m0, m1 and m2.
194cabdff1aSopenharmony_ci    mova         m0, [%2   ]
195cabdff1aSopenharmony_ci    mova         m1, [%2+16]
196cabdff1aSopenharmony_ci    mova         m2, [%2+32]
197cabdff1aSopenharmony_ci    lea          %1, [%1+%3*2]
    ; Rows 4-5 (m4 and the reloaded m0), then rows 6-7 (reloaded m1, m2).
198cabdff1aSopenharmony_ci    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
199cabdff1aSopenharmony_ci    lea          %1, [%1+%3*2]
200cabdff1aSopenharmony_ci    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
201cabdff1aSopenharmony_ci%endmacro
202cabdff1aSopenharmony_ci
; IDCT8_ADD_SSE: full 8x8 inverse transform of one block plus add to dst,
; with one 8-coefficient row per XMM register.
203cabdff1aSopenharmony_ci; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %4 = scratch GPR; it receives 3*%3.
; Side effects: the 128 bytes at %2 are zeroed and %1 is advanced by 4*%3.
; On x86-64 m8/m9 hold the two extra rows; on x86-32 they are spilled to
; [%2] and [%2+16] instead.
204cabdff1aSopenharmony_ci%macro IDCT8_ADD_SSE 4
205cabdff1aSopenharmony_ci    IDCT8_1D_FULL %2
    ; Transpose between the two passes. The 32-bit variant passes two memory
    ; operands, presumably as scratch space in place of the spare register.
206cabdff1aSopenharmony_ci%if ARCH_X86_64
207cabdff1aSopenharmony_ci    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
208cabdff1aSopenharmony_ci%else
209cabdff1aSopenharmony_ci    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
210cabdff1aSopenharmony_ci%endif
    ; pw_32 is added to row 0 only, ahead of the second pass; presumably the
    ; rounding term for the final shift performed by STORE_DIFF.
211cabdff1aSopenharmony_ci    paddw        m0, [pw_32]
212cabdff1aSopenharmony_ci
    ; IDCT8_1D takes rows 0 and 4 as operands rather than in m0/m4, so they
    ; are moved out of the way (to memory, or to m8/m9). After the pass,
    ; output rows 6 and 7 are parked in the same place to free m6/m7.
213cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
214cabdff1aSopenharmony_ci    mova    [%2   ], m0
215cabdff1aSopenharmony_ci    mova    [%2+16], m4
216cabdff1aSopenharmony_ci    IDCT8_1D   [%2], [%2+ 16]
217cabdff1aSopenharmony_ci    mova    [%2   ], m6
218cabdff1aSopenharmony_ci    mova    [%2+16], m7
219cabdff1aSopenharmony_ci%else
220cabdff1aSopenharmony_ci    SWAP          0, 8
221cabdff1aSopenharmony_ci    SWAP          4, 9
222cabdff1aSopenharmony_ci    IDCT8_1D     m8, m9
223cabdff1aSopenharmony_ci    SWAP          6, 8
224cabdff1aSopenharmony_ci    SWAP          7, 9
225cabdff1aSopenharmony_ci%endif
226cabdff1aSopenharmony_ci
    ; m7 = 0 and m6 is the temporary handed to STORE_DIFF; %4 = 3*stride.
227cabdff1aSopenharmony_ci    pxor         m7, m7
228cabdff1aSopenharmony_ci    lea          %4, [%3*3]
229cabdff1aSopenharmony_ci    STORE_DIFF   m0, m6, m7, [%1     ]
230cabdff1aSopenharmony_ci    STORE_DIFF   m1, m6, m7, [%1+%3  ]
231cabdff1aSopenharmony_ci    STORE_DIFF   m2, m6, m7, [%1+%3*2]
232cabdff1aSopenharmony_ci    STORE_DIFF   m3, m6, m7, [%1+%4  ]
    ; Bring the parked rows 6 and 7 back as m0 and m1.
233cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
234cabdff1aSopenharmony_ci    mova         m0, [%2   ]
235cabdff1aSopenharmony_ci    mova         m1, [%2+16]
236cabdff1aSopenharmony_ci%else
237cabdff1aSopenharmony_ci    SWAP          0, 8
238cabdff1aSopenharmony_ci    SWAP          1, 9
239cabdff1aSopenharmony_ci%endif
    ; Clear the whole coefficient block (8 stores of 16 bytes). This has to
    ; come after the reload above because [%2] and [%2+16] were spill slots.
240cabdff1aSopenharmony_ci    mova   [%2+  0], m7
241cabdff1aSopenharmony_ci    mova   [%2+ 16], m7
242cabdff1aSopenharmony_ci    mova   [%2+ 32], m7
243cabdff1aSopenharmony_ci    mova   [%2+ 48], m7
244cabdff1aSopenharmony_ci    mova   [%2+ 64], m7
245cabdff1aSopenharmony_ci    mova   [%2+ 80], m7
246cabdff1aSopenharmony_ci    mova   [%2+ 96], m7
247cabdff1aSopenharmony_ci    mova   [%2+112], m7
    ; Rows 4-7 of the destination.
248cabdff1aSopenharmony_ci    lea          %1, [%1+%3*4]
249cabdff1aSopenharmony_ci    STORE_DIFF   m4, m6, m7, [%1     ]
250cabdff1aSopenharmony_ci    STORE_DIFF   m5, m6, m7, [%1+%3  ]
251cabdff1aSopenharmony_ci    STORE_DIFF   m0, m6, m7, [%1+%3*2]
252cabdff1aSopenharmony_ci    STORE_DIFF   m1, m6, m7, [%1+%4  ]
253cabdff1aSopenharmony_ci%endmacro
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ciINIT_XMM sse2
256cabdff1aSopenharmony_ci; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
; Thin wrapper: r0=dst, r1=block, r2=stride, r3=scratch for IDCT8_ADD_SSE.
; The macro zeroes the 128-byte block and advances r0.
257cabdff1aSopenharmony_cicglobal h264_idct8_add_8, 3, 4, 10
    ; stride is an int: widen it to pointer size before it is used in
    ; address arithmetic.
258cabdff1aSopenharmony_ci    movsxdifnidn  r2, r2d
259cabdff1aSopenharmony_ci    IDCT8_ADD_SSE r0, r1, r2, r3
260cabdff1aSopenharmony_ci    RET
261cabdff1aSopenharmony_ci
; DC_ADD_MMXEXT_INIT: prepare the constants used by DC_ADD_MMXEXT_OP.
; %1 = GPR. On entry it holds the DC coefficient; on exit it holds 3*%2.
; %2 = stride.
; Out: m0 = bytes of max(dc', 0) and m1 = bytes of max(-dc', 0), where
;      dc' = (dc + 32) >> 6. packuswb clamps the negative one of the pair
;      to 0, so at most one of m0/m1 is non-zero.
262cabdff1aSopenharmony_ci%macro DC_ADD_MMXEXT_INIT 2
263cabdff1aSopenharmony_ci    add          %1, 32
264cabdff1aSopenharmony_ci    sar          %1, 6
265cabdff1aSopenharmony_ci    movd         m0, %1d
    ; %1 has been copied to m0, so the GPR is reused for 3*stride.
266cabdff1aSopenharmony_ci    lea          %1, [%2*3]
    ; Broadcast word 0 to all four words of m0.
267cabdff1aSopenharmony_ci    pshufw       m0, m0, 0
268cabdff1aSopenharmony_ci    pxor         m1, m1
269cabdff1aSopenharmony_ci    psubw        m1, m0
270cabdff1aSopenharmony_ci    packuswb     m0, m0
271cabdff1aSopenharmony_ci    packuswb     m1, m1
272cabdff1aSopenharmony_ci%endmacro
273cabdff1aSopenharmony_ci
; DC_ADD_MMXEXT_OP: apply a DC offset to four destination rows in place.
; %1 = move instruction used for the loads and stores (mova or movq here)
; %2 = dst, %3 = stride, %4 = 3*stride
; In:  m0 = unsigned byte amount to add, m1 = unsigned byte amount to subtract
; Each row becomes sat_sub(sat_add(row, m0), m1) with unsigned saturation.
; Clobbers m2-m5.
274cabdff1aSopenharmony_ci%macro DC_ADD_MMXEXT_OP 4
275cabdff1aSopenharmony_ci    %1           m2, [%2     ]
276cabdff1aSopenharmony_ci    %1           m3, [%2+%3  ]
277cabdff1aSopenharmony_ci    %1           m4, [%2+%3*2]
278cabdff1aSopenharmony_ci    %1           m5, [%2+%4  ]
279cabdff1aSopenharmony_ci    paddusb      m2, m0
280cabdff1aSopenharmony_ci    paddusb      m3, m0
281cabdff1aSopenharmony_ci    paddusb      m4, m0
282cabdff1aSopenharmony_ci    paddusb      m5, m0
283cabdff1aSopenharmony_ci    psubusb      m2, m1
284cabdff1aSopenharmony_ci    psubusb      m3, m1
285cabdff1aSopenharmony_ci    psubusb      m4, m1
286cabdff1aSopenharmony_ci    psubusb      m5, m1
287cabdff1aSopenharmony_ci    %1    [%2     ], m2
288cabdff1aSopenharmony_ci    %1    [%2+%3  ], m3
289cabdff1aSopenharmony_ci    %1    [%2+%3*2], m4
290cabdff1aSopenharmony_ci    %1    [%2+%4  ], m5
291cabdff1aSopenharmony_ci%endmacro
292cabdff1aSopenharmony_ci
; DC-only variant of the 8x8 add: reads block[0], clears it, and applies
; (dc + 32) >> 6 to eight destination rows as two groups of four.
; The two arch variants differ only in how the stride reaches a register.
; NOTE(review): the clear is a dword store, so it zeroes block[1] as well as
; block[0]; the add4 loop below clears a single word. Confirm this is intended.
293cabdff1aSopenharmony_ciINIT_MMX mmxext
294cabdff1aSopenharmony_ci%if ARCH_X86_64
295cabdff1aSopenharmony_ci; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
296cabdff1aSopenharmony_cicglobal h264_idct8_dc_add_8, 3, 4, 0
297cabdff1aSopenharmony_ci    movsxd       r2, r2d
    ; r3 = sign-extended DC. DC_ADD_MMXEXT_INIT turns r3 into 3*stride.
298cabdff1aSopenharmony_ci    movsx        r3, word [r1]
299cabdff1aSopenharmony_ci    mov  dword [r1], 0
300cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_INIT r3, r2
301cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, r0, r2, r3
302cabdff1aSopenharmony_ci    lea          r0, [r0+r2*4]
303cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, r0, r2, r3
304cabdff1aSopenharmony_ci    RET
305cabdff1aSopenharmony_ci%else
306cabdff1aSopenharmony_ci; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
    ; Only two arguments are loaded into registers. Once the block pointer in
    ; r1 is no longer needed, r1 is reused for the stride, read from r2m.
307cabdff1aSopenharmony_cicglobal h264_idct8_dc_add_8, 2, 3, 0
308cabdff1aSopenharmony_ci    movsx        r2, word [r1]
309cabdff1aSopenharmony_ci    mov  dword [r1], 0
310cabdff1aSopenharmony_ci    mov          r1, r2m
311cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_INIT r2, r1
312cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, r0, r1, r2
313cabdff1aSopenharmony_ci    lea          r0, [r0+r1*4]
314cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, r0, r1, r2
315cabdff1aSopenharmony_ci    RET
316cabdff1aSopenharmony_ci%endif
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ciINIT_XMM sse2
319cabdff1aSopenharmony_ci; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
320cabdff1aSopenharmony_ci;                                int16_t *block, int stride,
321cabdff1aSopenharmony_ci;                                const uint8_t nnzc[6 * 8])
; Loops over four 8x8 blocks (r5 = 0, 4, 8, 12; r2 advances 128 bytes each).
; For each block, nnz = nnzc[scan8[r5]]:
;   nnz == 0                   -> block skipped
;   nnz == 1 and block[0] != 0 -> DC-only add (MMX path)
;   otherwise                  -> full IDCT8_ADD_SSE
; Registers: r0=dst, r1=block_offset, r2=block, r3=stride, r4=nnzc,
;            r5=block index, r6=scratch, dst2=destination of current block.
322cabdff1aSopenharmony_cicglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
323cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
324cabdff1aSopenharmony_ci    xor          r5, r5
325cabdff1aSopenharmony_ci%ifdef PIC
326cabdff1aSopenharmony_ci    lea     picregq, [scan8_mem]
327cabdff1aSopenharmony_ci%endif
328cabdff1aSopenharmony_ci.nextblock:
    ; r6 = nnzc[scan8[r5]]
329cabdff1aSopenharmony_ci    movzx        r6, byte [scan8+r5]
330cabdff1aSopenharmony_ci    movzx        r6, byte [r4+r6]
331cabdff1aSopenharmony_ci    test         r6, r6
332cabdff1aSopenharmony_ci    jz .skipblock
333cabdff1aSopenharmony_ci    cmp          r6, 1
334cabdff1aSopenharmony_ci    jnz .no_dc
    ; nnz == 1: take the DC shortcut only if block[0] is non-zero.
335cabdff1aSopenharmony_ci    movsx        r6, word [r2]
336cabdff1aSopenharmony_ci    test         r6, r6
337cabdff1aSopenharmony_ci    jz .no_dc
    ; The DC path is assembled with MMX register names.
338cabdff1aSopenharmony_ciINIT_MMX cpuname
339cabdff1aSopenharmony_ci    mov   word [r2], 0
    ; After this macro r6 holds 3*stride.
340cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_INIT r6, r3
    ; x86-32: dst2 is aliased to r1, so block_offset is overwritten here and
    ; reloaded from its stack slot after the block is done.
341cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
342cabdff1aSopenharmony_ci%define dst2q r1
343cabdff1aSopenharmony_ci%define dst2d r1d
344cabdff1aSopenharmony_ci%endif
    ; dst2 = dst + block_offset[r5]
345cabdff1aSopenharmony_ci    mov       dst2d, dword [r1+r5*4]
346cabdff1aSopenharmony_ci    add       dst2q, r0
347cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
348cabdff1aSopenharmony_ci    lea       dst2q, [dst2q+r3*4]
349cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
350cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
351cabdff1aSopenharmony_ci    mov          r1, r1m
352cabdff1aSopenharmony_ci%endif
    ; Loop tail for the DC path; the same tail is repeated at .skipblock.
353cabdff1aSopenharmony_ci    add          r5, 4
354cabdff1aSopenharmony_ci    add          r2, 128
355cabdff1aSopenharmony_ci    cmp          r5, 16
356cabdff1aSopenharmony_ci    jl .nextblock
357cabdff1aSopenharmony_ci    REP_RET
358cabdff1aSopenharmony_ci.no_dc:
    ; Full transform path; switch back to XMM register names.
359cabdff1aSopenharmony_ciINIT_XMM cpuname
360cabdff1aSopenharmony_ci    mov       dst2d, dword [r1+r5*4]
361cabdff1aSopenharmony_ci    add       dst2q, r0
362cabdff1aSopenharmony_ci    IDCT8_ADD_SSE dst2q, r2, r3, r6
363cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
364cabdff1aSopenharmony_ci    mov          r1, r1m
365cabdff1aSopenharmony_ci%endif
366cabdff1aSopenharmony_ci.skipblock:
367cabdff1aSopenharmony_ci    add          r5, 4
368cabdff1aSopenharmony_ci    add          r2, 128
369cabdff1aSopenharmony_ci    cmp          r5, 16
370cabdff1aSopenharmony_ci    jl .nextblock
371cabdff1aSopenharmony_ci    REP_RET
372cabdff1aSopenharmony_ci
373cabdff1aSopenharmony_ciINIT_MMX mmx
; Internal helper (plain label, not cglobal) called from
; h264_idct_add8_422_8. Processes 4x4 blocks starting at index r5 and stops
; when r5 reaches the next multiple of 4.
; In:  r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc, r5 = index.
;      The pointer to the destination plane pointer is in dst2q on x86-64
;      and in a stack slot on x86-32.
; Out: r5 advanced by the number of blocks visited, r2 by 32 bytes per block.
; Clobbers r0, r6 and the registers used by IDCT4_ADD.
374cabdff1aSopenharmony_cih264_idct_add8_mmx_plane:
375cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
376cabdff1aSopenharmony_ci.nextblock:
    ; r6 = nnzc[scan8[r5]] | block[0]; the block is skipped only when both
    ; are zero.
377cabdff1aSopenharmony_ci    movzx        r6, byte [scan8+r5]
378cabdff1aSopenharmony_ci    movzx        r6, byte [r4+r6]
379cabdff1aSopenharmony_ci    or          r6w, word [r2]
380cabdff1aSopenharmony_ci    test         r6, r6
381cabdff1aSopenharmony_ci    jz .skipblock
    ; r0 = *dest_ptr + block_offset[r5]
382cabdff1aSopenharmony_ci%if ARCH_X86_64
383cabdff1aSopenharmony_ci    mov         r0d, dword [r1+r5*4]
384cabdff1aSopenharmony_ci    add          r0, [dst2q]
385cabdff1aSopenharmony_ci%else
386cabdff1aSopenharmony_ci    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
387cabdff1aSopenharmony_ci    mov          r0, [r0]
388cabdff1aSopenharmony_ci    add          r0, dword [r1+r5*4]
389cabdff1aSopenharmony_ci%endif
390cabdff1aSopenharmony_ci    IDCT4_ADD    r0, r2, r3
391cabdff1aSopenharmony_ci.skipblock:
392cabdff1aSopenharmony_ci    inc          r5
393cabdff1aSopenharmony_ci    add          r2, 32
    ; Continue until the low two bits of r5 are zero again.
394cabdff1aSopenharmony_ci    test         r5, 3
395cabdff1aSopenharmony_ci    jnz .nextblock
396cabdff1aSopenharmony_ci    rep ret
397cabdff1aSopenharmony_ci
; Runs h264_idct_add8_mmx_plane four times: twice with the first destination
; pointer and twice with the second. Each call handles 4 blocks and leaves
; r5 advanced by 4, so the block indices visited are 16-19, 24-27, 32-35
; and 40-43.
; The first argument is treated as an array of plane pointers: the helper
; dereferences it, and it is stepped by gprsize between the two planes.
398cabdff1aSopenharmony_cicglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
399cabdff1aSopenharmony_ci; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
400cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
401cabdff1aSopenharmony_ci%ifdef PIC
402cabdff1aSopenharmony_ci    lea     picregq, [scan8_mem]
403cabdff1aSopenharmony_ci%endif
    ; x86-64: keep the pointer array in dst2q because the helper clobbers r0.
404cabdff1aSopenharmony_ci%if ARCH_X86_64
405cabdff1aSopenharmony_ci    mov       dst2q, r0
406cabdff1aSopenharmony_ci%endif
407cabdff1aSopenharmony_ci
408cabdff1aSopenharmony_ci    mov          r5, 16  ; i
409cabdff1aSopenharmony_ci    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
410cabdff1aSopenharmony_ci
    ; First plane: blocks 16-19, then 24-27 after skipping four indices.
    ; r2 is not adjusted between the two calls, so the coefficient pointer
    ; simply continues.
411cabdff1aSopenharmony_ci    call         h264_idct_add8_mmx_plane
412cabdff1aSopenharmony_ci    add r5, 4
413cabdff1aSopenharmony_ci    call         h264_idct_add8_mmx_plane
414cabdff1aSopenharmony_ci
    ; Step to the next entry of the destination pointer array.
415cabdff1aSopenharmony_ci%if ARCH_X86_64
416cabdff1aSopenharmony_ci    add       dst2q, gprsize ; dest[1]
417cabdff1aSopenharmony_ci%else
418cabdff1aSopenharmony_ci    add        r0mp, gprsize
419cabdff1aSopenharmony_ci%endif
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci    add r5, 4   ; set to 32
422cabdff1aSopenharmony_ci    add r2, 256 ; set to i * 16 * sizeof(dctcoef)
423cabdff1aSopenharmony_ci
    ; Second plane: blocks 32-35, then 40-43.
424cabdff1aSopenharmony_ci    call         h264_idct_add8_mmx_plane
425cabdff1aSopenharmony_ci    add r5, 4
426cabdff1aSopenharmony_ci    call         h264_idct_add8_mmx_plane
427cabdff1aSopenharmony_ci
428cabdff1aSopenharmony_ci    RET ; TODO: check rep ret after a function call
429cabdff1aSopenharmony_ci
; Internal helper (plain label): DC-only add for two 4x4 blocks at once.
; The DC coefficients are read from [r2] and [r2+32] and cleared there.
; Four destination rows of 8 bytes are updated: the low 4 bytes of each row
; use the DC from [r2] and the high 4 bytes use the DC from [r2+32] (see the
; lane diagrams on the right).
430cabdff1aSopenharmony_ci; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
431cabdff1aSopenharmony_cih264_idct_dc_add8_mmxext:
432cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
433cabdff1aSopenharmony_ci    movd         m0, [r2   ]          ;  0 0 X D
434cabdff1aSopenharmony_ci    mov word [r2+ 0], 0
435cabdff1aSopenharmony_ci    punpcklwd    m0, [r2+32]          ;  x X d D
436cabdff1aSopenharmony_ci    mov word [r2+32], 0
    ; (dc + 32) >> 6 for both DC values; the add is saturating.
437cabdff1aSopenharmony_ci    paddsw       m0, [pw_32]
438cabdff1aSopenharmony_ci    psraw        m0, 6
439cabdff1aSopenharmony_ci    punpcklwd    m0, m0               ;  d d D D
440cabdff1aSopenharmony_ci    pxor         m1, m1               ;  0 0 0 0
441cabdff1aSopenharmony_ci    psubw        m1, m0               ; -d-d-D-D
    ; Unsigned-saturating pack: negative lanes become 0, so m0 ends up with
    ; the amounts to add and m1 with the amounts to subtract.
442cabdff1aSopenharmony_ci    packuswb     m0, m1               ; -d-d-D-D d d D D
443cabdff1aSopenharmony_ci    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
444cabdff1aSopenharmony_ci    punpcklwd    m0, m0               ;  d d d d D D D D
    ; r6 = 3*stride, as required by DC_ADD_MMXEXT_OP.
445cabdff1aSopenharmony_ci    lea          r6, [r3*3]
446cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP movq, r0, r3, r6
447cabdff1aSopenharmony_ci    ret
448cabdff1aSopenharmony_ci
449cabdff1aSopenharmony_ciALIGN 16
450cabdff1aSopenharmony_ciINIT_XMM sse2
; Internal helper (plain label): transforms two 4x4 blocks together and adds
; them to four destination rows. The block at r2 fills the low half of each
; XMM register and the block at r2+32 fills the high half.
; The 64 coefficient bytes at r2 are zeroed afterwards.
; Clobbers m0-m5 and m7, in addition to r0.
451cabdff1aSopenharmony_ci; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
452cabdff1aSopenharmony_cih264_add8x4_idct_sse2:
453cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
    ; Low halves: rows of the first block.
454cabdff1aSopenharmony_ci    movq   m0, [r2+ 0]
455cabdff1aSopenharmony_ci    movq   m1, [r2+ 8]
456cabdff1aSopenharmony_ci    movq   m2, [r2+16]
457cabdff1aSopenharmony_ci    movq   m3, [r2+24]
    ; High halves: rows of the second block.
458cabdff1aSopenharmony_ci    movhps m0, [r2+32]
459cabdff1aSopenharmony_ci    movhps m1, [r2+40]
460cabdff1aSopenharmony_ci    movhps m2, [r2+48]
461cabdff1aSopenharmony_ci    movhps m3, [r2+56]
    ; Two 1-D passes with a transpose in between; pw_32 is added to m0
    ; before the second pass.
462cabdff1aSopenharmony_ci    IDCT4_1D w,0,1,2,3,4,5
463cabdff1aSopenharmony_ci    TRANSPOSE2x4x4W 0,1,2,3,4
464cabdff1aSopenharmony_ci    paddw m0, [pw_32]
465cabdff1aSopenharmony_ci    IDCT4_1D w,0,1,2,3,4,5
    ; m7 = 0: clears both coefficient blocks and is then passed to
    ; STORE_DIFFx2.
466cabdff1aSopenharmony_ci    pxor  m7, m7
467cabdff1aSopenharmony_ci    mova [r2+ 0], m7
468cabdff1aSopenharmony_ci    mova [r2+16], m7
469cabdff1aSopenharmony_ci    mova [r2+32], m7
470cabdff1aSopenharmony_ci    mova [r2+48], m7
471cabdff1aSopenharmony_ci    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
472cabdff1aSopenharmony_ci    lea   r0, [r0+r3*2]
473cabdff1aSopenharmony_ci    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
474cabdff1aSopenharmony_ci    ret
475cabdff1aSopenharmony_ci
; add16_sse2_cycle: one unrolled step of h264_idct_add16_8, covering a pair
; of 4x4 blocks.
; %1 = step number (0-7), %2 = byte offset into nnzc (r4)
; A 16-bit load fetches two adjacent nnzc bytes at once; if both are zero the
; pair is skipped. Otherwise r0 = dst + block_offset[2*%1] and
; h264_add8x4_idct_sse2 is called.
; The dst base is in r5 on x86-64 and is read from the stack on x86-32,
; because r0 is reused as scratch.
; r2 advances by 64 bytes (two blocks) after every step except the last.
476cabdff1aSopenharmony_ci%macro add16_sse2_cycle 2
477cabdff1aSopenharmony_ci    movzx       r0, word [r4+%2]
478cabdff1aSopenharmony_ci    test        r0, r0
479cabdff1aSopenharmony_ci    jz .cycle%1end
480cabdff1aSopenharmony_ci    mov        r0d, dword [r1+%1*8]
481cabdff1aSopenharmony_ci%if ARCH_X86_64
482cabdff1aSopenharmony_ci    add         r0, r5
483cabdff1aSopenharmony_ci%else
484cabdff1aSopenharmony_ci    add         r0, r0m
485cabdff1aSopenharmony_ci%endif
486cabdff1aSopenharmony_ci    call        h264_add8x4_idct_sse2
487cabdff1aSopenharmony_ci.cycle%1end:
488cabdff1aSopenharmony_ci%if %1 < 7
489cabdff1aSopenharmony_ci    add         r2, 64
490cabdff1aSopenharmony_ci%endif
491cabdff1aSopenharmony_ci%endmacro
492cabdff1aSopenharmony_ci
493cabdff1aSopenharmony_ci; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
494cabdff1aSopenharmony_ci;                                int16_t *block, int stride,
495cabdff1aSopenharmony_ci;                                const uint8_t nnzc[6 * 8])
; Eight unrolled steps, each handling a pair of 4x4 blocks.
; r0=dst (scratch inside the steps), r1=block_offset, r2=block, r3=stride,
; r4=nnzc. On x86-64, r5 keeps a copy of dst.
; The second argument of each step is the nnzc byte offset. The values equal
; the even-indexed entries 0, 2, ..., 14 of scan8_mem (0xc = 4+1*8,
; 0x14 = 4+2*8, 0xe = 6+1*8, ...).
496cabdff1aSopenharmony_cicglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
497cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
498cabdff1aSopenharmony_ci%if ARCH_X86_64
499cabdff1aSopenharmony_ci    mov         r5, r0
500cabdff1aSopenharmony_ci%endif
501cabdff1aSopenharmony_ci    ; unrolling of the loop leads to an average performance gain of
502cabdff1aSopenharmony_ci    ; 20-25%
503cabdff1aSopenharmony_ci    add16_sse2_cycle 0, 0xc
504cabdff1aSopenharmony_ci    add16_sse2_cycle 1, 0x14
505cabdff1aSopenharmony_ci    add16_sse2_cycle 2, 0xe
506cabdff1aSopenharmony_ci    add16_sse2_cycle 3, 0x16
507cabdff1aSopenharmony_ci    add16_sse2_cycle 4, 0x1c
508cabdff1aSopenharmony_ci    add16_sse2_cycle 5, 0x24
509cabdff1aSopenharmony_ci    add16_sse2_cycle 6, 0x1e
510cabdff1aSopenharmony_ci    add16_sse2_cycle 7, 0x26
511cabdff1aSopenharmony_ciREP_RET
512cabdff1aSopenharmony_ci
; add16intra_sse2_cycle: one unrolled step of h264_idct_add16intra_8.
; %1 = step number (0-7), %2 = byte offset into nnzc (r4)
; Same as add16_sse2_cycle, with a fallback: when both nnzc bytes are zero,
; the DC coefficients of the two blocks ([r2] and [r2+32]) are checked, and
; if either is non-zero h264_idct_dc_add8_mmxext is called instead.
; The dst base is in r7 on x86-64 and is read from the stack on x86-32.
513cabdff1aSopenharmony_ci%macro add16intra_sse2_cycle 2
514cabdff1aSopenharmony_ci    movzx       r0, word [r4+%2]
515cabdff1aSopenharmony_ci    test        r0, r0
516cabdff1aSopenharmony_ci    jz .try%1dc
    ; Full path: r0 = dst + block_offset[2*%1]
517cabdff1aSopenharmony_ci    mov        r0d, dword [r1+%1*8]
518cabdff1aSopenharmony_ci%if ARCH_X86_64
519cabdff1aSopenharmony_ci    add         r0, r7
520cabdff1aSopenharmony_ci%else
521cabdff1aSopenharmony_ci    add         r0, r0m
522cabdff1aSopenharmony_ci%endif
523cabdff1aSopenharmony_ci    call        h264_add8x4_idct_sse2
524cabdff1aSopenharmony_ci    jmp .cycle%1end
525cabdff1aSopenharmony_ci.try%1dc:
    ; DC-only path: the jz tests the flags set by the 16-bit OR of the two
    ; DC words.
526cabdff1aSopenharmony_ci    movsx       r0, word [r2   ]
527cabdff1aSopenharmony_ci    or         r0w, word [r2+32]
528cabdff1aSopenharmony_ci    jz .cycle%1end
529cabdff1aSopenharmony_ci    mov        r0d, dword [r1+%1*8]
530cabdff1aSopenharmony_ci%if ARCH_X86_64
531cabdff1aSopenharmony_ci    add         r0, r7
532cabdff1aSopenharmony_ci%else
533cabdff1aSopenharmony_ci    add         r0, r0m
534cabdff1aSopenharmony_ci%endif
535cabdff1aSopenharmony_ci    call        h264_idct_dc_add8_mmxext
536cabdff1aSopenharmony_ci.cycle%1end:
    ; Advance past the two blocks just handled, except after the last step.
537cabdff1aSopenharmony_ci%if %1 < 7
538cabdff1aSopenharmony_ci    add         r2, 64
539cabdff1aSopenharmony_ci%endif
540cabdff1aSopenharmony_ci%endmacro
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
543cabdff1aSopenharmony_ci;                                     int16_t *block, int stride,
544cabdff1aSopenharmony_ci;                                     const uint8_t nnzc[6 * 8])
; Eight unrolled steps over pairs of 4x4 blocks, each with a DC-only fallback
; (see add16intra_sse2_cycle).
; r0=dst (scratch inside the steps), r1=block_offset, r2=block, r3=stride,
; r4=nnzc. r6 is clobbered by h264_idct_dc_add8_mmxext. On x86-64, r7 keeps
; a copy of dst.
545cabdff1aSopenharmony_cicglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
546cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
547cabdff1aSopenharmony_ci%if ARCH_X86_64
548cabdff1aSopenharmony_ci    mov         r7, r0
549cabdff1aSopenharmony_ci%endif
550cabdff1aSopenharmony_ci    add16intra_sse2_cycle 0, 0xc
551cabdff1aSopenharmony_ci    add16intra_sse2_cycle 1, 0x14
552cabdff1aSopenharmony_ci    add16intra_sse2_cycle 2, 0xe
553cabdff1aSopenharmony_ci    add16intra_sse2_cycle 3, 0x16
554cabdff1aSopenharmony_ci    add16intra_sse2_cycle 4, 0x1c
555cabdff1aSopenharmony_ci    add16intra_sse2_cycle 5, 0x24
556cabdff1aSopenharmony_ci    add16intra_sse2_cycle 6, 0x1e
557cabdff1aSopenharmony_ci    add16intra_sse2_cycle 7, 0x26
558cabdff1aSopenharmony_ciREP_RET
559cabdff1aSopenharmony_ci
; add8_sse2_cycle: one unrolled step of h264_idct_add8_8.
; %1 = step number (0-3), %2 = byte offset into nnzc (r4)
; Like add16intra_sse2_cycle, except that:
;  - the destination base is loaded through a pointer ([r7] on x86-64, the
;    first stack argument dereferenced on x86-32), since dest is uint8_t **;
;  - the block_offset byte offset is (%1&1)*8 + 64*(1+(%1>>1));
;  - r2 advances by 64 after steps 0 and 2, by 384+64 after step 1, and is
;    left unchanged after step 3.
560cabdff1aSopenharmony_ci%macro add8_sse2_cycle 2
561cabdff1aSopenharmony_ci    movzx       r0, word [r4+%2]
562cabdff1aSopenharmony_ci    test        r0, r0
563cabdff1aSopenharmony_ci    jz .try%1dc
    ; Full path: r0 = *dest + block_offset entry.
564cabdff1aSopenharmony_ci%if ARCH_X86_64
565cabdff1aSopenharmony_ci    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
566cabdff1aSopenharmony_ci    add         r0, [r7]
567cabdff1aSopenharmony_ci%else
568cabdff1aSopenharmony_ci    mov         r0, r0m
569cabdff1aSopenharmony_ci    mov         r0, [r0]
570cabdff1aSopenharmony_ci    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
571cabdff1aSopenharmony_ci%endif
572cabdff1aSopenharmony_ci    call        h264_add8x4_idct_sse2
573cabdff1aSopenharmony_ci    jmp .cycle%1end
574cabdff1aSopenharmony_ci.try%1dc:
    ; DC-only path: taken when either block of the pair has a non-zero DC.
575cabdff1aSopenharmony_ci    movsx       r0, word [r2   ]
576cabdff1aSopenharmony_ci    or         r0w, word [r2+32]
577cabdff1aSopenharmony_ci    jz .cycle%1end
578cabdff1aSopenharmony_ci%if ARCH_X86_64
579cabdff1aSopenharmony_ci    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
580cabdff1aSopenharmony_ci    add         r0, [r7]
581cabdff1aSopenharmony_ci%else
582cabdff1aSopenharmony_ci    mov         r0, r0m
583cabdff1aSopenharmony_ci    mov         r0, [r0]
584cabdff1aSopenharmony_ci    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
585cabdff1aSopenharmony_ci%endif
586cabdff1aSopenharmony_ci    call        h264_idct_dc_add8_mmxext
587cabdff1aSopenharmony_ci.cycle%1end:
588cabdff1aSopenharmony_ci%if %1 == 1
589cabdff1aSopenharmony_ci    add         r2, 384+64
590cabdff1aSopenharmony_ci%elif %1 < 3
591cabdff1aSopenharmony_ci    add         r2, 64
592cabdff1aSopenharmony_ci%endif
593cabdff1aSopenharmony_ci%endmacro
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_ci; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
596cabdff1aSopenharmony_ci;                               int16_t *block, int stride,
597cabdff1aSopenharmony_ci;                               const uint8_t nnzc[6 * 8])
; Four unrolled steps over pairs of 4x4 blocks: steps 0-1 use dest[0] and
; steps 2-3 use dest[1]. The coefficient pointer starts 512 bytes into block.
; The nnzc offsets are 0x34 = 4+6*8, 0x3c = 4+7*8, 0x5c = 4+11*8 and
; 0x64 = 4+12*8.
; On x86-64, r7 holds the dest array pointer; on x86-32 the stack copy of the
; first argument is stepped instead.
598cabdff1aSopenharmony_cicglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
599cabdff1aSopenharmony_ci    movsxdifnidn r3, r3d
600cabdff1aSopenharmony_ci    add          r2, 512
601cabdff1aSopenharmony_ci%if ARCH_X86_64
602cabdff1aSopenharmony_ci    mov          r7, r0
603cabdff1aSopenharmony_ci%endif
604cabdff1aSopenharmony_ci    add8_sse2_cycle 0, 0x34
605cabdff1aSopenharmony_ci    add8_sse2_cycle 1, 0x3c
    ; Move on to the second destination pointer.
606cabdff1aSopenharmony_ci%if ARCH_X86_64
607cabdff1aSopenharmony_ci    add          r7, gprsize
608cabdff1aSopenharmony_ci%else
609cabdff1aSopenharmony_ci    add        r0mp, gprsize
610cabdff1aSopenharmony_ci%endif
611cabdff1aSopenharmony_ci    add8_sse2_cycle 2, 0x5c
612cabdff1aSopenharmony_ci    add8_sse2_cycle 3, 0x64
613cabdff1aSopenharmony_ciREP_RET
614cabdff1aSopenharmony_ci
615cabdff1aSopenharmony_ci;void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul)
616cabdff1aSopenharmony_ci
; WALSH4_1D a, b, c, d, tmp
; One transform pass over four word-vector registers, given as register
; numbers (a..d = data, tmp = scratch). Two SUMSUB_BADC word butterflies are
; applied with different operand pairings, then SWAP renames the registers.
; SUMSUB_BADC and SWAP are defined outside this chunk; presumably the two
; stages together form a 4-point Walsh-Hadamard transform -- TODO confirm.
617cabdff1aSopenharmony_ci%macro WALSH4_1D 5
618cabdff1aSopenharmony_ci    SUMSUB_BADC w, %4, %3, %2, %1, %5
619cabdff1aSopenharmony_ci    SUMSUB_BADC w, %4, %2, %3, %1, %5
620cabdff1aSopenharmony_ci    SWAP %1, %4, %3
621cabdff1aSopenharmony_ci%endmacro
622cabdff1aSopenharmony_ci
; DEQUANT -- scale 16-bit coefficients by the multiplier held in t3d.
;
; Every coefficient word is interleaved with a word taken from pw_1 and fed to
; pmaddwd against t3d replicated into each dword, so each 32-bit lane becomes
;     coef * low16(t3d) + pw_1_word * high16(t3d)
; (pw_1 is defined outside this chunk; presumably packed words of 1, which
; makes high16(t3d) an additive rounding term). Lanes are then shifted right
; arithmetically and packed back to words with signed saturation.
;
; SSE2 form:  DEQUANT shift
;   in : m0..m3 (four words each), t3d; shift = %1 (immediate or XMM register)
;   out: xmm0 = results of m0 followed by m1, xmm2 = results of m2 followed by m3
;   clobbers xmm1, xmm3, xmm4, xmm5
; MMX form:   DEQUANT regA, regB, shift
;   in : %1, %2 (four words each), t3d; shift = %3
;   out: %1 and %2 rewritten in place; clobbers m4, m5, m7
623cabdff1aSopenharmony_ci%macro DEQUANT 1-3
624cabdff1aSopenharmony_ci%if cpuflag(sse2)
625cabdff1aSopenharmony_ci    movd      xmm4, t3d
626cabdff1aSopenharmony_ci    movq      xmm5, [pw_1]
627cabdff1aSopenharmony_ci    pshufd    xmm4, xmm4, 0         ; replicate t3d into all four dwords
628cabdff1aSopenharmony_ci    movq2dq   xmm0, m0              ; move the MMX rows into XMM registers
629cabdff1aSopenharmony_ci    movq2dq   xmm1, m1
630cabdff1aSopenharmony_ci    movq2dq   xmm2, m2
631cabdff1aSopenharmony_ci    movq2dq   xmm3, m3
632cabdff1aSopenharmony_ci    punpcklwd xmm0, xmm5            ; (coef, pw_1 word) pairs, one pair per dword
633cabdff1aSopenharmony_ci    punpcklwd xmm1, xmm5
634cabdff1aSopenharmony_ci    punpcklwd xmm2, xmm5
635cabdff1aSopenharmony_ci    punpcklwd xmm3, xmm5
636cabdff1aSopenharmony_ci    pmaddwd   xmm0, xmm4            ; multiply-accumulate each pair against t3d
637cabdff1aSopenharmony_ci    pmaddwd   xmm1, xmm4
638cabdff1aSopenharmony_ci    pmaddwd   xmm2, xmm4
639cabdff1aSopenharmony_ci    pmaddwd   xmm3, xmm4
640cabdff1aSopenharmony_ci    psrad     xmm0, %1
641cabdff1aSopenharmony_ci    psrad     xmm1, %1
642cabdff1aSopenharmony_ci    psrad     xmm2, %1
643cabdff1aSopenharmony_ci    psrad     xmm3, %1
644cabdff1aSopenharmony_ci    packssdw  xmm0, xmm1            ; back to words, signed saturation
645cabdff1aSopenharmony_ci    packssdw  xmm2, xmm3
646cabdff1aSopenharmony_ci%else
647cabdff1aSopenharmony_ci    mova        m7, [pw_1]
648cabdff1aSopenharmony_ci    mova        m4, %1              ; m4/m5 take the high two words of %1/%2
649cabdff1aSopenharmony_ci    punpcklwd   %1, m7
650cabdff1aSopenharmony_ci    punpckhwd   m4, m7
651cabdff1aSopenharmony_ci    mova        m5, %2
652cabdff1aSopenharmony_ci    punpcklwd   %2, m7
653cabdff1aSopenharmony_ci    punpckhwd   m5, m7
654cabdff1aSopenharmony_ci    movd        m7, t3d             ; pw_1 is no longer needed; reuse m7 for the multiplier
655cabdff1aSopenharmony_ci    punpckldq   m7, m7              ; replicate t3d into both dwords
656cabdff1aSopenharmony_ci    pmaddwd     %1, m7
657cabdff1aSopenharmony_ci    pmaddwd     %2, m7
658cabdff1aSopenharmony_ci    pmaddwd     m4, m7
659cabdff1aSopenharmony_ci    pmaddwd     m5, m7
660cabdff1aSopenharmony_ci    psrad       %1, %3
661cabdff1aSopenharmony_ci    psrad       %2, %3
662cabdff1aSopenharmony_ci    psrad       m4, %3
663cabdff1aSopenharmony_ci    psrad       m5, %3
664cabdff1aSopenharmony_ci    packssdw    %1, m4              ; low and high halves rejoined in original word order
665cabdff1aSopenharmony_ci    packssdw    %2, m5
666cabdff1aSopenharmony_ci%endif
667cabdff1aSopenharmony_ci%endmacro
668cabdff1aSopenharmony_ci
; STORE_WORDS reg, i0, i1, i2, i3 [, i4, i5, i6, i7]
; Scatters the 16-bit elements of reg to memory: word k is written to
; [t2 + ik*32], i.e. the destinations are selected by index and lie 32 bytes
; apart (presumably one 16-coefficient int16 block per slot -- TODO confirm
; against the output layout).
; The cpuflag(sse) branch stores eight words (needs all nine arguments); the
; other branch stores four words.
; reg is destroyed by the shifts; t0 and t1 are clobbered.
669cabdff1aSopenharmony_ci%macro STORE_WORDS 5-9
670cabdff1aSopenharmony_ci%if cpuflag(sse)
671cabdff1aSopenharmony_ci    movd  t0d, %1                   ; t0 = words 0,1
672cabdff1aSopenharmony_ci    psrldq  %1, 4
673cabdff1aSopenharmony_ci    movd  t1d, %1                   ; t1 = words 2,3
674cabdff1aSopenharmony_ci    psrldq  %1, 4
675cabdff1aSopenharmony_ci    mov [t2+%2*32], t0w             ; word 0
676cabdff1aSopenharmony_ci    mov [t2+%4*32], t1w             ; word 2
677cabdff1aSopenharmony_ci    shr   t0d, 16
678cabdff1aSopenharmony_ci    shr   t1d, 16
679cabdff1aSopenharmony_ci    mov [t2+%3*32], t0w             ; word 1
680cabdff1aSopenharmony_ci    mov [t2+%5*32], t1w             ; word 3
681cabdff1aSopenharmony_ci    movd  t0d, %1                   ; t0 = words 4,5
682cabdff1aSopenharmony_ci    psrldq  %1, 4
683cabdff1aSopenharmony_ci    movd  t1d, %1                   ; t1 = words 6,7
684cabdff1aSopenharmony_ci    mov [t2+%6*32], t0w             ; word 4
685cabdff1aSopenharmony_ci    mov [t2+%8*32], t1w             ; word 6
686cabdff1aSopenharmony_ci    shr   t0d, 16
687cabdff1aSopenharmony_ci    shr   t1d, 16
688cabdff1aSopenharmony_ci    mov [t2+%7*32], t0w             ; word 5
689cabdff1aSopenharmony_ci    mov [t2+%9*32], t1w             ; word 7
690cabdff1aSopenharmony_ci%else
691cabdff1aSopenharmony_ci    movd  t0d, %1                   ; t0 = words 0,1
692cabdff1aSopenharmony_ci    psrlq  %1, 32
693cabdff1aSopenharmony_ci    movd  t1d, %1                   ; t1 = words 2,3
694cabdff1aSopenharmony_ci    mov [t2+%2*32], t0w             ; word 0
695cabdff1aSopenharmony_ci    mov [t2+%4*32], t1w             ; word 2
696cabdff1aSopenharmony_ci    shr   t0d, 16
697cabdff1aSopenharmony_ci    shr   t1d, 16
698cabdff1aSopenharmony_ci    mov [t2+%3*32], t0w             ; word 1
699cabdff1aSopenharmony_ci    mov [t2+%5*32], t1w             ; word 3
700cabdff1aSopenharmony_ci%endif
701cabdff1aSopenharmony_ci%endmacro
702cabdff1aSopenharmony_ci
; DEQUANT_STORE shift
; Scales the sixteen words held in m0..m3 with DEQUANT (right shift = %1) and
; scatters them through STORE_WORDS to slots 0..15 relative to t2.
; Both branches use the same slot mapping per source register:
;   m0 -> 0,1,4,5   m1 -> 2,3,6,7   m2 -> 8,9,12,13   m3 -> 10,11,14,15
; In the SSE2 branch DEQUANT leaves m0|m1 in xmm0 and m2|m3 in xmm2, hence the
; eight-index STORE_WORDS calls.
703cabdff1aSopenharmony_ci%macro DEQUANT_STORE 1
704cabdff1aSopenharmony_ci%if cpuflag(sse2)
705cabdff1aSopenharmony_ci    DEQUANT     %1
706cabdff1aSopenharmony_ci    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
707cabdff1aSopenharmony_ci    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
708cabdff1aSopenharmony_ci%else
709cabdff1aSopenharmony_ci    DEQUANT     m0, m1, %1
710cabdff1aSopenharmony_ci    STORE_WORDS m0,  0,  1,  4,  5
711cabdff1aSopenharmony_ci    STORE_WORDS m1,  2,  3,  6,  7
712cabdff1aSopenharmony_ci
713cabdff1aSopenharmony_ci    DEQUANT     m2, m3, %1
714cabdff1aSopenharmony_ci    STORE_WORDS m2,  8,  9, 12, 13
715cabdff1aSopenharmony_ci    STORE_WORDS m3, 10, 11, 14, 15
716cabdff1aSopenharmony_ci%endif
717cabdff1aSopenharmony_ci%endmacro
718cabdff1aSopenharmony_ci
; IDCT_DC_DEQUANT nxmm
; Emits h264_luma_dc_dequant_idct(output, input, qmul). %1 is the XMM register
; count passed to cglobal and WIN64_SPILL_XMM (the SSE2 code paths below touch
; xmm0..xmm6).
; Flow: load sixteen int16 values from input (r1) into m0..m3, run WALSH4_1D,
; TRANSPOSE4x4W, WALSH4_1D, then scale by qmul and scatter the results to
; output through DEQUANT_STORE.
; Temporaries after DECLARE_REG_TMP: t0 = shift count, t1 = scratch,
; t2 = output pointer, t3 = qmul. `shr t3d, t0b` takes a variable count, which
; on x86 must be in cl, so t0 is mapped per ABI to the register aliasing rcx.
719cabdff1aSopenharmony_ci%macro IDCT_DC_DEQUANT 1
720cabdff1aSopenharmony_cicglobal h264_luma_dc_dequant_idct, 3, 4, %1
721cabdff1aSopenharmony_ci    ; manually spill XMM registers for Win64 because
722cabdff1aSopenharmony_ci    ; the code here is initialized with INIT_MMX
723cabdff1aSopenharmony_ci    WIN64_SPILL_XMM %1
724cabdff1aSopenharmony_ci    movq        m3, [r1+24]
725cabdff1aSopenharmony_ci    movq        m2, [r1+16]
726cabdff1aSopenharmony_ci    movq        m1, [r1+ 8]
727cabdff1aSopenharmony_ci    movq        m0, [r1+ 0]
728cabdff1aSopenharmony_ci    WALSH4_1D    0,1,2,3,4
729cabdff1aSopenharmony_ci    TRANSPOSE4x4W 0,1,2,3,4
730cabdff1aSopenharmony_ci    WALSH4_1D    0,1,2,3,4
731cabdff1aSopenharmony_ci
732cabdff1aSopenharmony_ci; shift, tmp, output, qmul
733cabdff1aSopenharmony_ci%if WIN64
734cabdff1aSopenharmony_ci    DECLARE_REG_TMP 0,3,1,2
735cabdff1aSopenharmony_ci    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    ; input (r1) is fully loaded by now, so its register can take over output
736cabdff1aSopenharmony_ci    xchg        r0, t2
737cabdff1aSopenharmony_ci%elif ARCH_X86_64
738cabdff1aSopenharmony_ci    DECLARE_REG_TMP 3,1,0,2
739cabdff1aSopenharmony_ci%else
740cabdff1aSopenharmony_ci    DECLARE_REG_TMP 1,3,0,2
741cabdff1aSopenharmony_ci%endif
742cabdff1aSopenharmony_ci
    ; DEQUANT multiplies with pmaddwd, i.e. by signed 16-bit factors, so a
    ; qmul above 32767 has to be pre-shifted before it can be used.
743cabdff1aSopenharmony_ci    cmp        t3d, 32767
744cabdff1aSopenharmony_ci    jg .big_qmul
745cabdff1aSopenharmony_ci    add        t3d, 128 << 16       ; put 128 in the high word: rounding term for the >> 8
746cabdff1aSopenharmony_ci    DEQUANT_STORE 8
747cabdff1aSopenharmony_ci    RET
748cabdff1aSopenharmony_ci.big_qmul:
    ; Split the >> 8 into a pre-shift of qmul by t0 = min(bsr(qmul), 7) and a
    ; post-multiply shift of 8 - t0.
    ; NOTE(review): this path is only entered with t3d > 32767 (signed), where
    ; bsr yields at least 15, so the clamp appears to always select 7 -- confirm.
749cabdff1aSopenharmony_ci    bsr        t0d, t3d             ; t0 = index of the highest set bit of qmul
750cabdff1aSopenharmony_ci    add        t3d, 128 << 16       ; rounding term, shifted together with qmul below
751cabdff1aSopenharmony_ci    mov        t1d, 7
752cabdff1aSopenharmony_ci    cmp        t0d, t1d
753cabdff1aSopenharmony_ci    cmovg      t0d, t1d             ; t0 = min(t0, 7)
754cabdff1aSopenharmony_ci    inc        t1d                  ; t1 = 8
755cabdff1aSopenharmony_ci    shr        t3d, t0b
756cabdff1aSopenharmony_ci    sub        t1d, t0d             ; t1 = 8 - t0, the remaining right shift
757cabdff1aSopenharmony_ci    movd      xmm6, t1d             ; variable psrad count is taken from an XMM register
758cabdff1aSopenharmony_ci    DEQUANT_STORE xmm6
759cabdff1aSopenharmony_ci    RET
760cabdff1aSopenharmony_ci%endmacro
761cabdff1aSopenharmony_ci
; Instantiate with MMX register naming (m0..m3 are MMX registers) but SSE2
; enabled, so the cpuflag(sse2) branches of the macros above are selected.
762cabdff1aSopenharmony_ciINIT_MMX sse2
763cabdff1aSopenharmony_ciIDCT_DC_DEQUANT 7
764cabdff1aSopenharmony_ci
; Drop the STORE_DIFFx2 definition inherited from x86util.asm before this file
; defines its own. %unmacro needs NASM 2.04 or newer. Compare major and minor
; properly, so that later major releases with a small minor number (e.g. 3.00)
; also take this path instead of redefining the macro on top of the old one.
765cabdff1aSopenharmony_ci%ifdef __NASM_VER__
766cabdff1aSopenharmony_ci%if __NASM_MAJOR__ > 2 || (__NASM_MAJOR__ == 2 && __NASM_MINOR__ >= 4)
767cabdff1aSopenharmony_ci%unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
768cabdff1aSopenharmony_ci%endif
769cabdff1aSopenharmony_ci%endif
; STORE_DIFFx2 -- add two rows of 16-bit residuals to two rows of 8-bit pixels.
;   %1, %2 : residual word registers (shifted right arithmetically by %6 in place)
;   %3, %4 : scratch registers that receive the pixels
;   %5     : register that must hold zero (used to widen bytes and as pack filler)
;   %6     : right-shift amount applied to the residuals
;   %7     : address of the first row; %8 : offset from it to the second row
; Each row: load 4 bytes, widen to words, add the residual, pack with unsigned
; saturation, store 4 bytes back to the same address.
770cabdff1aSopenharmony_ci%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
771cabdff1aSopenharmony_ci    movd       %3, [%7]
772cabdff1aSopenharmony_ci    movd       %4, [%7+%8]
773cabdff1aSopenharmony_ci    psraw      %1, %6
774cabdff1aSopenharmony_ci    psraw      %2, %6
775cabdff1aSopenharmony_ci    punpcklbw  %3, %5               ; bytes -> words (high bytes come from the zero register)
776cabdff1aSopenharmony_ci    punpcklbw  %4, %5
777cabdff1aSopenharmony_ci    paddw      %3, %1
778cabdff1aSopenharmony_ci    paddw      %4, %2
779cabdff1aSopenharmony_ci    packuswb   %3, %5               ; words -> bytes, clamped to 0..255
780cabdff1aSopenharmony_ci    packuswb   %4, %5
781cabdff1aSopenharmony_ci    movd     [%7], %3
782cabdff1aSopenharmony_ci    movd  [%7+%8], %4
783cabdff1aSopenharmony_ci%endmacro
784cabdff1aSopenharmony_ci
; DC_ADD_INIT reg
; Prepares the constants for a DC-only add.
;   in : %1 = GPR whose 32-bit view holds the DC coefficient; stride_q = stride
;   out: m0 = bytes of dc' clamped to 0..255, m1 = bytes of -dc' clamped to
;        0..255, where dc' = (dc + 32) >> 6, so at most one of the two is
;        non-zero; %1 = 3 * stride_q
; The positive/negative split presumably lets the consumer (DC_ADD_MMXEXT_OP,
; defined outside this chunk) apply the offset with unsigned saturating
; add/subtract -- TODO confirm.
785cabdff1aSopenharmony_ci%macro DC_ADD_INIT 1
786cabdff1aSopenharmony_ci    add      %1d, 32                ; round ...
787cabdff1aSopenharmony_ci    sar      %1d, 6                 ; ... and scale: dc' = (dc + 32) >> 6
788cabdff1aSopenharmony_ci    movd     m0, %1d
789cabdff1aSopenharmony_ci    pshuflw  m0, m0, 0              ; replicate dc' across the low four words
790cabdff1aSopenharmony_ci    lea      %1, [3*stride_q]       ; dc' now lives in m0, so %1 is reused for 3*stride
791cabdff1aSopenharmony_ci    pxor     m1, m1
792cabdff1aSopenharmony_ci    psubw    m1, m0                 ; m1 = -dc'
793cabdff1aSopenharmony_ci    packuswb m0, m0                 ; unsigned saturation turns negative words into 0
794cabdff1aSopenharmony_ci    packuswb m1, m1
795cabdff1aSopenharmony_ci%endmacro
796cabdff1aSopenharmony_ci
; IDCT_XMM cpu
; Emits, for the instruction set named by %1 (via INIT_XMM), two functions
; that both take (dst, block, stride):
;   h264_idct_add_8    -- hands dst/block/stride to IDCT4_ADD (defined outside
;                         this chunk).
;   h264_idct_dc_add_8 -- DC-only variant: reads the first coefficient of
;                         block, zeroes the first four bytes of block, then
;                         applies the rounded DC to dst through DC_ADD_INIT and
;                         DC_ADD_MMXEXT_OP (the latter defined outside this
;                         chunk).
797cabdff1aSopenharmony_ci%macro IDCT_XMM 1
798cabdff1aSopenharmony_ci
799cabdff1aSopenharmony_ciINIT_XMM %1
800cabdff1aSopenharmony_ci
801cabdff1aSopenharmony_cicglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
802cabdff1aSopenharmony_ci    movsxdifnidn stride_q, stride_d
803cabdff1aSopenharmony_ci    IDCT4_ADD    dst_q, block_q, stride_q
804cabdff1aSopenharmony_ciRET
805cabdff1aSopenharmony_ci
806cabdff1aSopenharmony_cicglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
807cabdff1aSopenharmony_ci    movsxdifnidn stride_q, stride_d
808cabdff1aSopenharmony_ci    movsx             r3d, word [block_q]   ; r3 = sign-extended DC coefficient
809cabdff1aSopenharmony_ci    mov   dword [block_q], 0                ; clears 4 bytes: the DC word and the word after it
810cabdff1aSopenharmony_ci    DC_ADD_INIT r3                          ; leaves r3 = 3 * stride_q
811cabdff1aSopenharmony_ci    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
812cabdff1aSopenharmony_ciRET
813cabdff1aSopenharmony_ci
814cabdff1aSopenharmony_ci%endmacro
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_ciIDCT_XMM sse2
817cabdff1aSopenharmony_ciIDCT_XMM avx
818