1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (C) 2015 James Almer
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ciSECTION_RODATA
26cabdff1aSopenharmony_ci
; Sign-bit mask for float lanes 1 and 3 only: XORing a vector with it negates
; the odd lanes and leaves the even lanes untouched (+, -, +, -).
ps_p1m1p1m1: dd 0, 1 << 31, 0, 1 << 31
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ciSECTION .text
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_ci;*************************************************************************
32cabdff1aSopenharmony_ci;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
33cabdff1aSopenharmony_ci;*************************************************************************
; PS_ADD_SQUARES xmm_count
;   %1 = number of XMM registers declared to cglobal for this build.
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    ; Rebase src to the end of the input and turn n (a count of float pairs)
    ; into a negative byte offset that counts up towards zero.
    shl    nd, 3                    ; 8 bytes per pair
    add  srcq, nq
    neg    nq

align 16
.accumulate:
    ; Each pass reads 2*mmsize bytes of src and updates mmsize bytes of dst.
    ; The body runs before n is ever tested, so at least one pass is made.
    movaps m1, [srcq+nq+mmsize]
    movaps m0, [srcq+nq]
    mulps  m1, m1                   ; square every float
    mulps  m0, m0
    HADDPS m0, m1, m2               ; x86util macro; expected to leave the pairwise sums in m0
    addps  m0, [dstq]               ; accumulate onto the existing dst values
    movaps [dstq], m0
    add  dstq, mmsize
    add    nq, mmsize*2             ; must stay last: its flags feed the branch
    jl .accumulate
    REP_RET
%endmacro

; NOTE(review): the SSE build hands m2 to HADDPS as a third operand but
; declares only 2 XMM registers, while the SSE3 build declares 3. Confirm
; against HADDPS in x86util.asm whether the two counts are swapped.
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
59cabdff1aSopenharmony_ci
60cabdff1aSopenharmony_ci;*******************************************************************
61cabdff1aSopenharmony_ci;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
62cabdff1aSopenharmony_ci;                                   float *src1, int n);
63cabdff1aSopenharmony_ci;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    ; dst and src1 are rebased to their ends and share one negative byte
    ; offset (8 bytes per float pair); src2 is walked forwards on its own.
    shl      nd, 3
    add    dstq, nq
    add   src1q, nq
    neg      nq

align 16
.multiply:
    ; One pass: four floats from src2 scale four float pairs from src1.
    mova     m2, [src2q]            ; s0 s1 s2 s3
    movu     m0, [src1q+nq]
    movu     m1, [src1q+nq+mmsize]
    mova     m3, m2
    unpcklps m2, m2                 ; s0 s0 s1 s1
    unpckhps m3, m3                 ; s2 s2 s3 s3
    mulps    m0, m2                 ; both halves of a pair get the same scale
    mulps    m1, m3
    add   src2q, mmsize
    mova [dstq+nq], m0
    mova [dstq+nq+mmsize], m1
    add      nq, mmsize*2           ; must stay last: its flags feed the branch
    jl .multiply
    REP_RET
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci;***********************************************************************
89cabdff1aSopenharmony_ci;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
90cabdff1aSopenharmony_ci;                                   float h[2][4], float h_step[2][4],
91cabdff1aSopenharmony_ci;                                   int len);
92cabdff1aSopenharmony_ci;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    ; l and r are rebased to their ends; n becomes a negative byte offset
    ; (8 bytes per float pair) that counts up to zero.
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

    ; Only the first four floats of h and h_step are read. Each value is
    ; duplicated across a lane pair:
    ;   m4 = h0 h0 h1 h1        m5 = step0 step0 step1 step1
    ;   m0 = h2 h2 h3 h3        m1 = step2 step2 step3 step3
    movaps   m0, [hq]
    movaps   m1, [h_stepq]
    unpcklps m4, m0, m0
    unpcklps m5, m1, m1
    unpckhps m0, m0
    unpckhps m1, m1

align 16
.sample:
    movddup  m2, [lq+nq]            ; l pair duplicated into both halves
    movddup  m3, [rq+nq]            ; r pair duplicated into both halves
    addps    m4, m5                 ; coefficients are stepped before they are used
    addps    m0, m1
    mulps    m2, m4                 ; (h0*l, h1*l)
    mulps    m3, m0                 ; (h2*r, h3*r)
    addps    m2, m3                 ; low half = h0*l + h2*r, high half = h1*l + h3*r
    movsd  [lq+nq], m2              ; low half replaces the l sample
    movhps [rq+nq], m2              ; high half replaces the r sample
    add      nq, 8
    jl .sample
    REP_RET
120cabdff1aSopenharmony_ci
;******************************************************************************
;void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                          float h[2][4], float h_step[2][4],
;                                          int len);
;******************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    ; l and r are rebased to their ends; n becomes a negative byte offset
    ; (8 bytes per float pair) that counts up to zero.
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

    ; m0 / m1 hold the first and second group of four coefficients from h.
    movaps   m0, [hq]
    movaps   m1, [hq+mmsize]
    ; The per-sample steps live in m8/m9 on x86-64; elsewhere they are read
    ; from memory on every pass.
%if ARCH_X86_64
    movaps   m8, [h_stepq]
    movaps   m9, [h_stepq+mmsize]
    %define  H_STEP0 m8
    %define  H_STEP1 m9
%else
    %define  H_STEP0 [h_stepq]
    %define  H_STEP1 [h_stepq+mmsize]
%endif

align 16
.sample:
    addps    m0, H_STEP0            ; coefficients are stepped before they are used
    addps    m1, H_STEP1

    ; l sample: plain copy in m2, copy with the pair swapped in m4.
    movddup  m2, [lq+nq]
    unpcklps m6, m0, m0             ; first-group coefficients 0 0 1 1
    shufps   m4, m2, m2, q2301
    mulps    m2, m6

    ; r sample: plain copy in m3, copy with the pair swapped in m5.
    movddup  m3, [rq+nq]
    unpckhps m7, m0, m0             ; first-group coefficients 2 2 3 3
    shufps   m5, m3, m3, q2301
    mulps    m3, m7

    ; Swapped copies are scaled by the second coefficient group.
    unpcklps m6, m1, m1
    unpckhps m7, m1, m1
    mulps    m4, m6
    mulps    m5, m7

    ; Accumulation order is kept fixed: plain products first, then the two
    ; swapped products with addsubps (subtract in even lanes, add in odd).
    addps    m2, m3
    addsubps m2, m4
    addsubps m2, m5
    movsd  [lq+nq], m2              ; low half replaces the l sample
    movhps [rq+nq], m2              ; high half replaces the r sample
    add      nq, 8
    jl .sample
    REP_RET
168cabdff1aSopenharmony_ci
;**************************************************************
;void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
;                                      float in[2][38][64],
;                                      int i, int len)
;**************************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    ; Only three arguments are loaded. The len register is overwritten with a
    ; fixed 256-byte stride below, so the caller's len value is never read.
    movsxdifnidn        iq, id
    lea                inq, [inq+iq*4]          ; in  += i floats
    mov               tmpd, id
    shl               tmpd, 8
    add               outq, tmpq                ; out += i * 256 bytes
    neg                 id
    add                 id, 64                  ; i = 64 - i: columns still to process
    mov               lend, 32 << 3             ; 256 bytes: row stride used for both in and out

    ; Peel one and/or two columns first so the 4-column loop is left with a
    ; multiple of four.
    test                id, 1
    jne .single
    test                id, 2
    jne .pair

; Four columns per pass. in0/in1 walk the two input planes, which are
; 38*64*4 bytes apart; every inner step covers two input rows.
align 16
.quad:
    mov               in0q, inq
    lea               in1q, [inq+38*64*4]
    mov               tmpd, lend

.quad_rows:
    movaps              m0, [in0q]
    movaps              m1, [in1q]
    movaps              m2, [in0q+lenq]
    movaps              m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    ; After the transpose each register holds one column's data and goes to
    ; its own 256-byte output row.
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    movaps   [outq+lenq*2], m2
    movaps [outq+3*32*2*4], m3
    add               outq, mmsize
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    sub               tmpd, mmsize
    jg .quad_rows
    add                inq, 16                  ; next four columns
    add               outq, 3*32*2*4            ; skip the three rows written alongside
    sub                 id, 4
    jg .quad
    RET

; Two columns in one pass, then continue with the 4-column loop.
align 16
.pair:
    mov               in0q, inq
    lea               in1q, [inq+38*64*4]
    mov               tmpd, lend

.pair_rows:
    movlps              m0, [in0q]
    movhps              m0, [in0q+lenq]
    movlps              m1, [in1q]
    movhps              m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    add               outq, mmsize
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    sub               tmpd, mmsize
    jg .pair_rows
    add                inq, 8                   ; next two columns
    add               outq, lenq                ; skip the row written alongside
    sub                 id, 2
    jg .quad
    RET

; A single column, then dispatch on what is left.
align 16
.single:
    mov               in0q, inq
    lea               in1q, [inq+38*64*4]
    mov               tmpd, lend

.single_rows:
    movss               m0, [in0q]
    movss               m1, [in1q]
    movlhps             m0, m1                  ; plane0[row]   .  plane1[row]   .
    movss               m2, [in0q+lenq]
    movss               m3, [in1q+lenq]
    movlhps             m2, m3                  ; plane0[row+1] .  plane1[row+1] .
    shufps              m0, m2, q2020           ; keep lanes 0 and 2 of each
    movaps          [outq], m0
    add               outq, mmsize
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    sub               tmpd, mmsize
    jg .single_rows
    add                inq, 4                   ; next column
    sub                 id, 1
    test                id, 2
    jne .pair
    cmp                 id, 4
    jge .quad
    RET
274cabdff1aSopenharmony_ci
;***************************************************************
;void ff_ps_hybrid_synthesis_deint_<opt>(float out[2][38][64],
;                                        float (*in)[32][2],
;                                        int i, int len)
;***************************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
; MOVH is the 8-byte low-half store used by the 2-column path.
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    ; Only three arguments are loaded. The len register is overwritten with a
    ; fixed 256-byte stride below, so the caller's len value is never read.
    movsxdifnidn        iq, id
    lea               outq, [outq+iq*4]         ; out += i floats
    mov               tmpd, id
    shl               tmpd, 8
    add                inq, tmpq                ; in  += i * 256 bytes
    neg                 id
    add                 id, 64                  ; i = 64 - i: columns still to process
    mov               lend, 32 << 3             ; 256 bytes: row stride used for both in and out

    ; Peel one and/or two columns first so the 4-column loop is left with a
    ; multiple of four.
    test                id, 1
    jne .single
    test                id, 2
    jne .pair

; Four columns per pass. out0/out1 walk the two output planes, which are
; 38*64*4 bytes apart; every inner step covers two output rows.
align 16
.quad:
    mov              out0q, outq
    lea              out1q, [outq+38*64*4]
    mov               tmpd, lend

.quad_rows:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    movaps              m2, [inq+lenq*2]
    movaps              m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps         [out0q], m0
    movaps         [out1q], m1
    movaps    [out0q+lenq], m2
    movaps    [out1q+lenq], m3
    add                inq, mmsize
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .quad_rows
    add               outq, 16                  ; next four columns
    add                inq, 3*32*2*4            ; skip the three rows read alongside
    sub                 id, 4
    jg .quad
    RET

; Two columns in one pass, then continue with the 4-column loop.
align 16
.pair:
    mov              out0q, outq
    lea              out1q, [outq+38*64*4]
    mov               tmpd, lend

.pair_rows:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH           [out0q], m0
    movhps    [out0q+lenq], m0
    MOVH           [out1q], m1
    movhps    [out1q+lenq], m1
    add                inq, mmsize
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .pair_rows
    add               outq, 8                   ; next two columns
    add                inq, lenq                ; skip the row read alongside
    sub                 id, 2
    jg .quad
    RET

; A single column, then dispatch on what is left.
align 16
.single:
    mov              out0q, outq
    lea              out1q, [outq+38*64*4]
    mov               tmpd, lend

.single_rows:
    ; The four loaded floats are scattered one at a time:
    ; lane 0 -> out0, lane 1 -> out1, lane 2 -> out0 + row, lane 3 -> out1 + row.
    movaps              m0, [inq]
    movss          [out0q], m0
%if cpuflag(sse4)
    extractps      [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps             m1, m0                  ; bring lane 2 down to lane 0
    movss     [out0q+lenq], m1
    shufps              m0, m0, q2301           ; swap lanes within each pair
    movss          [out1q], m0                  ; former lane 1
    movhlps             m1, m0
    movss     [out1q+lenq], m1                  ; former lane 3
%endif
    add                inq, mmsize
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .single_rows
    add               outq, 4                   ; next column
    sub                 id, 1
    test                id, 2
    jne .pair
    cmp                 id, 4
    jge .quad
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
397cabdff1aSopenharmony_ci
398cabdff1aSopenharmony_ci;*******************************************************************
399cabdff1aSopenharmony_ci;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
400cabdff1aSopenharmony_ci;                                 const float (*filter)[8][2],
401cabdff1aSopenharmony_ci;                                 ptrdiff_t stride, int n);
402cabdff1aSopenharmony_ci;*******************************************************************
; PS_HYBRID_ANALYSIS_LOOP prod_a, prod_b, step
;   %1, %2 : registers that receive this step's two product vectors
;   %3     : step index; selects the 16-byte chunk of in and filter to use
; When %3 is non-zero the products are also added into m0 (%1) and m3 (%2).
; Scratch: m1, m2, m4. The non-SSE3 path additionally reads the mask in m7.
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu     %1, [inq+mmsize*%3]                ; chunk counted from the front
    movu     m1, [inq+mmsize*(5-%3)+8]          ; chunk counted from the far end
%if cpuflag(sse3)
    pshufd   %2, %1, q2301                      ; front chunk, floats swapped within each pair
    pshufd   m4, m1, q0123                      ; far chunk, all four lanes reversed
    pshufd   m1, m1, q1032                      ; far chunk, 64-bit halves swapped
    pshufd   m2, [filterq+nq+mmsize*%3], q2301  ; filter chunk, floats swapped within each pair
    addsubps %2, m4                             ; subtract in even lanes, add in odd lanes
    addsubps %1, m1
%else
    shufps   %2, %1, %1, q2301                  ; front chunk, floats swapped within each pair
    shufps   m4, m1, m1, q0123                  ; far chunk, all four lanes reversed
    shufps   m1, m1, q1032                      ; far chunk, 64-bit halves swapped
    mova     m2, [filterq+nq+mmsize*%3]
    shufps   m2, m2, q2301                      ; filter chunk, floats swapped within each pair
    ; Negating the odd lanes and then subtracting gives the same
    ; subtract-even / add-odd result as addsubps above.
    xorps    m4, m7
    xorps    m1, m7
    subps    %2, m4
    subps    %1, m1
%endif
    mulps    %2, m2
    mulps    %1, m2
%if %3
    addps    m3, %2
    addps    m0, %1
%endif
%endmacro
433cabdff1aSopenharmony_ci
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
; MOVH is the 8-byte low-half store used to write each result.
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    mova m7, [ps_p1m1p1m1]          ; sign mask for the odd lanes
    shl nd, 6                       ; 64 bytes of filter data per output
    add filterq, nq                 ; filter is rebased to its end ...
    neg nq                          ; ... and indexed by a negative offset
    shl strideq, 3                  ; stride is given in 8-byte units

align 16
.output:
    ; Three steps cover the paired input chunks. Step 0 leaves its products
    ; in m0/m3; steps 1 and 2 add theirs into the same two registers.
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

    ; Fold the two accumulators down to a single float pair in the low
    ; half of m1, and fetch the input pair at byte offset 6*8 into m2.
%if cpuflag(sse3)
    xorps    m0, m7
    pshufd   m3, m3, q2301
    hsubps   m3, m0
    pshufd   m1, m3, q0020
    pshufd   m3, m3, q0031
    addps    m1, m3
    movsd    m2, [inq+6*8]
%else
    shufps   m1, m3, m3, q2301
    shufps   m2, m0, m0, q2301
    subps    m1, m3
    addps    m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps    m1, m3
    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    ; The remaining input pair is scaled by one filter float, broadcast to
    ; every lane, and added to the folded sum.
    movss    m3, [filterq+nq+8*6]
    SPLATD   m3
    mulps    m2, m3
    addps    m1, m2
    MOVH [outq], m1
    add    outq, strideq
    add      nq, 64                 ; must stay last: its flags feed the branch
    jl .output
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
488