1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* FLAC DSP SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (C) 2014 Loren Merritt
5cabdff1aSopenharmony_ci;* Copyright (C) 2014 James Almer
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION .text
27cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; PMACSDQL acc, a, b, unused, unused
;
; Signed 32x32->64 multiply-accumulate on the low dword of every qword lane:
;     acc += a * b
;
; %1 = accumulator (read and written)
; %2 = first factor; overwritten with the products on the non-XOP path, so
;      callers must treat it as clobbered
; %3 = second factor (read only)
; %4, %5 are accepted but never referenced by the macro body.
;------------------------------------------------------------------------------
%macro PMACSDQL 5
%if notcpuflag(xop)
    ; generic path: separate multiply and 64-bit add
    pmuldq   %2, %3
    paddq    %1, %2
%else
    ; XOP provides the fused multiply-accumulate directly
    pmacsdql %1, %2, %3, %1
%endif
%endmacro
36cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; LPC_32 <cpu>
;
; Emits flac_lpc_32 for instruction set %1.
; Arguments (in cglobal order): decoded, coeffs, pred_order, qlevel, len.
; One extra scratch GPR (j) and five XMM registers are used.
;
; For every sample index i in [pred_order, len) the routine performs
;     decoded[i] += (sum over k of coeffs[k] * decoded[i - pred_order + k])
;                   >> qlevel
; with k in [0, pred_order). Products are accumulated in 64 bits and only the
; low dword of the shifted sum is added back.
;
; Two output samples are produced per outer iteration:
;     m2 = accumulator for sample i
;     m3 = accumulator for sample i+1
;     m0 = current decoded[] value, m1 = current coeffs[] value
;     m4 = qlevel (shift count)
; The inner loop interleaves both accumulations so that each loaded value is
; used for both samples.
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd            ; len = number of samples to predict
    jle .ret                            ; nothing to do
    ; pred_order is a 32-bit argument; widen it before the full-width
    ; register is used in address arithmetic below, since the upper half of
    ; the register is not guaranteed to be defined on 64-bit targets.
    ; (No instruction is emitted when the q and d names are the same register.)
    movsxdifnidn pred_orderq, pred_orderd
    lea    decodedq, [decodedq+pred_orderq*4-8] ; biased so the +8 below lands on decoded[pred_order]
    lea    coeffsq, [coeffsq+pred_orderq*4]     ; one past the last coefficient
    neg    pred_orderq                          ; negative index counts up towards 0
    movd   m4, qlevelm
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8]       ; oldest history sample for output i
    add    decodedq, 8                          ; advance by two samples
    movd   m1, [coeffsq+pred_orderq*4]          ; coeffs[0]
    pxor   m2, m2
    pxor   m3, m3
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order                               ; pred_order == 1: no inner iterations
.loop_order:
    PMACSDQL m2, m0, m1, m2, m0                 ; sample i   += decoded * coeff
    movd   m0, [decodedq+jq*4]
    PMACSDQL m3, m1, m0, m3, m1                 ; sample i+1 += coeff * next decoded
    movd   m1, [coeffsq+jq*4]
    inc    jq
    jl .loop_order
.end_order:
    PMACSDQL m2, m0, m1, m2, m0                 ; last tap for sample i
    psrlq  m2, m4
    movd   m0, [decodedq]
    paddd  m0, m2
    movd   [decodedq], m0                       ; decoded[i] finished; m0 keeps it for the i+1 tap
    sub  lend, 2
    jl .ret                                     ; odd count: only sample i existed
    PMACSDQL m3, m1, m0, m3, m1                 ; last tap for sample i+1 uses the fresh decoded[i]
    psrlq  m3, m4
    movd   m1, [decodedq+4]
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample                             ; flags still come from "sub lend, 2"
.ret:
    REP_RET
%endmacro

%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4
85cabdff1aSopenharmony_ci
;----------------------------------------------------------------------------------
; void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                         int len, int shift);
;
; FLAC_DECORRELATE_16 name, first, second [, op]
;   %1 = function name infix (ls / rs / ms / indep2)
;   %2 = index of the xmm register written first into each output pair
;   %3 = index of the xmm register written second into each output pair
;   %4 = "add" or "sub": m2 = m0 <op> m1 (not used for indep2)
;
; Each iteration reads four dwords from in[0] and in[1], optionally combines
; them, saturates to words, interleaves the two channels, shifts every word
; left by "shift" and stores 16 bytes to out[0].
; lenq runs from -4*len up to 0 and indexes all three buffers.
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
    ; fetch the real buffer pointers from the pointer arrays
    mov      in1q, [in0q + gprsize]
    mov      in0q, [in0q]
    mov      outq, [outq]
%if ARCH_X86_32
    mov      lend, lenm
%endif
    shl      lend, 2                    ; samples -> bytes
    movd       m3, r4m                  ; shift count

    ; point past the end of every buffer and count a negative offset up to 0
    add      outq, lenq
    add      in0q, lenq
    add      in1q, lenq
    neg      lenq

align 16
.loop:
    mova       m1, [in1q + lenq]
    mova       m0, [in0q + lenq]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 -= m1 >> 1
%endif
%ifnidn %1, indep2
    p%4d       m2, m0, m1               ; m2 = m0 <op> m1
%endif
    packssdw  m%3, m%3
    packssdw  m%2, m%2
    punpcklwd m%2, m%3                  ; interleave the two channels
    psllw     m%2, m3
    mova [outq + lenq], m%2
    add      lenq, 16
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add
130cabdff1aSopenharmony_ci
;----------------------------------------------------------------------------------
; void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                         int len, int shift);
;
; FLAC_DECORRELATE_32 name, first, second, tmp, op
;   %1 = function name infix (ls / rs / ms)
;   %2 = index of the xmm register holding the first output channel
;   %3 = index of the xmm register holding the second output channel
;   %4 = index of the scratch xmm register handed to SBUTTERFLY
;   %5 = "add" or "sub": m2 = m0 <op> m1
;
; Each iteration reads four dwords per input channel, combines them, shifts
; both result vectors left by "shift", interleaves them dword-wise and stores
; 2*mmsize bytes to out[0]. in1q holds the distance in[1] - in[0] so that a
; single advancing pointer addresses both inputs.
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
    ; fetch the real buffer pointers from the pointer arrays
    mov      in1q, [in0q + gprsize]
    mov      in0q, [in0q]
    mov      outq, [outq]
    sub      in1q, in0q                 ; in1q = offset from in[0] to in[1]
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                  ; shift count

align 16
.loop:
    mova       m1, [in0q + in1q]
    mova       m0, [in0q]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 -= m1 >> 1
%endif
    p%5d       m2, m0, m1               ; m2 = m0 <op> m1
    pslld     m%3, m3
    pslld     m%2, m3

    SBUTTERFLY dq, %2, %3, %4           ; interleave the two channels

    mova  [outq + mmsize], m%3
    mova  [outq         ], m%2

    add      outq, mmsize*2
    add      in0q, mmsize
    sub      lend, mmsize/4             ; mmsize/4 samples consumed per pass
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add
174cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------------------
; void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
;                                                int len, int shift);
;
; FLAC_DECORRELATE_INDEP bps, channels, shiftreg, size
;   %1 = bps (16 or 32)
;   %2 = channels (2, 4, 6 or 8)
;   %3 = index of the last xmm register used; it holds the shift count
;   %4 = w / d, element size suffix of the psll shift instruction
;
; Every pass takes mmsize/4 samples from each input channel, interleaves the
; channels (packing to words first when bps == 16), shifts left by "shift"
; and writes REPCOUNT vectors to out[0]. in1q..in<ch-1>q hold the distance of
; each channel buffer from in[0], so only in0q has to advance.
;-----------------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32 && %2 == 6
    ; six channel pointers are named here, so len is left in its stack slot
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define  lend  dword r3m
%elif ARCH_X86_32
    mov      lend, lenm
%endif
    ; read the shift count before the in<N> pointers are loaded: r4 is also
    ; the register named in2 in the cglobal argument list
    movd      m%3, r4m

    mov      outq, [outq]

    ; load the remaining channel pointers while in0q still addresses the array
%assign %%i 1
%rep %2-1
    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov      in0q, [in0q]

    ; turn the channel pointers into offsets relative to in[0]
%assign %%i 1
%rep %2-1
    sub      in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova       m0, [in0q]

%assign %%i 1
%rep REPCOUNT-1
    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

    ; dword samples: transpose channel-major vectors into sample-major order
%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    punpcklqdq m6, m0, m2
    punpckhqdq m2, m4
    shufps     m4, m0, 0xe4
    punpcklqdq m0, m1, m3
    punpckhqdq m3, m5
    shufps     m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%else ; %1 == 16

    ; word samples: pack the upper half of the channels straight from memory
    ; into the registers already holding the lower half, then interleave
%if %2 == 8
    packssdw   m0, [in0q + in4q]
    packssdw   m1, [in0q + in5q]
    packssdw   m2, [in0q + in6q]
    packssdw   m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw   m0, [in0q + in3q]
    packssdw   m1, [in0q + in4q]
    packssdw   m2, [in0q + in5q]
    pshufd     m3, m0,     q1032
    punpcklwd  m0, m1
    punpckhwd  m1, m2
    punpcklwd  m2, m3

    shufps     m3, m0, m2, q2020
    shufps     m0, m1,     q2031
    shufps     m2, m1,     q3131
    shufps     m1, m2, m3, q3120
    shufps     m3, m0,     q0220
    shufps     m0, m2,     q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw   m0, [in0q + in2q]
    packssdw   m1, [in0q + in3q]
    SBUTTERFLY wd, 0, 1, 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%endif

    ; apply the shift to every result vector and write it out
%assign %%i 0
%rep REPCOUNT
    psll%4   m %+ %%i, m%3
    mova [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add      outq, mmsize*REPCOUNT
    add      in0q, mmsize
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
; the 8-channel variants are only built for x86-64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif
314