;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f (2^24 encoded as an IEEE-754 single) - used in
; ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
; ac3_bap_bits is defined elsewhere; it is read here as 16 words fed to pmaddwd.
cextern ac3_bap_bits
; Per-word multipliers for pmulhuw, which keeps the high 16 bits of x*m:
; 21846 ~= 65536/3 (x/3), 32768 = 65536/2 (x/2), 0 discards the word.
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
; Per-word weights applied with pmaddwd to the scaled words above.
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
cextern pd_1
; 151 = 127 (IEEE-754 single exponent bias) + 24
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every byte position i, stores into exp[i] the unsigned minimum of
; exp[i + 256*b] for b = 0..num_reuse_blocks (rows are 256 bytes apart).
; Returns immediately when num_reuse_blocks is 0.
;
; Works on mmsize bytes per outer iteration and always runs at least one
; iteration, so nb_coefs is effectively rounded up to a multiple of mmsize.
; Loads and stores use mova, so exp is expected to be mmsize-aligned.
;
; Registers: offset is scratch (byte offset of the row being read),
;            m0 = running minimum, m1 = scratch for PMINUB.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    ; The int arguments are only touched through their 32-bit views: the
    ; upper half of a 64-bit register holding an int argument is not
    ; guaranteed to be extended by the caller. Writing the 32-bit register
    ; zero-extends, so reuse_blksq is valid as an offset afterwards.
    shl  reuse_blksd, 8                 ; byte offset of the last row
    jz .end
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]    ; start from the last row
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                        ; loop until offset goes below 0
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN ALIGN 16
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; dst[i] = (int32_t)(src[i] * 16777216.0f) for each element; the conversion
; is done by cvtps2dq, so rounding follows the current MXCSR rounding mode.
;
; One iteration handles 32 floats when register m8 is available and 16
; otherwise. The loop body always runs at least once and stops when the
; remaining count reaches zero or wraps, so len is effectively rounded up to
; a whole number of iterations. src and dst are accessed with aligned moves
; (movaps/movdqa) and therefore must be 16-byte aligned.
;
; Registers: m0 = scale factor, m1-m4 (and m5-m8 if present) = data.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit argument: count it down through its 32-bit view so
    ; that unspecified upper register bits cannot influence the loop.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop                            ; unsigned: continue while len > 0
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

; PHADDD4 src, tmp
; Horizontal add of the four dwords of xmm register %1.
; The total ends up in the low dword of %1; the other lanes of %1 and the
; whole of %2 are left with intermediate values and must be treated as
; clobbered.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1                     ; tmp.lo64 = src.hi64
    paddd    %1, %2                     ; src[0] = d0+d2, src[1] = d1+d3
    pshufd   %2, %1, 0x1                ; tmp[0] = src[1]
    paddd    %1, %2                     ; src[0] = d0+d1+d2+d3
%endmacro

; Returns in eax the sum of two terms computed from the 6x16 table of
; 16-bit counters:
;   1. All six rows are added column-wise (16-bit wrapping adds), each column
;      total is multiplied by the matching word of ac3_bap_bits, and the
;      products are summed.
;   2. For columns 1, 2 and 4 of every row: the count is scaled with
;      pw_bap_mul1 (about count/3 for columns 1 and 2, count/2 for column 4),
;      the scaled values are added across rows with unsigned saturation,
;      weighted by pw_bap_mul2 (5, 7 and 7) and summed. Column 3 is loaded as
;      part of the same 8-byte read but multiplied by 0.
; The table is read with movdqa, so mant_cnt must be 16-byte aligned.
;
; Registers: sum is a scratch GPR holding term 1; m0-m3 are scratch.
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; term 1: column totals over the six 32-byte rows (two vectors per row)
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0
    ; term 2: words 1..4 of each row, two rows packed per register
    ; (+2 bytes skips column 0; rows are 32 bytes apart)
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd                ; return term 1 + term 2
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

; PABSD src/dst [, tmp]
; Per-dword absolute value of %1, in place.
; With SSSE3 this is a single pabsd and %2 is not referenced. Without SSSE3
; %2 is required and is clobbered: it receives the sign mask (all ones for
; negative lanes), and abs is computed as (x ^ mask) - mask.
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1                     ; tmp = (0 > x) ? -1 : 0
    pxor     %1, %2
    psubd    %1, %2
%endif
%endmacro

; For each coefficient, writes one byte:
;     exp[i] = clip_uint8(151 - biased_float_exponent(2*|coef[i]| + 1))
; which equals 24 minus the number of significant bits in |coef[i]|
; (24 for a zero coefficient), clipped at 0.
;
; Four coefficients are handled per iteration and the loop body always runs
; at least once, so nb_coefs is effectively rounded up to a multiple of 4.
; coef is read with mova and must be 16-byte aligned; exp is written 4 bytes
; at a time with movd.
;
; Registers: len becomes a negative index counting up to 0,
;            m0/m1 = scratch, m2 = pd_1, m3 = pd_151.
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; nb_coefs is an int: sign-extend it before using it in 64-bit address
    ; arithmetic, since the caller need not extend the upper register half.
    movsxdifnidn lenq, lend
    ; point both arrays at their end and index with a negative counter
    add     expq, lenq
    lea    coefq, [coefq+4*lenq]
    neg     lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to m0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2
    cvtdq2ps  m1, m0
    psrld     m1, 23
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd  [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif