/*
 * SIMD-optimized LPC functions
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
23cabdff1aSopenharmony_ci#include "libavutil/cpu.h"
24cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h"
25cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h"
26cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h"
27cabdff1aSopenharmony_ci#include "libavcodec/lpc.h"
28cabdff1aSopenharmony_ci
/*
 * Packed-double constants { 1.0, 1.0 } and { 2.0, 2.0 }, referenced by name
 * from the inline assembly below through MANGLE().
 * NOTE(review): the first macro argument is presumably the alignment in bytes
 * (the movapd loads of these symbols need 16) -- confirm against the
 * definition of DECLARE_ASM_CONST.
 */
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci#if HAVE_SSE2_INLINE
33cabdff1aSopenharmony_ci
34cabdff1aSopenharmony_cistatic void lpc_apply_welch_window_sse2(const int32_t *data, int len,
35cabdff1aSopenharmony_ci                                        double *w_data)
36cabdff1aSopenharmony_ci{
37cabdff1aSopenharmony_ci    double c = 2.0 / (len-1.0);
38cabdff1aSopenharmony_ci    int n2 = len>>1;
39cabdff1aSopenharmony_ci    x86_reg i = -n2*sizeof(int32_t);
40cabdff1aSopenharmony_ci    x86_reg j =  n2*sizeof(int32_t);
41cabdff1aSopenharmony_ci    __asm__ volatile(
42cabdff1aSopenharmony_ci        "movsd   %4,     %%xmm7                \n\t"
43cabdff1aSopenharmony_ci        "movapd  "MANGLE(pd_1)", %%xmm6        \n\t"
44cabdff1aSopenharmony_ci        "movapd  "MANGLE(pd_2)", %%xmm5        \n\t"
45cabdff1aSopenharmony_ci        "movlhps %%xmm7, %%xmm7                \n\t"
46cabdff1aSopenharmony_ci        "subpd   %%xmm5, %%xmm7                \n\t"
47cabdff1aSopenharmony_ci        "addsd   %%xmm6, %%xmm7                \n\t"
48cabdff1aSopenharmony_ci        "test    $1,     %5                    \n\t"
49cabdff1aSopenharmony_ci        "jz      2f                            \n\t"
50cabdff1aSopenharmony_ci#define WELCH(MOVPD, offset)\
51cabdff1aSopenharmony_ci        "1:                                    \n\t"\
52cabdff1aSopenharmony_ci        "movapd   %%xmm7,  %%xmm1              \n\t"\
53cabdff1aSopenharmony_ci        "mulpd    %%xmm1,  %%xmm1              \n\t"\
54cabdff1aSopenharmony_ci        "movapd   %%xmm6,  %%xmm0              \n\t"\
55cabdff1aSopenharmony_ci        "subpd    %%xmm1,  %%xmm0              \n\t"\
56cabdff1aSopenharmony_ci        "pshufd   $0x4e,   %%xmm0, %%xmm1      \n\t"\
57cabdff1aSopenharmony_ci        "cvtpi2pd (%3,%0), %%xmm2              \n\t"\
58cabdff1aSopenharmony_ci        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3   \n\t"\
59cabdff1aSopenharmony_ci        "mulpd    %%xmm0,  %%xmm2              \n\t"\
60cabdff1aSopenharmony_ci        "mulpd    %%xmm1,  %%xmm3              \n\t"\
61cabdff1aSopenharmony_ci        "movapd   %%xmm2, (%2,%0,2)            \n\t"\
62cabdff1aSopenharmony_ci        MOVPD"    %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
63cabdff1aSopenharmony_ci        "subpd    %%xmm5,  %%xmm7              \n\t"\
64cabdff1aSopenharmony_ci        "sub      $8,      %1                  \n\t"\
65cabdff1aSopenharmony_ci        "add      $8,      %0                  \n\t"\
66cabdff1aSopenharmony_ci        "jl 1b                                 \n\t"\
67cabdff1aSopenharmony_ci
68cabdff1aSopenharmony_ci        WELCH("movupd", -1)
69cabdff1aSopenharmony_ci        "jmp 3f                                \n\t"
70cabdff1aSopenharmony_ci        "2:                                    \n\t"
71cabdff1aSopenharmony_ci        WELCH("movapd", -2)
72cabdff1aSopenharmony_ci        "3:                                    \n\t"
73cabdff1aSopenharmony_ci        :"+&r"(i), "+&r"(j)
74cabdff1aSopenharmony_ci        :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
75cabdff1aSopenharmony_ci         NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
76cabdff1aSopenharmony_ci         XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
77cabdff1aSopenharmony_ci                                    "%xmm5", "%xmm6", "%xmm7")
78cabdff1aSopenharmony_ci    );
79cabdff1aSopenharmony_ci#undef WELCH
80cabdff1aSopenharmony_ci}
81cabdff1aSopenharmony_ci
82cabdff1aSopenharmony_cistatic void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
83cabdff1aSopenharmony_ci                                      double *autoc)
84cabdff1aSopenharmony_ci{
85cabdff1aSopenharmony_ci    int j;
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci    if((x86_reg)data & 15)
88cabdff1aSopenharmony_ci        data++;
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci    for(j=0; j<lag; j+=2){
91cabdff1aSopenharmony_ci        x86_reg i = -len*sizeof(double);
92cabdff1aSopenharmony_ci        if(j == lag-2) {
93cabdff1aSopenharmony_ci            __asm__ volatile(
94cabdff1aSopenharmony_ci                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
95cabdff1aSopenharmony_ci                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
96cabdff1aSopenharmony_ci                "movsd    "MANGLE(pd_1)", %%xmm2    \n\t"
97cabdff1aSopenharmony_ci                "1:                                 \n\t"
98cabdff1aSopenharmony_ci                "movapd   (%2,%0), %%xmm3           \n\t"
99cabdff1aSopenharmony_ci                "movupd -8(%3,%0), %%xmm4           \n\t"
100cabdff1aSopenharmony_ci                "movapd   (%3,%0), %%xmm5           \n\t"
101cabdff1aSopenharmony_ci                "mulpd     %%xmm3, %%xmm4           \n\t"
102cabdff1aSopenharmony_ci                "mulpd     %%xmm3, %%xmm5           \n\t"
103cabdff1aSopenharmony_ci                "mulpd -16(%3,%0), %%xmm3           \n\t"
104cabdff1aSopenharmony_ci                "addpd     %%xmm4, %%xmm1           \n\t"
105cabdff1aSopenharmony_ci                "addpd     %%xmm5, %%xmm0           \n\t"
106cabdff1aSopenharmony_ci                "addpd     %%xmm3, %%xmm2           \n\t"
107cabdff1aSopenharmony_ci                "add       $16,    %0               \n\t"
108cabdff1aSopenharmony_ci                "jl 1b                              \n\t"
109cabdff1aSopenharmony_ci                "movhlps   %%xmm0, %%xmm3           \n\t"
110cabdff1aSopenharmony_ci                "movhlps   %%xmm1, %%xmm4           \n\t"
111cabdff1aSopenharmony_ci                "movhlps   %%xmm2, %%xmm5           \n\t"
112cabdff1aSopenharmony_ci                "addsd     %%xmm3, %%xmm0           \n\t"
113cabdff1aSopenharmony_ci                "addsd     %%xmm4, %%xmm1           \n\t"
114cabdff1aSopenharmony_ci                "addsd     %%xmm5, %%xmm2           \n\t"
115cabdff1aSopenharmony_ci                "movsd     %%xmm0,   (%1)           \n\t"
116cabdff1aSopenharmony_ci                "movsd     %%xmm1,  8(%1)           \n\t"
117cabdff1aSopenharmony_ci                "movsd     %%xmm2, 16(%1)           \n\t"
118cabdff1aSopenharmony_ci                :"+&r"(i)
119cabdff1aSopenharmony_ci                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
120cabdff1aSopenharmony_ci                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
121cabdff1aSopenharmony_ci                :"memory"
122cabdff1aSopenharmony_ci            );
123cabdff1aSopenharmony_ci        } else {
124cabdff1aSopenharmony_ci            __asm__ volatile(
125cabdff1aSopenharmony_ci                "movsd    "MANGLE(pd_1)", %%xmm0    \n\t"
126cabdff1aSopenharmony_ci                "movsd    "MANGLE(pd_1)", %%xmm1    \n\t"
127cabdff1aSopenharmony_ci                "1:                                 \n\t"
128cabdff1aSopenharmony_ci                "movapd   (%3,%0), %%xmm3           \n\t"
129cabdff1aSopenharmony_ci                "movupd -8(%4,%0), %%xmm4           \n\t"
130cabdff1aSopenharmony_ci                "mulpd     %%xmm3, %%xmm4           \n\t"
131cabdff1aSopenharmony_ci                "mulpd    (%4,%0), %%xmm3           \n\t"
132cabdff1aSopenharmony_ci                "addpd     %%xmm4, %%xmm1           \n\t"
133cabdff1aSopenharmony_ci                "addpd     %%xmm3, %%xmm0           \n\t"
134cabdff1aSopenharmony_ci                "add       $16,    %0               \n\t"
135cabdff1aSopenharmony_ci                "jl 1b                              \n\t"
136cabdff1aSopenharmony_ci                "movhlps   %%xmm0, %%xmm3           \n\t"
137cabdff1aSopenharmony_ci                "movhlps   %%xmm1, %%xmm4           \n\t"
138cabdff1aSopenharmony_ci                "addsd     %%xmm3, %%xmm0           \n\t"
139cabdff1aSopenharmony_ci                "addsd     %%xmm4, %%xmm1           \n\t"
140cabdff1aSopenharmony_ci                "movsd     %%xmm0, %1               \n\t"
141cabdff1aSopenharmony_ci                "movsd     %%xmm1, %2               \n\t"
142cabdff1aSopenharmony_ci                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
143cabdff1aSopenharmony_ci                :"r"(data+len), "r"(data+len-j)
144cabdff1aSopenharmony_ci                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
145cabdff1aSopenharmony_ci            );
146cabdff1aSopenharmony_ci        }
147cabdff1aSopenharmony_ci    }
148cabdff1aSopenharmony_ci}
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci#endif /* HAVE_SSE2_INLINE */
151cabdff1aSopenharmony_ci
152cabdff1aSopenharmony_ciav_cold void ff_lpc_init_x86(LPCContext *c)
153cabdff1aSopenharmony_ci{
154cabdff1aSopenharmony_ci#if HAVE_SSE2_INLINE
155cabdff1aSopenharmony_ci    int cpu_flags = av_get_cpu_flags();
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_ci    if (INLINE_SSE2_SLOW(cpu_flags)) {
158cabdff1aSopenharmony_ci        c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
159cabdff1aSopenharmony_ci        c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
160cabdff1aSopenharmony_ci    }
161cabdff1aSopenharmony_ci#endif /* HAVE_SSE2_INLINE */
162cabdff1aSopenharmony_ci}
163