1159b3361Sopenharmony_ci/*
2159b3361Sopenharmony_ci * MP3 quantization, intrinsics functions
3159b3361Sopenharmony_ci *
4159b3361Sopenharmony_ci *      Copyright (c) 2005-2006 Gabriel Bouvigne
5159b3361Sopenharmony_ci *
6159b3361Sopenharmony_ci * This library is free software; you can redistribute it and/or
7159b3361Sopenharmony_ci * modify it under the terms of the GNU Library General Public
8159b3361Sopenharmony_ci * License as published by the Free Software Foundation; either
9159b3361Sopenharmony_ci * version 2 of the License, or (at your option) any later version.
10159b3361Sopenharmony_ci *
11159b3361Sopenharmony_ci * This library is distributed in the hope that it will be useful,
12159b3361Sopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13159b3361Sopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.     See the GNU
14159b3361Sopenharmony_ci * Library General Public License for more details.
15159b3361Sopenharmony_ci *
16159b3361Sopenharmony_ci * You should have received a copy of the GNU Library General Public
17159b3361Sopenharmony_ci * License along with this library; if not, write to the
18159b3361Sopenharmony_ci * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19159b3361Sopenharmony_ci * Boston, MA 02111-1307, USA.
20159b3361Sopenharmony_ci */
21159b3361Sopenharmony_ci
22159b3361Sopenharmony_ci
23159b3361Sopenharmony_ci#ifdef HAVE_CONFIG_H
24159b3361Sopenharmony_ci# include <config.h>
25159b3361Sopenharmony_ci#endif
26159b3361Sopenharmony_ci
27159b3361Sopenharmony_ci#include "lame.h"
28159b3361Sopenharmony_ci#include "machine.h"
29159b3361Sopenharmony_ci#include "encoder.h"
30159b3361Sopenharmony_ci#include "util.h"
31159b3361Sopenharmony_ci#include "lame_intrin.h"
32159b3361Sopenharmony_ci
33159b3361Sopenharmony_ci
34159b3361Sopenharmony_ci
35159b3361Sopenharmony_ci#ifdef HAVE_XMMINTRIN_H
36159b3361Sopenharmony_ci
37159b3361Sopenharmony_ci#include <xmmintrin.h>
38159b3361Sopenharmony_ci
/*
 * Overlay of one 128-bit SSE value with four 32-bit integers and with four
 * floats.  It lets the code build a vector from raw bit patterns (sign and
 * absolute-value masks) and read or write individual float lanes.
 */
typedef union {
    int32_t _i_32[4]; /* must stay first: a brace initializer fills the first member, so masks are written as integer bit patterns */
    float   _float[4];
    __m128  _m128;
} vecfloat_union;
44159b3361Sopenharmony_ci
/* Number of (cos, sin) pairs stored in costab. */
#define TRI_SIZE (5-1)  /* 1024 =  4**5 */
/*
 * Rotation factors read by fht_SSE2(), stored as consecutive (cos, sin)
 * pairs for the angles pi/8, pi/32, pi/128 and pi/512.  fht_SSE2() starts at
 * the first pair and advances by one pair after every outer pass.
 */
static const FLOAT costab[TRI_SIZE * 2] = {
    9.238795325112867e-01, 3.826834323650898e-01,
    9.951847266721969e-01, 9.801714032956060e-02,
    9.996988186962042e-01, 2.454122852291229e-02,
    9.999811752826011e-01, 6.135884649154475e-03
};
52159b3361Sopenharmony_ci
53159b3361Sopenharmony_ci
/*
 * Make sure functions with SSE instructions maintain their own properly
 * aligned stack.
 *
 * SSE_FUNCTION is placed in front of every function in this file that uses
 * __m128 values.  For GCC 4.2 and newer it expands to the
 * force_align_arg_pointer attribute; for every other compiler it expands to
 * nothing.
 */
#if defined (__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2)))
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif
60159b3361Sopenharmony_ci
61159b3361Sopenharmony_ci
62159b3361Sopenharmony_ciSSE_FUNCTION void
63159b3361Sopenharmony_ciinit_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int max_nz, FLOAT * sum)
64159b3361Sopenharmony_ci{
65159b3361Sopenharmony_ci    int     i;
66159b3361Sopenharmony_ci    float   tmp_max = 0;
67159b3361Sopenharmony_ci    float   tmp_sum = 0;
68159b3361Sopenharmony_ci    int     upper = max_nz + 1;
69159b3361Sopenharmony_ci    int     upper4 = (upper / 4) * 4;
70159b3361Sopenharmony_ci    int     rest = upper-upper4;
71159b3361Sopenharmony_ci
72159b3361Sopenharmony_ci    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
73159b3361Sopenharmony_ci    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
74159b3361Sopenharmony_ci    vecfloat_union vec_xrpow_max;
75159b3361Sopenharmony_ci    vecfloat_union vec_sum;
76159b3361Sopenharmony_ci    vecfloat_union vec_tmp;
77159b3361Sopenharmony_ci
78159b3361Sopenharmony_ci    _mm_prefetch((char const *) cod_info->xr, _MM_HINT_T0);
79159b3361Sopenharmony_ci    _mm_prefetch((char const *) xrpow, _MM_HINT_T0);
80159b3361Sopenharmony_ci
81159b3361Sopenharmony_ci    vec_xrpow_max._m128 = _mm_set_ps1(0);
82159b3361Sopenharmony_ci    vec_sum._m128 = _mm_set_ps1(0);
83159b3361Sopenharmony_ci
84159b3361Sopenharmony_ci    for (i = 0; i < upper4; i += 4) {
85159b3361Sopenharmony_ci        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i])); /* load */
86159b3361Sopenharmony_ci        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
87159b3361Sopenharmony_ci        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
88159b3361Sopenharmony_ci        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
89159b3361Sopenharmony_ci        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
90159b3361Sopenharmony_ci        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128); /* store into xrpow[] */
91159b3361Sopenharmony_ci    }
92159b3361Sopenharmony_ci    vec_tmp._m128 = _mm_set_ps1(0);
93159b3361Sopenharmony_ci    switch (rest) {
94159b3361Sopenharmony_ci        case 3: vec_tmp._float[2] = cod_info->xr[upper4+2];
95159b3361Sopenharmony_ci        case 2: vec_tmp._float[1] = cod_info->xr[upper4+1];
96159b3361Sopenharmony_ci        case 1: vec_tmp._float[0] = cod_info->xr[upper4+0];
97159b3361Sopenharmony_ci            vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
98159b3361Sopenharmony_ci            vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
99159b3361Sopenharmony_ci            vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
100159b3361Sopenharmony_ci            vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
101159b3361Sopenharmony_ci            switch (rest) {
102159b3361Sopenharmony_ci                case 3: xrpow[upper4+2] = vec_tmp._float[2];
103159b3361Sopenharmony_ci                case 2: xrpow[upper4+1] = vec_tmp._float[1];
104159b3361Sopenharmony_ci                case 1: xrpow[upper4+0] = vec_tmp._float[0];
105159b3361Sopenharmony_ci                default:
106159b3361Sopenharmony_ci                    break;
107159b3361Sopenharmony_ci            }
108159b3361Sopenharmony_ci        default:
109159b3361Sopenharmony_ci            break;
110159b3361Sopenharmony_ci    }
111159b3361Sopenharmony_ci    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
112159b3361Sopenharmony_ci    {
113159b3361Sopenharmony_ci        float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
114159b3361Sopenharmony_ci                ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
115159b3361Sopenharmony_ci        float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
116159b3361Sopenharmony_ci                ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
117159b3361Sopenharmony_ci        tmp_max = ma > mb ? ma : mb;
118159b3361Sopenharmony_ci    }
119159b3361Sopenharmony_ci    cod_info->xrpow_max = tmp_max;
120159b3361Sopenharmony_ci    *sum = tmp_sum;
121159b3361Sopenharmony_ci}
122159b3361Sopenharmony_ci
123159b3361Sopenharmony_ci
124159b3361Sopenharmony_ciSSE_FUNCTION static void
125159b3361Sopenharmony_cistore4(__m128 v, float* f0, float* f1, float* f2, float* f3)
126159b3361Sopenharmony_ci{
127159b3361Sopenharmony_ci    vecfloat_union r;
128159b3361Sopenharmony_ci    r._m128 = v;
129159b3361Sopenharmony_ci    *f0 = r._float[0];
130159b3361Sopenharmony_ci    *f1 = r._float[1];
131159b3361Sopenharmony_ci    *f2 = r._float[2];
132159b3361Sopenharmony_ci    *f3 = r._float[3];
133159b3361Sopenharmony_ci}
134159b3361Sopenharmony_ci
135159b3361Sopenharmony_ci
/*
 * In-place transform of the array fz using SSE intrinsics.
 * NOTE(review): the name suggests the butterfly passes of a fast Hartley
 * transform - confirm against the scalar implementation.
 *
 * fz  data, overwritten with the result
 * n   half the number of FLOATs processed (it is doubled on entry)
 *
 * Every outer pass works on groups of k4 elements, multiplies k4 by four and
 * moves on to the next (cos, sin) pair of costab; passes repeat until k4
 * reaches the doubled n.
 * NOTE(review): costab holds four pairs, so the doubled n presumably must not
 * exceed 1024 - confirm against the callers.
 */
SSE_FUNCTION void
fht_SSE2(FLOAT * fz, int n)
{
    const FLOAT *tri = costab;  /* (cos, sin) pair used by the current pass */
    int     k4;
    FLOAT  *fi, *gi;
    FLOAT const *fn;            /* one past the last element processed */

    n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
    fn = fz + n;
    k4 = 4;
    do {
        FLOAT   s1, c1;
        int     i, k1, k2, k3, kx;
        /* k1, k2, k3 are one, two and three times the incoming k4; kx is half of it */
        kx = k4 >> 1;
        k1 = k4;
        k2 = k4 << 1;
        k3 = k2 + k1;
        k4 = k2 << 1;       /* group size of this pass (four times the incoming value) */
        fi = fz;
        gi = fi + kx;
        /* scalar butterflies for offsets 0 (fi) and kx (gi) of every group */
        do {
            FLOAT   f0, f1, f2, f3;
            f1 = fi[0] - fi[k1];
            f0 = fi[0] + fi[k1];
            f3 = fi[k2] - fi[k3];
            f2 = fi[k2] + fi[k3];
            fi[k2] = f0 - f2;
            fi[0] = f0 + f2;
            fi[k3] = f1 - f3;
            fi[k1] = f1 + f3;
            f1 = gi[0] - gi[k1];
            f0 = gi[0] + gi[k1];
            f3 = SQRT2 * gi[k3];
            f2 = SQRT2 * gi[k2];
            gi[k2] = f0 - f2;
            gi[0] = f0 + f2;
            gi[k3] = f1 - f3;
            gi[k1] = f1 + f3;
            gi += k4;
            fi += k4;
        } while (fi < fn);
        c1 = tri[0];
        s1 = tri[1];
        /* remaining offsets, handled as the pair i (fi) and k1 - i (gi) */
        for (i = 1; i < kx; i++) {
            __m128 v_s2;
            __m128 v_c2;
            __m128 v_c1;
            __m128 v_s1;
            FLOAT   c2, s2, s1_2 = s1+s1;
            /* double-angle values: c2 = 1 - 2*s1*s1, s2 = 2*s1*c1 */
            c2 = 1 - s1_2 * s1;
            s2 = s1_2 * c1;
            fi = fz + i;
            gi = fz + k1 - i;
            /* broadcast the four factors, then flip the sign of selected lanes */
            v_c1 = _mm_set_ps1(c1);
            v_s1 = _mm_set_ps1(s1);
            v_c2 = _mm_set_ps1(c2);
            v_s2 = _mm_set_ps1(s2);
            {
                static const vecfloat_union sign_mask = {{0x80000000,0,0,0}};
                v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0x80000000,0,0}};
                v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0,0x80000000,0x80000000}};
                v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */
            }
            /* one vectorised butterfly per group, touching four fi and four gi elements */
            do {
                __m128 p, q, r;

                q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/
                p = _mm_mul_ps(v_s2, q);                         /* P := s2 * Q */
                q = _mm_mul_ps(v_c2, q);                         /* Q := c2 * Q */
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1,0,3,2));  /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */
                p = _mm_add_ps(p, q);                            /* P := {s2*fi_k1-c2*gi_k1, s2*fi_k3-c2*gi_k3, s2*gi_k1+c2*fi_k1, s2*gi_k3+c2*fi_k3} */

                r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]);   /* R := {gi_0,gi_k2,fi_0,fi_k2} */
                q = _mm_sub_ps(r, p);                            /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */
                r = _mm_add_ps(r, p);                            /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */
                p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2,0,2,0));  /* P := {q0,q2,r0,r2} */
                p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3,1,2,0));  /* P := {q0,r0,q2,r2} */
                q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3,1,3,1));  /* Q := {q1,q3,r1,r3} */
                r = _mm_mul_ps(v_c1, q);
                q = _mm_mul_ps(v_s1, q);
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0,1,2,3));  /* Q := {q3,q2,q1,q0} */
                q = _mm_add_ps(q, r);

                /* write the difference and the sum back, one lane per destination */
                store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]);
                store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]);

                gi += k4;
                fi += k4;
            } while (fi < fn);
            /* advance (c1, s1) by the table angle using the angle-addition formulas */
            c2 = c1;
            c1 = c2 * tri[0] - s1 * tri[1];
            s1 = c2 * tri[1] + s1 * tri[0];
        }
        tri += 2;   /* next (cos, sin) pair for the next pass */
    } while (k4 < n);
}
239159b3361Sopenharmony_ci
240159b3361Sopenharmony_ci#endif	/* HAVE_XMMINTRIN_H */
241159b3361Sopenharmony_ci
242