1159b3361Sopenharmony_ci/* 2159b3361Sopenharmony_ci * MP3 quantization, intrinsics functions 3159b3361Sopenharmony_ci * 4159b3361Sopenharmony_ci * Copyright (c) 2005-2006 Gabriel Bouvigne 5159b3361Sopenharmony_ci * 6159b3361Sopenharmony_ci * This library is free software; you can redistribute it and/or 7159b3361Sopenharmony_ci * modify it under the terms of the GNU Library General Public 8159b3361Sopenharmony_ci * License as published by the Free Software Foundation; either 9159b3361Sopenharmony_ci * version 2 of the License, or (at your option) any later version. 10159b3361Sopenharmony_ci * 11159b3361Sopenharmony_ci * This library is distributed in the hope that it will be useful, 12159b3361Sopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13159b3361Sopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14159b3361Sopenharmony_ci * Library General Public License for more details. 15159b3361Sopenharmony_ci * 16159b3361Sopenharmony_ci * You should have received a copy of the GNU Library General Public 17159b3361Sopenharmony_ci * License along with this library; if not, write to the 18159b3361Sopenharmony_ci * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19159b3361Sopenharmony_ci * Boston, MA 02111-1307, USA. 20159b3361Sopenharmony_ci */ 21159b3361Sopenharmony_ci 22159b3361Sopenharmony_ci 23159b3361Sopenharmony_ci#ifdef HAVE_CONFIG_H 24159b3361Sopenharmony_ci# include <config.h> 25159b3361Sopenharmony_ci#endif 26159b3361Sopenharmony_ci 27159b3361Sopenharmony_ci#include "lame.h" 28159b3361Sopenharmony_ci#include "machine.h" 29159b3361Sopenharmony_ci#include "encoder.h" 30159b3361Sopenharmony_ci#include "util.h" 31159b3361Sopenharmony_ci#include "lame_intrin.h" 32159b3361Sopenharmony_ci 33159b3361Sopenharmony_ci 34159b3361Sopenharmony_ci 35159b3361Sopenharmony_ci#ifdef HAVE_XMMINTRIN_H 36159b3361Sopenharmony_ci 37159b3361Sopenharmony_ci#include <xmmintrin.h> 38159b3361Sopenharmony_ci 39159b3361Sopenharmony_citypedef union { 40159b3361Sopenharmony_ci int32_t _i_32[4]; /* unions are initialized by its first member */ 41159b3361Sopenharmony_ci float _float[4]; 42159b3361Sopenharmony_ci __m128 _m128; 43159b3361Sopenharmony_ci} vecfloat_union; 44159b3361Sopenharmony_ci 45159b3361Sopenharmony_ci#define TRI_SIZE (5-1) /* 1024 = 4**5 */ 46159b3361Sopenharmony_cistatic const FLOAT costab[TRI_SIZE * 2] = { 47159b3361Sopenharmony_ci 9.238795325112867e-01, 3.826834323650898e-01, 48159b3361Sopenharmony_ci 9.951847266721969e-01, 9.801714032956060e-02, 49159b3361Sopenharmony_ci 9.996988186962042e-01, 2.454122852291229e-02, 50159b3361Sopenharmony_ci 9.999811752826011e-01, 6.135884649154475e-03 51159b3361Sopenharmony_ci}; 52159b3361Sopenharmony_ci 53159b3361Sopenharmony_ci 54159b3361Sopenharmony_ci/* make sure functions with SSE instructions maintain their own properly aligned stack */ 55159b3361Sopenharmony_ci#if defined (__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2))) 56159b3361Sopenharmony_ci#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) 57159b3361Sopenharmony_ci#else 58159b3361Sopenharmony_ci#define SSE_FUNCTION 59159b3361Sopenharmony_ci#endif 60159b3361Sopenharmony_ci 61159b3361Sopenharmony_ci 62159b3361Sopenharmony_ciSSE_FUNCTION void 63159b3361Sopenharmony_ciinit_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int max_nz, FLOAT * sum) 64159b3361Sopenharmony_ci{ 65159b3361Sopenharmony_ci int i; 66159b3361Sopenharmony_ci float tmp_max = 0; 67159b3361Sopenharmony_ci float tmp_sum = 0; 68159b3361Sopenharmony_ci int upper = max_nz + 1; 69159b3361Sopenharmony_ci int upper4 = (upper / 4) * 4; 70159b3361Sopenharmony_ci int rest = upper-upper4; 71159b3361Sopenharmony_ci 72159b3361Sopenharmony_ci const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }}; 73159b3361Sopenharmony_ci const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]); 74159b3361Sopenharmony_ci vecfloat_union vec_xrpow_max; 75159b3361Sopenharmony_ci vecfloat_union vec_sum; 76159b3361Sopenharmony_ci vecfloat_union vec_tmp; 77159b3361Sopenharmony_ci 78159b3361Sopenharmony_ci _mm_prefetch((char const *) cod_info->xr, _MM_HINT_T0); 79159b3361Sopenharmony_ci _mm_prefetch((char const *) xrpow, _MM_HINT_T0); 80159b3361Sopenharmony_ci 81159b3361Sopenharmony_ci vec_xrpow_max._m128 = _mm_set_ps1(0); 82159b3361Sopenharmony_ci vec_sum._m128 = _mm_set_ps1(0); 83159b3361Sopenharmony_ci 84159b3361Sopenharmony_ci for (i = 0; i < upper4; i += 4) { 85159b3361Sopenharmony_ci vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i])); /* load */ 86159b3361Sopenharmony_ci vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */ 87159b3361Sopenharmony_ci vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128); 88159b3361Sopenharmony_ci vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128))); 89159b3361Sopenharmony_ci vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */ 90159b3361Sopenharmony_ci _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128); /* store into xrpow[] */ 91159b3361Sopenharmony_ci } 92159b3361Sopenharmony_ci vec_tmp._m128 = _mm_set_ps1(0); 93159b3361Sopenharmony_ci switch (rest) { 94159b3361Sopenharmony_ci case 3: vec_tmp._float[2] = cod_info->xr[upper4+2]; 95159b3361Sopenharmony_ci case 2: vec_tmp._float[1] = cod_info->xr[upper4+1]; 96159b3361Sopenharmony_ci case 1: vec_tmp._float[0] = cod_info->xr[upper4+0]; 97159b3361Sopenharmony_ci vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */ 98159b3361Sopenharmony_ci vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128); 99159b3361Sopenharmony_ci vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128))); 100159b3361Sopenharmony_ci vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */ 101159b3361Sopenharmony_ci switch (rest) { 102159b3361Sopenharmony_ci case 3: xrpow[upper4+2] = vec_tmp._float[2]; 103159b3361Sopenharmony_ci case 2: xrpow[upper4+1] = vec_tmp._float[1]; 104159b3361Sopenharmony_ci case 1: xrpow[upper4+0] = vec_tmp._float[0]; 105159b3361Sopenharmony_ci default: 106159b3361Sopenharmony_ci break; 107159b3361Sopenharmony_ci } 108159b3361Sopenharmony_ci default: 109159b3361Sopenharmony_ci break; 110159b3361Sopenharmony_ci } 111159b3361Sopenharmony_ci tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3]; 112159b3361Sopenharmony_ci { 113159b3361Sopenharmony_ci float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1] 114159b3361Sopenharmony_ci ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1]; 115159b3361Sopenharmony_ci float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3] 116159b3361Sopenharmony_ci ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3]; 117159b3361Sopenharmony_ci tmp_max = ma > mb ? ma : mb; 118159b3361Sopenharmony_ci } 119159b3361Sopenharmony_ci cod_info->xrpow_max = tmp_max; 120159b3361Sopenharmony_ci *sum = tmp_sum; 121159b3361Sopenharmony_ci} 122159b3361Sopenharmony_ci 123159b3361Sopenharmony_ci 124159b3361Sopenharmony_ciSSE_FUNCTION static void 125159b3361Sopenharmony_cistore4(__m128 v, float* f0, float* f1, float* f2, float* f3) 126159b3361Sopenharmony_ci{ 127159b3361Sopenharmony_ci vecfloat_union r; 128159b3361Sopenharmony_ci r._m128 = v; 129159b3361Sopenharmony_ci *f0 = r._float[0]; 130159b3361Sopenharmony_ci *f1 = r._float[1]; 131159b3361Sopenharmony_ci *f2 = r._float[2]; 132159b3361Sopenharmony_ci *f3 = r._float[3]; 133159b3361Sopenharmony_ci} 134159b3361Sopenharmony_ci 135159b3361Sopenharmony_ci 136159b3361Sopenharmony_ciSSE_FUNCTION void 137159b3361Sopenharmony_cifht_SSE2(FLOAT * fz, int n) 138159b3361Sopenharmony_ci{ 139159b3361Sopenharmony_ci const FLOAT *tri = costab; 140159b3361Sopenharmony_ci int k4; 141159b3361Sopenharmony_ci FLOAT *fi, *gi; 142159b3361Sopenharmony_ci FLOAT const *fn; 143159b3361Sopenharmony_ci 144159b3361Sopenharmony_ci n <<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */ 145159b3361Sopenharmony_ci fn = fz + n; 146159b3361Sopenharmony_ci k4 = 4; 147159b3361Sopenharmony_ci do { 148159b3361Sopenharmony_ci FLOAT s1, c1; 149159b3361Sopenharmony_ci int i, k1, k2, k3, kx; 150159b3361Sopenharmony_ci kx = k4 >> 1; 151159b3361Sopenharmony_ci k1 = k4; 152159b3361Sopenharmony_ci k2 = k4 << 1; 153159b3361Sopenharmony_ci k3 = k2 + k1; 154159b3361Sopenharmony_ci k4 = k2 << 1; 155159b3361Sopenharmony_ci fi = fz; 156159b3361Sopenharmony_ci gi = fi + kx; 157159b3361Sopenharmony_ci do { 158159b3361Sopenharmony_ci FLOAT f0, f1, f2, f3; 159159b3361Sopenharmony_ci f1 = fi[0] - fi[k1]; 160159b3361Sopenharmony_ci f0 = fi[0] + fi[k1]; 161159b3361Sopenharmony_ci f3 = fi[k2] - fi[k3]; 162159b3361Sopenharmony_ci f2 = fi[k2] + fi[k3]; 163159b3361Sopenharmony_ci fi[k2] = f0 - f2; 164159b3361Sopenharmony_ci fi[0] = f0 + f2; 165159b3361Sopenharmony_ci fi[k3] = f1 - f3; 166159b3361Sopenharmony_ci fi[k1] = f1 + f3; 167159b3361Sopenharmony_ci f1 = gi[0] - gi[k1]; 168159b3361Sopenharmony_ci f0 = gi[0] + gi[k1]; 169159b3361Sopenharmony_ci f3 = SQRT2 * gi[k3]; 170159b3361Sopenharmony_ci f2 = SQRT2 * gi[k2]; 171159b3361Sopenharmony_ci gi[k2] = f0 - f2; 172159b3361Sopenharmony_ci gi[0] = f0 + f2; 173159b3361Sopenharmony_ci gi[k3] = f1 - f3; 174159b3361Sopenharmony_ci gi[k1] = f1 + f3; 175159b3361Sopenharmony_ci gi += k4; 176159b3361Sopenharmony_ci fi += k4; 177159b3361Sopenharmony_ci } while (fi < fn); 178159b3361Sopenharmony_ci c1 = tri[0]; 179159b3361Sopenharmony_ci s1 = tri[1]; 180159b3361Sopenharmony_ci for (i = 1; i < kx; i++) { 181159b3361Sopenharmony_ci __m128 v_s2; 182159b3361Sopenharmony_ci __m128 v_c2; 183159b3361Sopenharmony_ci __m128 v_c1; 184159b3361Sopenharmony_ci __m128 v_s1; 185159b3361Sopenharmony_ci FLOAT c2, s2, s1_2 = s1+s1; 186159b3361Sopenharmony_ci c2 = 1 - s1_2 * s1; 187159b3361Sopenharmony_ci s2 = s1_2 * c1; 188159b3361Sopenharmony_ci fi = fz + i; 189159b3361Sopenharmony_ci gi = fz + k1 - i; 190159b3361Sopenharmony_ci v_c1 = _mm_set_ps1(c1); 191159b3361Sopenharmony_ci v_s1 = _mm_set_ps1(s1); 192159b3361Sopenharmony_ci v_c2 = _mm_set_ps1(c2); 193159b3361Sopenharmony_ci v_s2 = _mm_set_ps1(s2); 194159b3361Sopenharmony_ci { 195159b3361Sopenharmony_ci static const vecfloat_union sign_mask = {{0x80000000,0,0,0}}; 196159b3361Sopenharmony_ci v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */ 197159b3361Sopenharmony_ci } 198159b3361Sopenharmony_ci { 199159b3361Sopenharmony_ci static const vecfloat_union sign_mask = {{0,0x80000000,0,0}}; 200159b3361Sopenharmony_ci v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */ 201159b3361Sopenharmony_ci } 202159b3361Sopenharmony_ci { 203159b3361Sopenharmony_ci static const vecfloat_union sign_mask = {{0,0,0x80000000,0x80000000}}; 204159b3361Sopenharmony_ci v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */ 205159b3361Sopenharmony_ci } 206159b3361Sopenharmony_ci do { 207159b3361Sopenharmony_ci __m128 p, q, r; 208159b3361Sopenharmony_ci 209159b3361Sopenharmony_ci q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/ 210159b3361Sopenharmony_ci p = _mm_mul_ps(v_s2, q); /* P := s2 * Q */ 211159b3361Sopenharmony_ci q = _mm_mul_ps(v_c2, q); /* Q := c2 * Q */ 212159b3361Sopenharmony_ci q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1,0,3,2)); /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */ 213159b3361Sopenharmony_ci p = _mm_add_ps(p, q); 214159b3361Sopenharmony_ci 215159b3361Sopenharmony_ci r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]); /* R := {gi_0,gi_k2,fi_0,fi_k2} */ 216159b3361Sopenharmony_ci q = _mm_sub_ps(r, p); /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */ 217159b3361Sopenharmony_ci r = _mm_add_ps(r, p); /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */ 218159b3361Sopenharmony_ci p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2,0,2,0)); /* P := {q0,q2,r0,r2} */ 219159b3361Sopenharmony_ci p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3,1,2,0)); /* P := {q0,r0,q2,r2} */ 220159b3361Sopenharmony_ci q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3,1,3,1)); /* Q := {q1,q3,r1,r3} */ 221159b3361Sopenharmony_ci r = _mm_mul_ps(v_c1, q); 222159b3361Sopenharmony_ci q = _mm_mul_ps(v_s1, q); 223159b3361Sopenharmony_ci q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0,1,2,3)); /* Q := {q3,q2,q1,q0} */ 224159b3361Sopenharmony_ci q = _mm_add_ps(q, r); 225159b3361Sopenharmony_ci 226159b3361Sopenharmony_ci store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]); 227159b3361Sopenharmony_ci store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]); 228159b3361Sopenharmony_ci 229159b3361Sopenharmony_ci gi += k4; 230159b3361Sopenharmony_ci fi += k4; 231159b3361Sopenharmony_ci } while (fi < fn); 232159b3361Sopenharmony_ci c2 = c1; 233159b3361Sopenharmony_ci c1 = c2 * tri[0] - s1 * tri[1]; 234159b3361Sopenharmony_ci s1 = c2 * tri[1] + s1 * tri[0]; 235159b3361Sopenharmony_ci } 236159b3361Sopenharmony_ci tri += 2; 237159b3361Sopenharmony_ci } while (k4 < n); 238159b3361Sopenharmony_ci} 239159b3361Sopenharmony_ci 240159b3361Sopenharmony_ci#endif /* HAVE_XMMINTRIN_H */ 241159b3361Sopenharmony_ci 242