153a5a1b3Sopenharmony_ci/*** 253a5a1b3Sopenharmony_ci This file is part of PulseAudio. 353a5a1b3Sopenharmony_ci 453a5a1b3Sopenharmony_ci Copyright 2004-2006 Lennart Poettering 553a5a1b3Sopenharmony_ci Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> 653a5a1b3Sopenharmony_ci 753a5a1b3Sopenharmony_ci PulseAudio is free software; you can redistribute it and/or modify 853a5a1b3Sopenharmony_ci it under the terms of the GNU Lesser General Public License as published 953a5a1b3Sopenharmony_ci by the Free Software Foundation; either version 2.1 of the License, 1053a5a1b3Sopenharmony_ci or (at your option) any later version. 1153a5a1b3Sopenharmony_ci 1253a5a1b3Sopenharmony_ci PulseAudio is distributed in the hope that it will be useful, but 1353a5a1b3Sopenharmony_ci WITHOUT ANY WARRANTY; without even the implied warranty of 1453a5a1b3Sopenharmony_ci MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1553a5a1b3Sopenharmony_ci General Public License for more details. 1653a5a1b3Sopenharmony_ci 1753a5a1b3Sopenharmony_ci You should have received a copy of the GNU Lesser General Public License 1853a5a1b3Sopenharmony_ci along with PulseAudio; if not, see <http://www.gnu.org/licenses/>. 1953a5a1b3Sopenharmony_ci***/ 2053a5a1b3Sopenharmony_ci 2153a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H 2253a5a1b3Sopenharmony_ci#include <config.h> 2353a5a1b3Sopenharmony_ci#endif 2453a5a1b3Sopenharmony_ci 2553a5a1b3Sopenharmony_ci#include <pulse/rtclock.h> 2653a5a1b3Sopenharmony_ci 2753a5a1b3Sopenharmony_ci#include <pulsecore/random.h> 2853a5a1b3Sopenharmony_ci#include <pulsecore/macro.h> 2953a5a1b3Sopenharmony_ci#include <pulsecore/endianmacros.h> 3053a5a1b3Sopenharmony_ci 3153a5a1b3Sopenharmony_ci#include "cpu-x86.h" 3253a5a1b3Sopenharmony_ci 3353a5a1b3Sopenharmony_ci#include "sample-util.h" 3453a5a1b3Sopenharmony_ci 3553a5a1b3Sopenharmony_ci#if (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) 3653a5a1b3Sopenharmony_ci/* in s: 2 int16_t samples 3753a5a1b3Sopenharmony_ci * in v: 2 int32_t volumes, fixed point 16:16 3853a5a1b3Sopenharmony_ci * out s: contains scaled and clamped int16_t samples. 3953a5a1b3Sopenharmony_ci * 4053a5a1b3Sopenharmony_ci * We calculate the high 32 bits of a 32x16 multiply which we then 4153a5a1b3Sopenharmony_ci * clamp to 16 bits. The calculation is: 4253a5a1b3Sopenharmony_ci * 4353a5a1b3Sopenharmony_ci * vl = (v & 0xffff) 4453a5a1b3Sopenharmony_ci * vh = (v >> 16) 4553a5a1b3Sopenharmony_ci * s = ((s * vl) >> 16) + (s * vh); 4653a5a1b3Sopenharmony_ci * 4753a5a1b3Sopenharmony_ci * For the first multiply we have to do a sign correction as we need to 4853a5a1b3Sopenharmony_ci * multiply a signed int with an unsigned int. Hacker's delight 8-3 gives a 4953a5a1b3Sopenharmony_ci * simple formula to correct the sign of the high word after the signed 5053a5a1b3Sopenharmony_ci * multiply. 5153a5a1b3Sopenharmony_ci */ 5253a5a1b3Sopenharmony_ci#define VOLUME_32x16(s,v) /* .. | vh | vl | */ \ 5353a5a1b3Sopenharmony_ci " pxor %%mm4, %%mm4 \n\t" /* .. | 0 | 0 | */ \ 5453a5a1b3Sopenharmony_ci " punpcklwd %%mm4, "#s" \n\t" /* .. | 0 | p0 | */ \ 5553a5a1b3Sopenharmony_ci " pcmpgtw "#v", %%mm4 \n\t" /* .. | 0 | s(vl) | */ \ 5653a5a1b3Sopenharmony_ci " pand "#s", %%mm4 \n\t" /* .. | 0 | (p0) | (vl >> 15) & p */ \ 5753a5a1b3Sopenharmony_ci " movq "#s", %%mm5 \n\t" \ 5853a5a1b3Sopenharmony_ci " pmulhw "#v", "#s" \n\t" /* .. | 0 | vl*p0 | */ \ 5953a5a1b3Sopenharmony_ci " paddw %%mm4, "#s" \n\t" /* .. | 0 | vl*p0 | + sign correct */ \ 6053a5a1b3Sopenharmony_ci " pslld $16, "#s" \n\t" /* .. | vl*p0 | 0 | */ \ 6153a5a1b3Sopenharmony_ci " psrld $16, "#v" \n\t" /* .. | 0 | vh | */ \ 6253a5a1b3Sopenharmony_ci " psrad $16, "#s" \n\t" /* .. | vl*p0 | sign extend */ \ 6353a5a1b3Sopenharmony_ci " pmaddwd %%mm5, "#v" \n\t" /* .. | p0 * vh | */ \ 6453a5a1b3Sopenharmony_ci " paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \ 6553a5a1b3Sopenharmony_ci " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ 6653a5a1b3Sopenharmony_ci 6753a5a1b3Sopenharmony_ci/* approximately advances %3 = (%3 + a) % b. This function requires that 6853a5a1b3Sopenharmony_ci * a <= b. */ 6953a5a1b3Sopenharmony_ci#define MOD_ADD(a,b) \ 7053a5a1b3Sopenharmony_ci " add "#a", %3 \n\t" \ 7153a5a1b3Sopenharmony_ci " mov %3, %4 \n\t" \ 7253a5a1b3Sopenharmony_ci " sub "#b", %4 \n\t" \ 7353a5a1b3Sopenharmony_ci " cmovae %4, %3 \n\t" 7453a5a1b3Sopenharmony_ci 7553a5a1b3Sopenharmony_ci/* swap 16 bits */ 7653a5a1b3Sopenharmony_ci#define SWAP_16(s) \ 7753a5a1b3Sopenharmony_ci " movq "#s", %%mm4 \n\t" /* .. | h l | */ \ 7853a5a1b3Sopenharmony_ci " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \ 7953a5a1b3Sopenharmony_ci " psllw $8, "#s" \n\t" /* .. | l 0 | */ \ 8053a5a1b3Sopenharmony_ci " por %%mm4, "#s" \n\t" /* .. | l h | */ 8153a5a1b3Sopenharmony_ci 8253a5a1b3Sopenharmony_ci/* swap 2 registers 16 bits for better pairing */ 8353a5a1b3Sopenharmony_ci#define SWAP_16_2(s1,s2) \ 8453a5a1b3Sopenharmony_ci " movq "#s1", %%mm4 \n\t" /* .. | h l | */ \ 8553a5a1b3Sopenharmony_ci " movq "#s2", %%mm5 \n\t" \ 8653a5a1b3Sopenharmony_ci " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \ 8753a5a1b3Sopenharmony_ci " psrlw $8, %%mm5 \n\t" \ 8853a5a1b3Sopenharmony_ci " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \ 8953a5a1b3Sopenharmony_ci " psllw $8, "#s2" \n\t" \ 9053a5a1b3Sopenharmony_ci " por %%mm4, "#s1" \n\t" /* .. | l h | */ \ 9153a5a1b3Sopenharmony_ci " por %%mm5, "#s2" \n\t" 9253a5a1b3Sopenharmony_ci 9353a5a1b3Sopenharmony_cistatic void pa_volume_s16ne_mmx(int16_t *samples, const int32_t *volumes, unsigned channels, unsigned length) { 9453a5a1b3Sopenharmony_ci pa_reg_x86 channel, temp; 9553a5a1b3Sopenharmony_ci 9653a5a1b3Sopenharmony_ci /* Channels must be at least 4, and always a multiple of the original number. 9753a5a1b3Sopenharmony_ci * This is also the max amount we overread the volume array, which should 9853a5a1b3Sopenharmony_ci * have enough padding. */ 9953a5a1b3Sopenharmony_ci channels = channels == 3 ? 6 : PA_MAX (4U, channels); 10053a5a1b3Sopenharmony_ci 10153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 10253a5a1b3Sopenharmony_ci " xor %3, %3 \n\t" 10353a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ 10453a5a1b3Sopenharmony_ci 10553a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 10653a5a1b3Sopenharmony_ci " je 2f \n\t" 10753a5a1b3Sopenharmony_ci 10853a5a1b3Sopenharmony_ci " movd (%q1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ 10953a5a1b3Sopenharmony_ci " movw (%0), %w4 \n\t" /* .. | p0 | */ 11053a5a1b3Sopenharmony_ci " movd %4, %%mm1 \n\t" 11153a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 11253a5a1b3Sopenharmony_ci " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ 11353a5a1b3Sopenharmony_ci " movw %w4, (%0) \n\t" 11453a5a1b3Sopenharmony_ci " add $2, %0 \n\t" 11553a5a1b3Sopenharmony_ci MOD_ADD ($1, %5) 11653a5a1b3Sopenharmony_ci 11753a5a1b3Sopenharmony_ci "2: \n\t" 11853a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ 11953a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 12053a5a1b3Sopenharmony_ci " je 4f \n\t" 12153a5a1b3Sopenharmony_ci 12253a5a1b3Sopenharmony_ci "3: \n\t" /* do samples in groups of 2 */ 12353a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 12453a5a1b3Sopenharmony_ci " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ 12553a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 12653a5a1b3Sopenharmony_ci " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 12753a5a1b3Sopenharmony_ci " add $4, %0 \n\t" 12853a5a1b3Sopenharmony_ci MOD_ADD ($2, %5) 12953a5a1b3Sopenharmony_ci 13053a5a1b3Sopenharmony_ci "4: \n\t" 13153a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ 13253a5a1b3Sopenharmony_ci " cmp $0, %2 \n\t" 13353a5a1b3Sopenharmony_ci " je 6f \n\t" 13453a5a1b3Sopenharmony_ci 13553a5a1b3Sopenharmony_ci "5: \n\t" /* do samples in groups of 4 */ 13653a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 13753a5a1b3Sopenharmony_ci " movq 8(%q1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ 13853a5a1b3Sopenharmony_ci " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ 13953a5a1b3Sopenharmony_ci " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ 14053a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 14153a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm3, %%mm2) 14253a5a1b3Sopenharmony_ci " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 14353a5a1b3Sopenharmony_ci " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ 14453a5a1b3Sopenharmony_ci " add $8, %0 \n\t" 14553a5a1b3Sopenharmony_ci MOD_ADD ($4, %5) 14653a5a1b3Sopenharmony_ci " dec %2 \n\t" 14753a5a1b3Sopenharmony_ci " jne 5b \n\t" 14853a5a1b3Sopenharmony_ci 14953a5a1b3Sopenharmony_ci "6: \n\t" 15053a5a1b3Sopenharmony_ci " emms \n\t" 15153a5a1b3Sopenharmony_ci 15253a5a1b3Sopenharmony_ci : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp) 15353a5a1b3Sopenharmony_ci#if defined (__i386__) 15453a5a1b3Sopenharmony_ci : "m" (channels) 15553a5a1b3Sopenharmony_ci#else 15653a5a1b3Sopenharmony_ci : "r" ((pa_reg_x86)channels) 15753a5a1b3Sopenharmony_ci#endif 15853a5a1b3Sopenharmony_ci : "cc" 15953a5a1b3Sopenharmony_ci ); 16053a5a1b3Sopenharmony_ci} 16153a5a1b3Sopenharmony_ci 16253a5a1b3Sopenharmony_cistatic void pa_volume_s16re_mmx(int16_t *samples, const int32_t *volumes, unsigned channels, unsigned length) { 16353a5a1b3Sopenharmony_ci pa_reg_x86 channel, temp; 16453a5a1b3Sopenharmony_ci 16553a5a1b3Sopenharmony_ci /* Channels must be at least 4, and always a multiple of the original number. 16653a5a1b3Sopenharmony_ci * This is also the max amount we overread the volume array, which should 16753a5a1b3Sopenharmony_ci * have enough padding. */ 16853a5a1b3Sopenharmony_ci channels = channels == 3 ? 6 : PA_MAX (4U, channels); 16953a5a1b3Sopenharmony_ci 17053a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 17153a5a1b3Sopenharmony_ci " xor %3, %3 \n\t" 17253a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ 17353a5a1b3Sopenharmony_ci " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ 17453a5a1b3Sopenharmony_ci " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ 17553a5a1b3Sopenharmony_ci " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ 17653a5a1b3Sopenharmony_ci " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ 17753a5a1b3Sopenharmony_ci 17853a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 17953a5a1b3Sopenharmony_ci " je 2f \n\t" 18053a5a1b3Sopenharmony_ci 18153a5a1b3Sopenharmony_ci " movd (%q1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ 18253a5a1b3Sopenharmony_ci " movw (%0), %w4 \n\t" /* .. | p0 | */ 18353a5a1b3Sopenharmony_ci " rorw $8, %w4 \n\t" 18453a5a1b3Sopenharmony_ci " movd %4, %%mm1 \n\t" 18553a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 18653a5a1b3Sopenharmony_ci " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ 18753a5a1b3Sopenharmony_ci " rorw $8, %w4 \n\t" 18853a5a1b3Sopenharmony_ci " movw %w4, (%0) \n\t" 18953a5a1b3Sopenharmony_ci " add $2, %0 \n\t" 19053a5a1b3Sopenharmony_ci MOD_ADD ($1, %5) 19153a5a1b3Sopenharmony_ci 19253a5a1b3Sopenharmony_ci "2: \n\t" 19353a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ 19453a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 19553a5a1b3Sopenharmony_ci " je 4f \n\t" 19653a5a1b3Sopenharmony_ci 19753a5a1b3Sopenharmony_ci "3: \n\t" /* do samples in groups of 2 */ 19853a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 19953a5a1b3Sopenharmony_ci " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ 20053a5a1b3Sopenharmony_ci SWAP_16 (%%mm1) 20153a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 20253a5a1b3Sopenharmony_ci SWAP_16 (%%mm0) 20353a5a1b3Sopenharmony_ci " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 20453a5a1b3Sopenharmony_ci " add $4, %0 \n\t" 20553a5a1b3Sopenharmony_ci MOD_ADD ($2, %5) 20653a5a1b3Sopenharmony_ci 20753a5a1b3Sopenharmony_ci "4: \n\t" 20853a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ 20953a5a1b3Sopenharmony_ci " cmp $0, %2 \n\t" 21053a5a1b3Sopenharmony_ci " je 6f \n\t" 21153a5a1b3Sopenharmony_ci 21253a5a1b3Sopenharmony_ci "5: \n\t" /* do samples in groups of 4 */ 21353a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 21453a5a1b3Sopenharmony_ci " movq 8(%q1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ 21553a5a1b3Sopenharmony_ci " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ 21653a5a1b3Sopenharmony_ci " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ 21753a5a1b3Sopenharmony_ci SWAP_16_2 (%%mm1, %%mm3) 21853a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm1, %%mm0) 21953a5a1b3Sopenharmony_ci VOLUME_32x16 (%%mm3, %%mm2) 22053a5a1b3Sopenharmony_ci SWAP_16_2 (%%mm0, %%mm2) 22153a5a1b3Sopenharmony_ci " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 22253a5a1b3Sopenharmony_ci " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ 22353a5a1b3Sopenharmony_ci " add $8, %0 \n\t" 22453a5a1b3Sopenharmony_ci MOD_ADD ($4, %5) 22553a5a1b3Sopenharmony_ci " dec %2 \n\t" 22653a5a1b3Sopenharmony_ci " jne 5b \n\t" 22753a5a1b3Sopenharmony_ci 22853a5a1b3Sopenharmony_ci "6: \n\t" 22953a5a1b3Sopenharmony_ci " emms \n\t" 23053a5a1b3Sopenharmony_ci 23153a5a1b3Sopenharmony_ci : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp) 23253a5a1b3Sopenharmony_ci#if defined (__i386__) 23353a5a1b3Sopenharmony_ci : "m" (channels) 23453a5a1b3Sopenharmony_ci#else 23553a5a1b3Sopenharmony_ci : "r" ((pa_reg_x86)channels) 23653a5a1b3Sopenharmony_ci#endif 23753a5a1b3Sopenharmony_ci : "cc" 23853a5a1b3Sopenharmony_ci ); 23953a5a1b3Sopenharmony_ci} 24053a5a1b3Sopenharmony_ci 24153a5a1b3Sopenharmony_ci#endif /* (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) */ 24253a5a1b3Sopenharmony_ci 24353a5a1b3Sopenharmony_civoid pa_volume_func_init_mmx(pa_cpu_x86_flag_t flags) { 24453a5a1b3Sopenharmony_ci#if (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) 24553a5a1b3Sopenharmony_ci if ((flags & PA_CPU_X86_MMX) && (flags & PA_CPU_X86_CMOV)) { 24653a5a1b3Sopenharmony_ci pa_log_info("Initialising MMX optimized volume functions."); 24753a5a1b3Sopenharmony_ci 24853a5a1b3Sopenharmony_ci pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx); 24953a5a1b3Sopenharmony_ci pa_set_volume_func(PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx); 25053a5a1b3Sopenharmony_ci } 25153a5a1b3Sopenharmony_ci#endif /* (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) */ 25253a5a1b3Sopenharmony_ci} 253