153a5a1b3Sopenharmony_ci/*** 253a5a1b3Sopenharmony_ci This file is part of PulseAudio. 353a5a1b3Sopenharmony_ci 453a5a1b3Sopenharmony_ci Copyright 2004-2006 Lennart Poettering 553a5a1b3Sopenharmony_ci Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> 653a5a1b3Sopenharmony_ci 753a5a1b3Sopenharmony_ci PulseAudio is free software; you can redistribute it and/or modify 853a5a1b3Sopenharmony_ci it under the terms of the GNU Lesser General Public License as published 953a5a1b3Sopenharmony_ci by the Free Software Foundation; either version 2.1 of the License, 1053a5a1b3Sopenharmony_ci or (at your option) any later version. 1153a5a1b3Sopenharmony_ci 1253a5a1b3Sopenharmony_ci PulseAudio is distributed in the hope that it will be useful, but 1353a5a1b3Sopenharmony_ci WITHOUT ANY WARRANTY; without even the implied warranty of 1453a5a1b3Sopenharmony_ci MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1553a5a1b3Sopenharmony_ci General Public License for more details. 1653a5a1b3Sopenharmony_ci 1753a5a1b3Sopenharmony_ci You should have received a copy of the GNU Lesser General Public License 1853a5a1b3Sopenharmony_ci along with PulseAudio; if not, see <http://www.gnu.org/licenses/>. 1953a5a1b3Sopenharmony_ci***/ 2053a5a1b3Sopenharmony_ci 2153a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H 2253a5a1b3Sopenharmony_ci#include <config.h> 2353a5a1b3Sopenharmony_ci#endif 2453a5a1b3Sopenharmony_ci 2553a5a1b3Sopenharmony_ci#include <pulse/rtclock.h> 2653a5a1b3Sopenharmony_ci 2753a5a1b3Sopenharmony_ci#include <pulsecore/random.h> 2853a5a1b3Sopenharmony_ci#include <pulsecore/macro.h> 2953a5a1b3Sopenharmony_ci#include <pulsecore/endianmacros.h> 3053a5a1b3Sopenharmony_ci 3153a5a1b3Sopenharmony_ci#include "cpu-x86.h" 3253a5a1b3Sopenharmony_ci 3353a5a1b3Sopenharmony_ci#include "sample-util.h" 3453a5a1b3Sopenharmony_ci 3553a5a1b3Sopenharmony_ci#if (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) 3653a5a1b3Sopenharmony_ci 3753a5a1b3Sopenharmony_ci#define VOLUME_32x16(s,v) /* .. | vh | vl | */ \ 3853a5a1b3Sopenharmony_ci " pxor %%xmm4, %%xmm4 \n\t" /* .. | 0 | 0 | */ \ 3953a5a1b3Sopenharmony_ci " punpcklwd %%xmm4, "#s" \n\t" /* .. | 0 | p0 | */ \ 4053a5a1b3Sopenharmony_ci " pcmpgtw "#s", %%xmm4 \n\t" /* .. | 0 | s(p0) | */ \ 4153a5a1b3Sopenharmony_ci " pand "#v", %%xmm4 \n\t" /* .. | 0 | (vl) | */ \ 4253a5a1b3Sopenharmony_ci " movdqa "#s", %%xmm5 \n\t" \ 4353a5a1b3Sopenharmony_ci " pmulhuw "#v", "#s" \n\t" /* .. | 0 | vl*p0 | */ \ 4453a5a1b3Sopenharmony_ci " psubd %%xmm4, "#s" \n\t" /* .. | 0 | vl*p0 | + sign correct */ \ 4553a5a1b3Sopenharmony_ci " psrld $16, "#v" \n\t" /* .. | 0 | vh | */ \ 4653a5a1b3Sopenharmony_ci " pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \ 4753a5a1b3Sopenharmony_ci " paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \ 4853a5a1b3Sopenharmony_ci " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ 4953a5a1b3Sopenharmony_ci 5053a5a1b3Sopenharmony_ci#define MOD_ADD(a,b) \ 5153a5a1b3Sopenharmony_ci " add "#a", %3 \n\t" /* channel += inc */ \ 5253a5a1b3Sopenharmony_ci " mov %3, %4 \n\t" \ 5353a5a1b3Sopenharmony_ci " sub "#b", %4 \n\t" /* tmp = channel - channels */ \ 5453a5a1b3Sopenharmony_ci " cmovae %4, %3 \n\t" /* if (tmp >= 0) channel = tmp */ 5553a5a1b3Sopenharmony_ci 5653a5a1b3Sopenharmony_ci/* swap 16 bits */ 5753a5a1b3Sopenharmony_ci#define SWAP_16(s) \ 5853a5a1b3Sopenharmony_ci " movdqa "#s", %%xmm4 \n\t" /* .. | h l | */ \ 5953a5a1b3Sopenharmony_ci " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \ 6053a5a1b3Sopenharmony_ci " psllw $8, "#s" \n\t" /* .. | l 0 | */ \ 6153a5a1b3Sopenharmony_ci " por %%xmm4, "#s" \n\t" /* .. | l h | */ 6253a5a1b3Sopenharmony_ci 6353a5a1b3Sopenharmony_ci/* swap 2 registers 16 bits for better pairing */ 6453a5a1b3Sopenharmony_ci#define SWAP_16_2(s1,s2) \ 6553a5a1b3Sopenharmony_ci " movdqa "#s1", %%xmm4 \n\t" /* .. | h l | */ \ 6653a5a1b3Sopenharmony_ci " movdqa "#s2", %%xmm5 \n\t" \ 6753a5a1b3Sopenharmony_ci " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \ 6853a5a1b3Sopenharmony_ci " psrlw $8, %%xmm5 \n\t" \ 6953a5a1b3Sopenharmony_ci " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \ 7053a5a1b3Sopenharmony_ci " psllw $8, "#s2" \n\t" \ 7153a5a1b3Sopenharmony_ci " por %%xmm4, "#s1" \n\t" /* .. | l h | */ \ 7253a5a1b3Sopenharmony_ci " por %%xmm5, "#s2" \n\t" 7353a5a1b3Sopenharmony_ci 7453a5a1b3Sopenharmony_cistatic int channel_overread_table[8] = {8,8,8,12,8,10,12,14}; 7553a5a1b3Sopenharmony_ci 7653a5a1b3Sopenharmony_cistatic void pa_volume_s16ne_sse2(int16_t *samples, const int32_t *volumes, unsigned channels, unsigned length) { 7753a5a1b3Sopenharmony_ci pa_reg_x86 channel, temp; 7853a5a1b3Sopenharmony_ci 7953a5a1b3Sopenharmony_ci /* Channels must be at least 8 and always a multiple of the original number. 8053a5a1b3Sopenharmony_ci * This is also the max amount we overread the volume array, which should 8153a5a1b3Sopenharmony_ci * have enough padding. */ 8253a5a1b3Sopenharmony_ci if (channels < 8) 8353a5a1b3Sopenharmony_ci channels = channel_overread_table[channels]; 8453a5a1b3Sopenharmony_ci 8553a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 8653a5a1b3Sopenharmony_ci " xor %3, %3 \n\t" 8753a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ 8853a5a1b3Sopenharmony_ci 8953a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 9053a5a1b3Sopenharmony_ci " je 2f \n\t" 9153a5a1b3Sopenharmony_ci 9253a5a1b3Sopenharmony_ci " movd (%q1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ 9353a5a1b3Sopenharmony_ci " movw (%0), %w4 \n\t" /* .. | p0 | */ 9453a5a1b3Sopenharmony_ci " movd %4, %%xmm1 \n\t" 9553a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 9653a5a1b3Sopenharmony_ci " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ 9753a5a1b3Sopenharmony_ci " movw %w4, (%0) \n\t" 9853a5a1b3Sopenharmony_ci " add $2, %0 \n\t" 9953a5a1b3Sopenharmony_ci MOD_ADD ($1, %5) 10053a5a1b3Sopenharmony_ci 10153a5a1b3Sopenharmony_ci "2: \n\t" 10253a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ 10353a5a1b3Sopenharmony_ci " test $1, %2 \n\t" 10453a5a1b3Sopenharmony_ci " je 4f \n\t" 10553a5a1b3Sopenharmony_ci 10653a5a1b3Sopenharmony_ci "3: \n\t" /* do samples in groups of 2 */ 10753a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 10853a5a1b3Sopenharmony_ci " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ 10953a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 11053a5a1b3Sopenharmony_ci " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 11153a5a1b3Sopenharmony_ci " add $4, %0 \n\t" 11253a5a1b3Sopenharmony_ci MOD_ADD ($2, %5) 11353a5a1b3Sopenharmony_ci 11453a5a1b3Sopenharmony_ci "4: \n\t" 11553a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ 11653a5a1b3Sopenharmony_ci " test $1, %2 \n\t" 11753a5a1b3Sopenharmony_ci " je 6f \n\t" 11853a5a1b3Sopenharmony_ci 11953a5a1b3Sopenharmony_ci /* FIXME, we can do aligned access of the volume values if we can guarantee 12053a5a1b3Sopenharmony_ci * that the array is 16 bytes aligned, we probably have to do the odd values 12153a5a1b3Sopenharmony_ci * after this then. */ 12253a5a1b3Sopenharmony_ci "5: \n\t" /* do samples in groups of 4 */ 12353a5a1b3Sopenharmony_ci " movdqu (%q1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ 12453a5a1b3Sopenharmony_ci " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ 12553a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 12653a5a1b3Sopenharmony_ci " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ 12753a5a1b3Sopenharmony_ci " add $8, %0 \n\t" 12853a5a1b3Sopenharmony_ci MOD_ADD ($4, %5) 12953a5a1b3Sopenharmony_ci 13053a5a1b3Sopenharmony_ci "6: \n\t" 13153a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ 13253a5a1b3Sopenharmony_ci " cmp $0, %2 \n\t" 13353a5a1b3Sopenharmony_ci " je 8f \n\t" 13453a5a1b3Sopenharmony_ci 13553a5a1b3Sopenharmony_ci "7: \n\t" /* do samples in groups of 8 */ 13653a5a1b3Sopenharmony_ci " movdqu (%q1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ 13753a5a1b3Sopenharmony_ci " movdqu 16(%q1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ 13853a5a1b3Sopenharmony_ci " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ 13953a5a1b3Sopenharmony_ci " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ 14053a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 14153a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm3, %%xmm2) 14253a5a1b3Sopenharmony_ci " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ 14353a5a1b3Sopenharmony_ci " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ 14453a5a1b3Sopenharmony_ci " add $16, %0 \n\t" 14553a5a1b3Sopenharmony_ci MOD_ADD ($8, %5) 14653a5a1b3Sopenharmony_ci " dec %2 \n\t" 14753a5a1b3Sopenharmony_ci " jne 7b \n\t" 14853a5a1b3Sopenharmony_ci "8: \n\t" 14953a5a1b3Sopenharmony_ci 15053a5a1b3Sopenharmony_ci : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp) 15153a5a1b3Sopenharmony_ci#if defined (__i386__) 15253a5a1b3Sopenharmony_ci : "m" (channels) 15353a5a1b3Sopenharmony_ci#else 15453a5a1b3Sopenharmony_ci : "r" ((pa_reg_x86)channels) 15553a5a1b3Sopenharmony_ci#endif 15653a5a1b3Sopenharmony_ci : "cc" 15753a5a1b3Sopenharmony_ci ); 15853a5a1b3Sopenharmony_ci} 15953a5a1b3Sopenharmony_ci 16053a5a1b3Sopenharmony_cistatic void pa_volume_s16re_sse2(int16_t *samples, const int32_t *volumes, unsigned channels, unsigned length) { 16153a5a1b3Sopenharmony_ci pa_reg_x86 channel, temp; 16253a5a1b3Sopenharmony_ci 16353a5a1b3Sopenharmony_ci /* Channels must be at least 8 and always a multiple of the original number. 16453a5a1b3Sopenharmony_ci * This is also the max amount we overread the volume array, which should 16553a5a1b3Sopenharmony_ci * have enough padding. */ 16653a5a1b3Sopenharmony_ci if (channels < 8) 16753a5a1b3Sopenharmony_ci channels = channel_overread_table[channels]; 16853a5a1b3Sopenharmony_ci 16953a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 17053a5a1b3Sopenharmony_ci " xor %3, %3 \n\t" 17153a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ 17253a5a1b3Sopenharmony_ci 17353a5a1b3Sopenharmony_ci " test $1, %2 \n\t" /* check for odd samples */ 17453a5a1b3Sopenharmony_ci " je 2f \n\t" 17553a5a1b3Sopenharmony_ci 17653a5a1b3Sopenharmony_ci " movd (%q1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ 17753a5a1b3Sopenharmony_ci " movw (%0), %w4 \n\t" /* .. | p0 | */ 17853a5a1b3Sopenharmony_ci " rorw $8, %w4 \n\t" 17953a5a1b3Sopenharmony_ci " movd %4, %%xmm1 \n\t" 18053a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 18153a5a1b3Sopenharmony_ci " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ 18253a5a1b3Sopenharmony_ci " rorw $8, %w4 \n\t" 18353a5a1b3Sopenharmony_ci " movw %w4, (%0) \n\t" 18453a5a1b3Sopenharmony_ci " add $2, %0 \n\t" 18553a5a1b3Sopenharmony_ci MOD_ADD ($1, %5) 18653a5a1b3Sopenharmony_ci 18753a5a1b3Sopenharmony_ci "2: \n\t" 18853a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ 18953a5a1b3Sopenharmony_ci " test $1, %2 \n\t" 19053a5a1b3Sopenharmony_ci " je 4f \n\t" 19153a5a1b3Sopenharmony_ci 19253a5a1b3Sopenharmony_ci "3: \n\t" /* do samples in groups of 2 */ 19353a5a1b3Sopenharmony_ci " movq (%q1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ 19453a5a1b3Sopenharmony_ci " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ 19553a5a1b3Sopenharmony_ci SWAP_16 (%%xmm1) 19653a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 19753a5a1b3Sopenharmony_ci SWAP_16 (%%xmm0) 19853a5a1b3Sopenharmony_ci " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ 19953a5a1b3Sopenharmony_ci " add $4, %0 \n\t" 20053a5a1b3Sopenharmony_ci MOD_ADD ($2, %5) 20153a5a1b3Sopenharmony_ci 20253a5a1b3Sopenharmony_ci "4: \n\t" 20353a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ 20453a5a1b3Sopenharmony_ci " test $1, %2 \n\t" 20553a5a1b3Sopenharmony_ci " je 6f \n\t" 20653a5a1b3Sopenharmony_ci 20753a5a1b3Sopenharmony_ci /* FIXME, we can do aligned access of the volume values if we can guarantee 20853a5a1b3Sopenharmony_ci * that the array is 16 bytes aligned, we probably have to do the odd values 20953a5a1b3Sopenharmony_ci * after this then. */ 21053a5a1b3Sopenharmony_ci "5: \n\t" /* do samples in groups of 4 */ 21153a5a1b3Sopenharmony_ci " movdqu (%q1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ 21253a5a1b3Sopenharmony_ci " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ 21353a5a1b3Sopenharmony_ci SWAP_16 (%%xmm1) 21453a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 21553a5a1b3Sopenharmony_ci SWAP_16 (%%xmm0) 21653a5a1b3Sopenharmony_ci " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ 21753a5a1b3Sopenharmony_ci " add $8, %0 \n\t" 21853a5a1b3Sopenharmony_ci MOD_ADD ($4, %5) 21953a5a1b3Sopenharmony_ci 22053a5a1b3Sopenharmony_ci "6: \n\t" 22153a5a1b3Sopenharmony_ci " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ 22253a5a1b3Sopenharmony_ci " cmp $0, %2 \n\t" 22353a5a1b3Sopenharmony_ci " je 8f \n\t" 22453a5a1b3Sopenharmony_ci 22553a5a1b3Sopenharmony_ci "7: \n\t" /* do samples in groups of 8 */ 22653a5a1b3Sopenharmony_ci " movdqu (%q1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ 22753a5a1b3Sopenharmony_ci " movdqu 16(%q1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ 22853a5a1b3Sopenharmony_ci " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ 22953a5a1b3Sopenharmony_ci " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ 23053a5a1b3Sopenharmony_ci SWAP_16_2 (%%xmm1, %%xmm3) 23153a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm1, %%xmm0) 23253a5a1b3Sopenharmony_ci VOLUME_32x16 (%%xmm3, %%xmm2) 23353a5a1b3Sopenharmony_ci SWAP_16_2 (%%xmm0, %%xmm2) 23453a5a1b3Sopenharmony_ci " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ 23553a5a1b3Sopenharmony_ci " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ 23653a5a1b3Sopenharmony_ci " add $16, %0 \n\t" 23753a5a1b3Sopenharmony_ci MOD_ADD ($8, %5) 23853a5a1b3Sopenharmony_ci " dec %2 \n\t" 23953a5a1b3Sopenharmony_ci " jne 7b \n\t" 24053a5a1b3Sopenharmony_ci "8: \n\t" 24153a5a1b3Sopenharmony_ci 24253a5a1b3Sopenharmony_ci : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp) 24353a5a1b3Sopenharmony_ci#if defined (__i386__) 24453a5a1b3Sopenharmony_ci : "m" (channels) 24553a5a1b3Sopenharmony_ci#else 24653a5a1b3Sopenharmony_ci : "r" ((pa_reg_x86)channels) 24753a5a1b3Sopenharmony_ci#endif 24853a5a1b3Sopenharmony_ci : "cc" 24953a5a1b3Sopenharmony_ci ); 25053a5a1b3Sopenharmony_ci} 25153a5a1b3Sopenharmony_ci 25253a5a1b3Sopenharmony_ci#endif /* (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) */ 25353a5a1b3Sopenharmony_ci 25453a5a1b3Sopenharmony_civoid pa_volume_func_init_sse(pa_cpu_x86_flag_t flags) { 25553a5a1b3Sopenharmony_ci#if (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) 25653a5a1b3Sopenharmony_ci if (flags & PA_CPU_X86_SSE2) { 25753a5a1b3Sopenharmony_ci pa_log_info("Initialising SSE2 optimized volume functions."); 25853a5a1b3Sopenharmony_ci 25953a5a1b3Sopenharmony_ci pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse2); 26053a5a1b3Sopenharmony_ci pa_set_volume_func(PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse2); 26153a5a1b3Sopenharmony_ci } 26253a5a1b3Sopenharmony_ci#endif /* (!defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__FreeBSD_kernel__) && defined (__i386__)) || defined (__amd64__) */ 26353a5a1b3Sopenharmony_ci} 264