153a5a1b3Sopenharmony_ci/*** 253a5a1b3Sopenharmony_ci This file is part of PulseAudio. 353a5a1b3Sopenharmony_ci 453a5a1b3Sopenharmony_ci Copyright 2013 Peter Meerwald <pmeerw@pmeerw.net> 553a5a1b3Sopenharmony_ci 653a5a1b3Sopenharmony_ci PulseAudio is free software; you can redistribute it and/or modify 753a5a1b3Sopenharmony_ci it under the terms of the GNU Lesser General Public License as published 853a5a1b3Sopenharmony_ci by the Free Software Foundation; either version 2.1 of the License, 953a5a1b3Sopenharmony_ci or (at your option) any later version. 1053a5a1b3Sopenharmony_ci 1153a5a1b3Sopenharmony_ci PulseAudio is distributed in the hope that it will be useful, but 1253a5a1b3Sopenharmony_ci WITHOUT ANY WARRANTY; without even the implied warranty of 1353a5a1b3Sopenharmony_ci MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1453a5a1b3Sopenharmony_ci General Public License for more details. 1553a5a1b3Sopenharmony_ci***/ 1653a5a1b3Sopenharmony_ci 1753a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H 1853a5a1b3Sopenharmony_ci#include <config.h> 1953a5a1b3Sopenharmony_ci#endif 2053a5a1b3Sopenharmony_ci 2153a5a1b3Sopenharmony_ci#include <pulsecore/macro.h> 2253a5a1b3Sopenharmony_ci#include <pulsecore/endianmacros.h> 2353a5a1b3Sopenharmony_ci#include <pulsecore/sample-util.h> 2453a5a1b3Sopenharmony_ci 2553a5a1b3Sopenharmony_ci#include "cpu-arm.h" 2653a5a1b3Sopenharmony_ci#include "mix.h" 2753a5a1b3Sopenharmony_ci 2853a5a1b3Sopenharmony_ci#include <arm_neon.h> 2953a5a1b3Sopenharmony_ci 3053a5a1b3Sopenharmony_cistatic pa_do_mix_func_t fallback; 3153a5a1b3Sopenharmony_ci 3253a5a1b3Sopenharmony_ci/* special case: mix s16ne streams, 2 channels each */ 3353a5a1b3Sopenharmony_cistatic void pa_mix_ch2_s16ne_neon(pa_mix_info streams[], unsigned nstreams, uint8_t *data, unsigned length) { 3453a5a1b3Sopenharmony_ci const unsigned mask = sizeof(int16_t) * 8 - 1; 3553a5a1b3Sopenharmony_ci const uint8_t *end = data + (length & ~mask); 3653a5a1b3Sopenharmony_ci 3753a5a1b3Sopenharmony_ci while (data < end) { 3853a5a1b3Sopenharmony_ci int32x4_t sum0, sum1; 3953a5a1b3Sopenharmony_ci unsigned i; 4053a5a1b3Sopenharmony_ci 4153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 4253a5a1b3Sopenharmony_ci "veor.s32 %q[sum0], %q[sum0] \n\t" 4353a5a1b3Sopenharmony_ci "veor.s32 %q[sum1], %q[sum1] \n\t" 4453a5a1b3Sopenharmony_ci : [sum0] "=w" (sum0), [sum1] "=w" (sum1) 4553a5a1b3Sopenharmony_ci : 4653a5a1b3Sopenharmony_ci : "cc" /* clobber list */ 4753a5a1b3Sopenharmony_ci ); 4853a5a1b3Sopenharmony_ci 4953a5a1b3Sopenharmony_ci for (i = 0; i < nstreams; i++) { 5053a5a1b3Sopenharmony_ci pa_mix_info *m = streams + i; 5153a5a1b3Sopenharmony_ci int32_t cv0 = m->linear[0].i; 5253a5a1b3Sopenharmony_ci int32_t cv1 = m->linear[1].i; 5353a5a1b3Sopenharmony_ci 5453a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 5553a5a1b3Sopenharmony_ci "vld2.s16 {d0,d2}, [%[ptr]]! \n\t" 5653a5a1b3Sopenharmony_ci "vmov.s32 d4[0], %[cv0] \n\t" 5753a5a1b3Sopenharmony_ci "vmov.s32 d4[1], %[cv1] \n\t" 5853a5a1b3Sopenharmony_ci "vshll.s16 q0, d0, #15 \n\t" 5953a5a1b3Sopenharmony_ci "vshll.s16 q1, d2, #15 \n\t" 6053a5a1b3Sopenharmony_ci "vqdmulh.s32 q0, q0, d4[0] \n\t" 6153a5a1b3Sopenharmony_ci "vqdmulh.s32 q1, q1, d4[1] \n\t" 6253a5a1b3Sopenharmony_ci "vqadd.s32 %q[sum0], %q[sum0], q0 \n\t" 6353a5a1b3Sopenharmony_ci "vqadd.s32 %q[sum1], %q[sum1], q1 \n\t" 6453a5a1b3Sopenharmony_ci : [ptr] "+r" (m->ptr), [sum0] "+w" (sum0), [sum1] "+w" (sum1) 6553a5a1b3Sopenharmony_ci : [cv0] "r" (cv0), [cv1] "r" (cv1) 6653a5a1b3Sopenharmony_ci : "memory", "cc", "q0", "q1", "d4" /* clobber list */ 6753a5a1b3Sopenharmony_ci ); 6853a5a1b3Sopenharmony_ci } 6953a5a1b3Sopenharmony_ci 7053a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 7153a5a1b3Sopenharmony_ci "vqmovn.s32 d0, %q[sum0] \n\t" 7253a5a1b3Sopenharmony_ci "vqmovn.s32 d1, %q[sum1] \n\t" 7353a5a1b3Sopenharmony_ci "vst2.s16 {d0,d1}, [%[data]]! \n\t" 7453a5a1b3Sopenharmony_ci : [data] "+r" (data) 7553a5a1b3Sopenharmony_ci : [sum0] "w" (sum0), [sum1] "w" (sum1) 7653a5a1b3Sopenharmony_ci : "memory", "cc", "q0" /* clobber list */ 7753a5a1b3Sopenharmony_ci ); 7853a5a1b3Sopenharmony_ci } 7953a5a1b3Sopenharmony_ci 8053a5a1b3Sopenharmony_ci fallback(streams, nstreams, 2, data, length & mask); 8153a5a1b3Sopenharmony_ci} 8253a5a1b3Sopenharmony_ci 8353a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 1 channel each */ 8453a5a1b3Sopenharmony_cistatic void pa_mix2_ch1_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 8553a5a1b3Sopenharmony_ci const int16_t *ptr0 = streams[0].ptr; 8653a5a1b3Sopenharmony_ci const int16_t *ptr1 = streams[1].ptr; 8753a5a1b3Sopenharmony_ci 8853a5a1b3Sopenharmony_ci int32x4_t sv0, sv1; 8953a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 9053a5a1b3Sopenharmony_ci "vdup.s32 %q[sv0], %[lin0] \n\t" 9153a5a1b3Sopenharmony_ci "vdup.s32 %q[sv1], %[lin1] \n\t" 9253a5a1b3Sopenharmony_ci : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 9353a5a1b3Sopenharmony_ci : [lin0] "r" (streams[0].linear[0]), [lin1] "r" (streams[1].linear[0]) 9453a5a1b3Sopenharmony_ci : /* clobber list */ 9553a5a1b3Sopenharmony_ci ); 9653a5a1b3Sopenharmony_ci 9753a5a1b3Sopenharmony_ci length /= sizeof(int16_t); 9853a5a1b3Sopenharmony_ci for (; length >= 4; length -= 4) { 9953a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 10053a5a1b3Sopenharmony_ci "vld1.s16 d0, [%[ptr0]]! \n\t" 10153a5a1b3Sopenharmony_ci "vld1.s16 d2, [%[ptr1]]! \n\t" 10253a5a1b3Sopenharmony_ci "vshll.s16 q0, d0, #15 \n\t" 10353a5a1b3Sopenharmony_ci "vshll.s16 q1, d2, #15 \n\t" 10453a5a1b3Sopenharmony_ci "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 10553a5a1b3Sopenharmony_ci "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 10653a5a1b3Sopenharmony_ci "vqadd.s32 q0, q0, q1 \n\t" 10753a5a1b3Sopenharmony_ci "vqmovn.s32 d0, q0 \n\t" 10853a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[data]]! \n\t" 10953a5a1b3Sopenharmony_ci : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 11053a5a1b3Sopenharmony_ci : [sv0] "w" (sv0), [sv1] "w" (sv1) 11153a5a1b3Sopenharmony_ci : "memory", "cc", "q0", "q1" /* clobber list */ 11253a5a1b3Sopenharmony_ci ); 11353a5a1b3Sopenharmony_ci } 11453a5a1b3Sopenharmony_ci 11553a5a1b3Sopenharmony_ci for (; length > 0; length--) { 11653a5a1b3Sopenharmony_ci int32_t sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i); 11753a5a1b3Sopenharmony_ci sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i); 11853a5a1b3Sopenharmony_ci *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 11953a5a1b3Sopenharmony_ci } 12053a5a1b3Sopenharmony_ci} 12153a5a1b3Sopenharmony_ci 12253a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 2 channel each */ 12353a5a1b3Sopenharmony_cistatic void pa_mix2_ch2_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 12453a5a1b3Sopenharmony_ci const int16_t *ptr0 = streams[0].ptr; 12553a5a1b3Sopenharmony_ci const int16_t *ptr1 = streams[1].ptr; 12653a5a1b3Sopenharmony_ci 12753a5a1b3Sopenharmony_ci int32x4_t sv0, sv1; 12853a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 12953a5a1b3Sopenharmony_ci "vld1.s32 d0, [%[lin0]] \n\t" 13053a5a1b3Sopenharmony_ci "vmov.s32 d1, d0 \n\t" 13153a5a1b3Sopenharmony_ci "vmov.s32 %q[sv0], q0 \n\t" 13253a5a1b3Sopenharmony_ci "vld1.s32 d0, [%[lin1]] \n\t" 13353a5a1b3Sopenharmony_ci "vmov.s32 d1, d0 \n\t" 13453a5a1b3Sopenharmony_ci "vmov.s32 %q[sv1], q0 \n\t" 13553a5a1b3Sopenharmony_ci : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 13653a5a1b3Sopenharmony_ci : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear) 13753a5a1b3Sopenharmony_ci : "q0" /* clobber list */ 13853a5a1b3Sopenharmony_ci ); 13953a5a1b3Sopenharmony_ci 14053a5a1b3Sopenharmony_ci length /= sizeof(int16_t); 14153a5a1b3Sopenharmony_ci for (; length >= 4; length -= 4) { 14253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 14353a5a1b3Sopenharmony_ci "vld1.s16 d0, [%[ptr0]]! \n\t" 14453a5a1b3Sopenharmony_ci "vld1.s16 d2, [%[ptr1]]! \n\t" 14553a5a1b3Sopenharmony_ci "vshll.s16 q0, d0, #15 \n\t" 14653a5a1b3Sopenharmony_ci "vshll.s16 q1, d2, #15 \n\t" 14753a5a1b3Sopenharmony_ci "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 14853a5a1b3Sopenharmony_ci "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 14953a5a1b3Sopenharmony_ci "vqadd.s32 q0, q0, q1 \n\t" 15053a5a1b3Sopenharmony_ci "vqmovn.s32 d0, q0 \n\t" 15153a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[data]]! \n\t" 15253a5a1b3Sopenharmony_ci : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 15353a5a1b3Sopenharmony_ci : [sv0] "w" (sv0), [sv1] "w" (sv1) 15453a5a1b3Sopenharmony_ci : "memory", "cc", "q0", "q1" /* clobber list */ 15553a5a1b3Sopenharmony_ci ); 15653a5a1b3Sopenharmony_ci } 15753a5a1b3Sopenharmony_ci 15853a5a1b3Sopenharmony_ci if (length > 0) { 15953a5a1b3Sopenharmony_ci int32_t sum; 16053a5a1b3Sopenharmony_ci 16153a5a1b3Sopenharmony_ci sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i); 16253a5a1b3Sopenharmony_ci sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i); 16353a5a1b3Sopenharmony_ci *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 16453a5a1b3Sopenharmony_ci 16553a5a1b3Sopenharmony_ci sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[1].i); 16653a5a1b3Sopenharmony_ci sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[1].i); 16753a5a1b3Sopenharmony_ci *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 16853a5a1b3Sopenharmony_ci } 16953a5a1b3Sopenharmony_ci} 17053a5a1b3Sopenharmony_ci 17153a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 4 channels each */ 17253a5a1b3Sopenharmony_cistatic void pa_mix2_ch4_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 17353a5a1b3Sopenharmony_ci const int16_t *ptr0 = streams[0].ptr; 17453a5a1b3Sopenharmony_ci const int16_t *ptr1 = streams[1].ptr; 17553a5a1b3Sopenharmony_ci 17653a5a1b3Sopenharmony_ci int32x4_t sv0, sv1; 17753a5a1b3Sopenharmony_ci 17853a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 17953a5a1b3Sopenharmony_ci "vld1.s32 %h[sv0], [%[lin0]] \n\t" 18053a5a1b3Sopenharmony_ci "vld1.s32 %h[sv1], [%[lin1]] \n\t" 18153a5a1b3Sopenharmony_ci : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 18253a5a1b3Sopenharmony_ci : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear) 18353a5a1b3Sopenharmony_ci : /* clobber list */ 18453a5a1b3Sopenharmony_ci ); 18553a5a1b3Sopenharmony_ci 18653a5a1b3Sopenharmony_ci length /= sizeof(int16_t); 18753a5a1b3Sopenharmony_ci for (; length >= 4; length -= 4) { 18853a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 18953a5a1b3Sopenharmony_ci "vld1.s16 d0, [%[ptr0]]! \n\t" 19053a5a1b3Sopenharmony_ci "vld1.s16 d2, [%[ptr1]]! \n\t" 19153a5a1b3Sopenharmony_ci "vshll.s16 q0, d0, #15 \n\t" 19253a5a1b3Sopenharmony_ci "vshll.s16 q1, d2, #15 \n\t" 19353a5a1b3Sopenharmony_ci "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 19453a5a1b3Sopenharmony_ci "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 19553a5a1b3Sopenharmony_ci "vqadd.s32 q0, q0, q1 \n\t" 19653a5a1b3Sopenharmony_ci "vqmovn.s32 d0, q0 \n\t" 19753a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[data]]! \n\t" 19853a5a1b3Sopenharmony_ci : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 19953a5a1b3Sopenharmony_ci : [sv0] "w" (sv0), [sv1] "w" (sv1) 20053a5a1b3Sopenharmony_ci : "memory", "cc", "q0", "q1" /* clobber list */ 20153a5a1b3Sopenharmony_ci ); 20253a5a1b3Sopenharmony_ci } 20353a5a1b3Sopenharmony_ci} 20453a5a1b3Sopenharmony_ci 20553a5a1b3Sopenharmony_cistatic void pa_mix_s16ne_neon(pa_mix_info streams[], unsigned nstreams, unsigned nchannels, void *data, unsigned length) { 20653a5a1b3Sopenharmony_ci if (nstreams == 2 && nchannels == 2) 20753a5a1b3Sopenharmony_ci pa_mix2_ch2_s16ne_neon(streams, data, length); 20853a5a1b3Sopenharmony_ci else if (nstreams == 2 && nchannels == 4) 20953a5a1b3Sopenharmony_ci pa_mix2_ch4_s16ne_neon(streams, data, length); 21053a5a1b3Sopenharmony_ci else if (nstreams == 2 && nchannels == 1) 21153a5a1b3Sopenharmony_ci pa_mix2_ch1_s16ne_neon(streams, data, length); 21253a5a1b3Sopenharmony_ci else if (nchannels == 2) 21353a5a1b3Sopenharmony_ci pa_mix_ch2_s16ne_neon(streams, nstreams, data, length); 21453a5a1b3Sopenharmony_ci else 21553a5a1b3Sopenharmony_ci fallback(streams, nstreams, nchannels, data, length); 21653a5a1b3Sopenharmony_ci} 21753a5a1b3Sopenharmony_ci 21853a5a1b3Sopenharmony_civoid pa_mix_func_init_neon(pa_cpu_arm_flag_t flags) { 21953a5a1b3Sopenharmony_ci pa_log_info("Initialising ARM NEON optimized mixing functions."); 22053a5a1b3Sopenharmony_ci 22153a5a1b3Sopenharmony_ci fallback = pa_get_mix_func(PA_SAMPLE_S16NE); 22253a5a1b3Sopenharmony_ci pa_set_mix_func(PA_SAMPLE_S16NE, (pa_do_mix_func_t) pa_mix_s16ne_neon); 22353a5a1b3Sopenharmony_ci} 224