153a5a1b3Sopenharmony_ci/*** 253a5a1b3Sopenharmony_ci This file is part of PulseAudio. 353a5a1b3Sopenharmony_ci 453a5a1b3Sopenharmony_ci Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com> 553a5a1b3Sopenharmony_ci 653a5a1b3Sopenharmony_ci PulseAudio is free software; you can redistribute it and/or modify 753a5a1b3Sopenharmony_ci it under the terms of the GNU Lesser General Public License as published 853a5a1b3Sopenharmony_ci by the Free Software Foundation; either version 2.1 of the License, 953a5a1b3Sopenharmony_ci or (at your option) any later version. 1053a5a1b3Sopenharmony_ci 1153a5a1b3Sopenharmony_ci PulseAudio is distributed in the hope that it will be useful, but 1253a5a1b3Sopenharmony_ci WITHOUT ANY WARRANTY; without even the implied warranty of 1353a5a1b3Sopenharmony_ci MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1453a5a1b3Sopenharmony_ci General Public License for more details. 1553a5a1b3Sopenharmony_ci***/ 1653a5a1b3Sopenharmony_ci 1753a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H 1853a5a1b3Sopenharmony_ci#include <config.h> 1953a5a1b3Sopenharmony_ci#endif 2053a5a1b3Sopenharmony_ci 2153a5a1b3Sopenharmony_ci#include <pulse/sample.h> 2253a5a1b3Sopenharmony_ci#include <pulse/xmalloc.h> 2353a5a1b3Sopenharmony_ci#include <pulsecore/log.h> 2453a5a1b3Sopenharmony_ci#include <pulsecore/macro.h> 2553a5a1b3Sopenharmony_ci 2653a5a1b3Sopenharmony_ci#include "cpu-arm.h" 2753a5a1b3Sopenharmony_ci#include "remap.h" 2853a5a1b3Sopenharmony_ci 2953a5a1b3Sopenharmony_ci#include <arm_neon.h> 3053a5a1b3Sopenharmony_ci 3153a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) { 3253a5a1b3Sopenharmony_ci for (; n >= 4; n -= 4) { 3353a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 3453a5a1b3Sopenharmony_ci "vld1.32 {q0}, [%[src]]! \n\t" 3553a5a1b3Sopenharmony_ci "vmov q1, q0 \n\t" 3653a5a1b3Sopenharmony_ci "vst2.32 {q0,q1}, [%[dst]]! \n\t" 3753a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 3853a5a1b3Sopenharmony_ci : /* input operands */ 3953a5a1b3Sopenharmony_ci : "memory", "q0", "q1" /* clobber list */ 4053a5a1b3Sopenharmony_ci ); 4153a5a1b3Sopenharmony_ci } 4253a5a1b3Sopenharmony_ci 4353a5a1b3Sopenharmony_ci for (; n > 0; n--) { 4453a5a1b3Sopenharmony_ci dst[0] = dst[1] = src[0]; 4553a5a1b3Sopenharmony_ci src++; 4653a5a1b3Sopenharmony_ci dst += 2; 4753a5a1b3Sopenharmony_ci } 4853a5a1b3Sopenharmony_ci} 4953a5a1b3Sopenharmony_ci 5053a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) { 5153a5a1b3Sopenharmony_ci for (; n >= 2; n -= 2) { 5253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 5353a5a1b3Sopenharmony_ci "ldm %[src]!, {r4,r6} \n\t" 5453a5a1b3Sopenharmony_ci "mov r5, r4 \n\t" 5553a5a1b3Sopenharmony_ci 5653a5a1b3Sopenharmony_ci /* We use r12 instead of r7 here, because r7 is reserved for the 5753a5a1b3Sopenharmony_ci * frame pointer when using Thumb. */ 5853a5a1b3Sopenharmony_ci "mov r12, r6 \n\t" 5953a5a1b3Sopenharmony_ci 6053a5a1b3Sopenharmony_ci "stm %[dst]!, {r4-r6,r12} \n\t" 6153a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 6253a5a1b3Sopenharmony_ci : /* input operands */ 6353a5a1b3Sopenharmony_ci : "memory", "r4", "r5", "r6", "r12" /* clobber list */ 6453a5a1b3Sopenharmony_ci ); 6553a5a1b3Sopenharmony_ci } 6653a5a1b3Sopenharmony_ci 6753a5a1b3Sopenharmony_ci if (n > 0) 6853a5a1b3Sopenharmony_ci dst[0] = dst[1] = src[0]; 6953a5a1b3Sopenharmony_ci} 7053a5a1b3Sopenharmony_ci 7153a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 7253a5a1b3Sopenharmony_ci for (; n >= 8; n -= 8) { 7353a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 7453a5a1b3Sopenharmony_ci "vld1.16 {q0}, [%[src]]! \n\t" 7553a5a1b3Sopenharmony_ci "vmov q1, q0 \n\t" 7653a5a1b3Sopenharmony_ci "vst2.16 {q0,q1}, [%[dst]]! \n\t" 7753a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 7853a5a1b3Sopenharmony_ci : /* input operands */ 7953a5a1b3Sopenharmony_ci : "memory", "q0", "q1" /* clobber list */ 8053a5a1b3Sopenharmony_ci ); 8153a5a1b3Sopenharmony_ci } 8253a5a1b3Sopenharmony_ci 8353a5a1b3Sopenharmony_ci for (; n > 0; n--) { 8453a5a1b3Sopenharmony_ci dst[0] = dst[1] = src[0]; 8553a5a1b3Sopenharmony_ci src++; 8653a5a1b3Sopenharmony_ci dst += 2; 8753a5a1b3Sopenharmony_ci } 8853a5a1b3Sopenharmony_ci} 8953a5a1b3Sopenharmony_ci 9053a5a1b3Sopenharmony_cistatic void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 9153a5a1b3Sopenharmony_ci for (; n >= 2; n -= 2) { 9253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 9353a5a1b3Sopenharmony_ci "vld1.32 {d0}, [%[src]]! \n\t" 9453a5a1b3Sopenharmony_ci "vdup.f32 q1, d0[0] \n\t" 9553a5a1b3Sopenharmony_ci "vdup.f32 q2, d0[1] \n\t" 9653a5a1b3Sopenharmony_ci "vst1.32 {q1,q2}, [%[dst]]! \n\t" 9753a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 9853a5a1b3Sopenharmony_ci : /* input operands */ 9953a5a1b3Sopenharmony_ci : "memory", "q0", "q1", "q2" /* clobber list */ 10053a5a1b3Sopenharmony_ci ); 10153a5a1b3Sopenharmony_ci } 10253a5a1b3Sopenharmony_ci 10353a5a1b3Sopenharmony_ci if (n--) 10453a5a1b3Sopenharmony_ci dst[0] = dst[1] = dst[2] = dst[3] = src[0]; 10553a5a1b3Sopenharmony_ci} 10653a5a1b3Sopenharmony_ci 10753a5a1b3Sopenharmony_cistatic void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 10853a5a1b3Sopenharmony_ci for (; n >= 4; n -= 4) { 10953a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 11053a5a1b3Sopenharmony_ci "vld1.16 {d0}, [%[src]]! \n\t" 11153a5a1b3Sopenharmony_ci "vdup.s16 d1, d0[1] \n\t" 11253a5a1b3Sopenharmony_ci "vdup.s16 d2, d0[2] \n\t" 11353a5a1b3Sopenharmony_ci "vdup.s16 d3, d0[3] \n\t" 11453a5a1b3Sopenharmony_ci "vdup.s16 d0, d0[0] \n\t" 11553a5a1b3Sopenharmony_ci "vst1.16 {d0,d1,d2,d3}, [%[dst]]!\n\t" 11653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 11753a5a1b3Sopenharmony_ci : /* input operands */ 11853a5a1b3Sopenharmony_ci : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 11953a5a1b3Sopenharmony_ci ); 12053a5a1b3Sopenharmony_ci } 12153a5a1b3Sopenharmony_ci 12253a5a1b3Sopenharmony_ci for (; n > 0; n--) { 12353a5a1b3Sopenharmony_ci dst[0] = dst[1] = dst[2] = dst[3] = src[0]; 12453a5a1b3Sopenharmony_ci src++; 12553a5a1b3Sopenharmony_ci dst += 4; 12653a5a1b3Sopenharmony_ci } 12753a5a1b3Sopenharmony_ci} 12853a5a1b3Sopenharmony_ci 12953a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 13053a5a1b3Sopenharmony_ci const float32x4_t halve = vdupq_n_f32(0.5f); 13153a5a1b3Sopenharmony_ci for (; n >= 4; n -= 4) { 13253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 13353a5a1b3Sopenharmony_ci "vld2.32 {q0,q1}, [%[src]]! \n\t" 13453a5a1b3Sopenharmony_ci "vadd.f32 q0, q0, q1 \n\t" 13553a5a1b3Sopenharmony_ci "vmul.f32 q0, q0, %q[halve] \n\t" 13653a5a1b3Sopenharmony_ci "vst1.32 {q0}, [%[dst]]! \n\t" 13753a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 13853a5a1b3Sopenharmony_ci : [halve] "w" (halve) /* input operands */ 13953a5a1b3Sopenharmony_ci : "memory", "q0", "q1" /* clobber list */ 14053a5a1b3Sopenharmony_ci ); 14153a5a1b3Sopenharmony_ci } 14253a5a1b3Sopenharmony_ci 14353a5a1b3Sopenharmony_ci for (; n > 0; n--) { 14453a5a1b3Sopenharmony_ci dst[0] = (src[0] + src[1])*0.5f; 14553a5a1b3Sopenharmony_ci src += 2; 14653a5a1b3Sopenharmony_ci dst++; 14753a5a1b3Sopenharmony_ci } 14853a5a1b3Sopenharmony_ci} 14953a5a1b3Sopenharmony_ci 15053a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) { 15153a5a1b3Sopenharmony_ci for (; n >= 4; n -= 4) { 15253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 15353a5a1b3Sopenharmony_ci "vld2.32 {q0,q1}, [%[src]]! \n\t" 15453a5a1b3Sopenharmony_ci "vrhadd.s32 q0, q0, q1 \n\t" 15553a5a1b3Sopenharmony_ci "vst1.32 {q0}, [%[dst]]! \n\t" 15653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 15753a5a1b3Sopenharmony_ci : /* input operands */ 15853a5a1b3Sopenharmony_ci : "memory", "q0", "q1" /* clobber list */ 15953a5a1b3Sopenharmony_ci ); 16053a5a1b3Sopenharmony_ci } 16153a5a1b3Sopenharmony_ci 16253a5a1b3Sopenharmony_ci for (; n > 0; n--) { 16353a5a1b3Sopenharmony_ci dst[0] = src[0]/2 + src[1]/2; 16453a5a1b3Sopenharmony_ci src += 2; 16553a5a1b3Sopenharmony_ci dst++; 16653a5a1b3Sopenharmony_ci } 16753a5a1b3Sopenharmony_ci} 16853a5a1b3Sopenharmony_ci 16953a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 17053a5a1b3Sopenharmony_ci for (; n >= 8; n -= 8) { 17153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 17253a5a1b3Sopenharmony_ci "vld2.16 {q0,q1}, [%[src]]! \n\t" 17353a5a1b3Sopenharmony_ci "vrhadd.s16 q0, q0, q1 \n\t" 17453a5a1b3Sopenharmony_ci "vst1.16 {q0}, [%[dst]]! \n\t" 17553a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 17653a5a1b3Sopenharmony_ci : /* input operands */ 17753a5a1b3Sopenharmony_ci : "memory", "q0", "q1" /* clobber list */ 17853a5a1b3Sopenharmony_ci ); 17953a5a1b3Sopenharmony_ci } 18053a5a1b3Sopenharmony_ci 18153a5a1b3Sopenharmony_ci for (; n > 0; n--) { 18253a5a1b3Sopenharmony_ci dst[0] = (src[0] + src[1])/2; 18353a5a1b3Sopenharmony_ci src += 2; 18453a5a1b3Sopenharmony_ci dst++; 18553a5a1b3Sopenharmony_ci } 18653a5a1b3Sopenharmony_ci} 18753a5a1b3Sopenharmony_ci 18853a5a1b3Sopenharmony_cistatic void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 18953a5a1b3Sopenharmony_ci const float32x2_t quart = vdup_n_f32(0.25f); 19053a5a1b3Sopenharmony_ci for (; n >= 2; n -= 2) { 19153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 19253a5a1b3Sopenharmony_ci "vld4.32 {d0,d1,d2,d3}, [%[src]]!\n\t" 19353a5a1b3Sopenharmony_ci "vadd.f32 d0, d0, d1 \n\t" 19453a5a1b3Sopenharmony_ci "vadd.f32 d2, d2, d3 \n\t" 19553a5a1b3Sopenharmony_ci "vadd.f32 d0, d0, d2 \n\t" 19653a5a1b3Sopenharmony_ci "vmul.f32 d0, d0, %P[quart] \n\t" 19753a5a1b3Sopenharmony_ci "vst1.32 {d0}, [%[dst]]! \n\t" 19853a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 19953a5a1b3Sopenharmony_ci : [quart] "w" (quart) /* input operands */ 20053a5a1b3Sopenharmony_ci : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 20153a5a1b3Sopenharmony_ci ); 20253a5a1b3Sopenharmony_ci } 20353a5a1b3Sopenharmony_ci 20453a5a1b3Sopenharmony_ci if (n > 0) 20553a5a1b3Sopenharmony_ci dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f; 20653a5a1b3Sopenharmony_ci} 20753a5a1b3Sopenharmony_ci 20853a5a1b3Sopenharmony_cistatic void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 20953a5a1b3Sopenharmony_ci for (; n >= 4; n -= 4) { 21053a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 21153a5a1b3Sopenharmony_ci "vld4.16 {d0,d1,d2,d3}, [%[src]]!\n\t" 21253a5a1b3Sopenharmony_ci "vrhadd.s16 d0, d0, d1 \n\t" 21353a5a1b3Sopenharmony_ci "vrhadd.s16 d2, d2, d3 \n\t" 21453a5a1b3Sopenharmony_ci "vrhadd.s16 d0, d0, d2 \n\t" 21553a5a1b3Sopenharmony_ci "vst1.16 {d0}, [%[dst]]! \n\t" 21653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 21753a5a1b3Sopenharmony_ci : /* input operands */ 21853a5a1b3Sopenharmony_ci : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 21953a5a1b3Sopenharmony_ci ); 22053a5a1b3Sopenharmony_ci } 22153a5a1b3Sopenharmony_ci 22253a5a1b3Sopenharmony_ci for (; n > 0; n--) { 22353a5a1b3Sopenharmony_ci dst[0] = (src[0] + src[1] + src[2] + src[3])/4; 22453a5a1b3Sopenharmony_ci src += 4; 22553a5a1b3Sopenharmony_ci dst++; 22653a5a1b3Sopenharmony_ci } 22753a5a1b3Sopenharmony_ci} 22853a5a1b3Sopenharmony_ci 22953a5a1b3Sopenharmony_cistatic void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 23053a5a1b3Sopenharmony_ci int32x4_t *f = m->state; 23153a5a1b3Sopenharmony_ci const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3]; 23253a5a1b3Sopenharmony_ci 23353a5a1b3Sopenharmony_ci for (; n > 0; n--) { 23453a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 23553a5a1b3Sopenharmony_ci "vld1.16 {d0}, [%[src]]! \n\t" 23653a5a1b3Sopenharmony_ci "vmovl.s16 q0, d0 \n\t" 23753a5a1b3Sopenharmony_ci "vdup.s32 q1, d0[0] \n\t" 23853a5a1b3Sopenharmony_ci "vmul.s32 q1, q1, %q[f0] \n\t" 23953a5a1b3Sopenharmony_ci "vdup.s32 q2, d0[1] \n\t" 24053a5a1b3Sopenharmony_ci "vmla.s32 q1, q2, %q[f1] \n\t" 24153a5a1b3Sopenharmony_ci "vdup.s32 q2, d1[0] \n\t" 24253a5a1b3Sopenharmony_ci "vmla.s32 q1, q2, %q[f2] \n\t" 24353a5a1b3Sopenharmony_ci "vdup.s32 q2, d1[1] \n\t" 24453a5a1b3Sopenharmony_ci "vmla.s32 q1, q2, %q[f3] \n\t" 24553a5a1b3Sopenharmony_ci "vqshrn.s32 d2, q1, #16 \n\t" 24653a5a1b3Sopenharmony_ci "vst1.32 {d2}, [%[dst]]! \n\t" 24753a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) 24853a5a1b3Sopenharmony_ci : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3) 24953a5a1b3Sopenharmony_ci : "memory", "q0", "q1", "q2" 25053a5a1b3Sopenharmony_ci ); 25153a5a1b3Sopenharmony_ci } 25253a5a1b3Sopenharmony_ci} 25353a5a1b3Sopenharmony_ci 25453a5a1b3Sopenharmony_cistatic void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 25553a5a1b3Sopenharmony_ci float32x4_t *f = m->state; 25653a5a1b3Sopenharmony_ci const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3]; 25753a5a1b3Sopenharmony_ci 25853a5a1b3Sopenharmony_ci for (; n > 0; n--) { 25953a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 26053a5a1b3Sopenharmony_ci "vld1.32 {d0,d1}, [%[src]]! \n\t" 26153a5a1b3Sopenharmony_ci "vdup.f32 q1, d0[0] \n\t" 26253a5a1b3Sopenharmony_ci "vmul.f32 q1, q1, %q[f0] \n\t" 26353a5a1b3Sopenharmony_ci "vdup.f32 q2, d0[1] \n\t" 26453a5a1b3Sopenharmony_ci "vmla.f32 q1, q2, %q[f1] \n\t" 26553a5a1b3Sopenharmony_ci "vdup.f32 q2, d1[0] \n\t" 26653a5a1b3Sopenharmony_ci "vmla.f32 q1, q2, %q[f2] \n\t" 26753a5a1b3Sopenharmony_ci "vdup.f32 q2, d1[1] \n\t" 26853a5a1b3Sopenharmony_ci "vmla.f32 q1, q2, %q[f3] \n\t" 26953a5a1b3Sopenharmony_ci "vst1.32 {d2,d3}, [%[dst]]! \n\t" 27053a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) 27153a5a1b3Sopenharmony_ci : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3) 27253a5a1b3Sopenharmony_ci : "memory", "q0", "q1", "q2" 27353a5a1b3Sopenharmony_ci ); 27453a5a1b3Sopenharmony_ci } 27553a5a1b3Sopenharmony_ci} 27653a5a1b3Sopenharmony_ci 27753a5a1b3Sopenharmony_cistatic void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 27853a5a1b3Sopenharmony_ci const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 27953a5a1b3Sopenharmony_ci 28053a5a1b3Sopenharmony_ci for (; n >= 2; n -= 2) { 28153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 28253a5a1b3Sopenharmony_ci "vld1.s16 d0, [%[src]]! \n\t" 28353a5a1b3Sopenharmony_ci "vtbl.8 d0, {d0}, %P[t] \n\t" 28453a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[dst]]! \n\t" 28553a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 28653a5a1b3Sopenharmony_ci : [t] "w" (t) /* input operands */ 28753a5a1b3Sopenharmony_ci : "memory", "d0" /* clobber list */ 28853a5a1b3Sopenharmony_ci ); 28953a5a1b3Sopenharmony_ci } 29053a5a1b3Sopenharmony_ci 29153a5a1b3Sopenharmony_ci if (n > 0) { 29253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 29353a5a1b3Sopenharmony_ci "vld1.32 d0[0], [%[src]]! \n\t" 29453a5a1b3Sopenharmony_ci "vtbl.8 d0, {d0}, %P[t] \n\t" 29553a5a1b3Sopenharmony_ci "vst1.32 d0[0], [%[dst]]! \n\t" 29653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 29753a5a1b3Sopenharmony_ci : [t] "w" (t) /* input operands */ 29853a5a1b3Sopenharmony_ci : "memory", "d0" /* clobber list */ 29953a5a1b3Sopenharmony_ci ); 30053a5a1b3Sopenharmony_ci } 30153a5a1b3Sopenharmony_ci} 30253a5a1b3Sopenharmony_ci 30353a5a1b3Sopenharmony_cistatic void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 30453a5a1b3Sopenharmony_ci const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 30553a5a1b3Sopenharmony_ci 30653a5a1b3Sopenharmony_ci for (; n > 0; n--) { 30753a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 30853a5a1b3Sopenharmony_ci "vld1.32 d0[0], [%[src]]! \n\t" 30953a5a1b3Sopenharmony_ci "vtbl.8 d0, {d0}, %P[t] \n\t" 31053a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[dst]]! \n\t" 31153a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 31253a5a1b3Sopenharmony_ci : [t] "w" (t) /* input operands */ 31353a5a1b3Sopenharmony_ci : "memory", "d0" /* clobber list */ 31453a5a1b3Sopenharmony_ci ); 31553a5a1b3Sopenharmony_ci } 31653a5a1b3Sopenharmony_ci} 31753a5a1b3Sopenharmony_ci 31853a5a1b3Sopenharmony_cistatic void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 31953a5a1b3Sopenharmony_ci const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 32053a5a1b3Sopenharmony_ci 32153a5a1b3Sopenharmony_ci for (; n > 0; n--) { 32253a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 32353a5a1b3Sopenharmony_ci "vld1.s16 d0, [%[src]]! \n\t" 32453a5a1b3Sopenharmony_ci "vtbl.8 d0, {d0}, %P[t] \n\t" 32553a5a1b3Sopenharmony_ci "vst1.s16 d0, [%[dst]]! \n\t" 32653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 32753a5a1b3Sopenharmony_ci : [t] "w" (t) /* input operands */ 32853a5a1b3Sopenharmony_ci : "memory", "d0" /* clobber list */ 32953a5a1b3Sopenharmony_ci ); 33053a5a1b3Sopenharmony_ci } 33153a5a1b3Sopenharmony_ci} 33253a5a1b3Sopenharmony_ci 33353a5a1b3Sopenharmony_cistatic void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 33453a5a1b3Sopenharmony_ci const uint8x8_t t = ((uint8x8_t *)m->state)[0]; 33553a5a1b3Sopenharmony_ci 33653a5a1b3Sopenharmony_ci for (; n > 0; n--) { 33753a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 33853a5a1b3Sopenharmony_ci "vld1.f32 d0, [%[src]]! \n\t" 33953a5a1b3Sopenharmony_ci "vtbl.8 d0, {d0}, %P[t] \n\t" 34053a5a1b3Sopenharmony_ci "vst1.s16 {d0}, [%[dst]]! \n\t" 34153a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 34253a5a1b3Sopenharmony_ci : [t] "w" (t) /* input operands */ 34353a5a1b3Sopenharmony_ci : "memory", "d0" /* clobber list */ 34453a5a1b3Sopenharmony_ci ); 34553a5a1b3Sopenharmony_ci } 34653a5a1b3Sopenharmony_ci} 34753a5a1b3Sopenharmony_ci 34853a5a1b3Sopenharmony_ci/* Works for both S32NE and FLOAT32NE */ 34953a5a1b3Sopenharmony_cistatic void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 35053a5a1b3Sopenharmony_ci const uint8x8_t t0 = ((uint8x8_t *)m->state)[0]; 35153a5a1b3Sopenharmony_ci const uint8x8_t t1 = ((uint8x8_t *)m->state)[1]; 35253a5a1b3Sopenharmony_ci 35353a5a1b3Sopenharmony_ci for (; n > 0; n--) { 35453a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 35553a5a1b3Sopenharmony_ci "vld1.f32 d0, [%[src]]! \n\t" 35653a5a1b3Sopenharmony_ci "vtbl.8 d1, {d0}, %P[t0] \n\t" 35753a5a1b3Sopenharmony_ci "vtbl.8 d2, {d0}, %P[t1] \n\t" 35853a5a1b3Sopenharmony_ci "vst1.s16 {d1,d2}, [%[dst]]! \n\t" 35953a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 36053a5a1b3Sopenharmony_ci : [t0] "w" (t0), [t1] "w" (t1) /* input operands */ 36153a5a1b3Sopenharmony_ci : "memory", "d0", "d1", "d2" /* clobber list */ 36253a5a1b3Sopenharmony_ci ); 36353a5a1b3Sopenharmony_ci } 36453a5a1b3Sopenharmony_ci} 36553a5a1b3Sopenharmony_ci 36653a5a1b3Sopenharmony_cistatic void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 36753a5a1b3Sopenharmony_ci const uint8x8_t t0 = ((uint8x8_t *)m->state)[0]; 36853a5a1b3Sopenharmony_ci const uint8x8_t t1 = ((uint8x8_t *)m->state)[1]; 36953a5a1b3Sopenharmony_ci 37053a5a1b3Sopenharmony_ci for (; n > 0; n--) { 37153a5a1b3Sopenharmony_ci __asm__ __volatile__ ( 37253a5a1b3Sopenharmony_ci "vld1.f32 {d0,d1}, [%[src]]! \n\t" 37353a5a1b3Sopenharmony_ci "vtbl.8 d2, {d0,d1}, %P[t0] \n\t" 37453a5a1b3Sopenharmony_ci "vtbl.8 d3, {d0,d1}, %P[t1] \n\t" 37553a5a1b3Sopenharmony_ci "vst1.s16 {d2,d3}, [%[dst]]! \n\t" 37653a5a1b3Sopenharmony_ci : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 37753a5a1b3Sopenharmony_ci : [t0] "w" (t0), [t1] "w" (t1) /* input operands */ 37853a5a1b3Sopenharmony_ci : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 37953a5a1b3Sopenharmony_ci ); 38053a5a1b3Sopenharmony_ci } 38153a5a1b3Sopenharmony_ci} 38253a5a1b3Sopenharmony_ci 38353a5a1b3Sopenharmony_cistatic pa_cpu_arm_flag_t arm_flags; 38453a5a1b3Sopenharmony_ci 38553a5a1b3Sopenharmony_cistatic void init_remap_neon(pa_remap_t *m) { 38653a5a1b3Sopenharmony_ci unsigned n_oc, n_ic; 38753a5a1b3Sopenharmony_ci int8_t arrange[PA_CHANNELS_MAX]; 38853a5a1b3Sopenharmony_ci 38953a5a1b3Sopenharmony_ci n_oc = m->o_ss.channels; 39053a5a1b3Sopenharmony_ci n_ic = m->i_ss.channels; 39153a5a1b3Sopenharmony_ci 39253a5a1b3Sopenharmony_ci /* We short-circuit remap function selection for S32NE in most 39353a5a1b3Sopenharmony_ci * cases as the corresponding generic C code is performing 39453a5a1b3Sopenharmony_ci * similarly or even better. However there are a few cases where 39553a5a1b3Sopenharmony_ci * there actually is a significant improvement from using 39653a5a1b3Sopenharmony_ci * hand-crafted NEON assembly so we cannot just bail out for S32NE 39753a5a1b3Sopenharmony_ci * here. */ 39853a5a1b3Sopenharmony_ci if (n_ic == 1 && n_oc == 2 && 39953a5a1b3Sopenharmony_ci m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) { 40053a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 40153a5a1b3Sopenharmony_ci return; 40253a5a1b3Sopenharmony_ci if (arm_flags & PA_CPU_ARM_CORTEX_A8) { 40353a5a1b3Sopenharmony_ci 40453a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON/A8 mono to stereo remapping"); 40553a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon, 40653a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8); 40753a5a1b3Sopenharmony_ci } 40853a5a1b3Sopenharmony_ci else { 40953a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON mono to stereo remapping"); 41053a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon, 41153a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm); 41253a5a1b3Sopenharmony_ci } 41353a5a1b3Sopenharmony_ci } else if (n_ic == 1 && n_oc == 4 && 41453a5a1b3Sopenharmony_ci m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 && 41553a5a1b3Sopenharmony_ci m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) { 41653a5a1b3Sopenharmony_ci 41753a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 41853a5a1b3Sopenharmony_ci return; 41953a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON mono to 4-channel remapping"); 42053a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon, 42153a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon); 42253a5a1b3Sopenharmony_ci } else if (n_ic == 2 && n_oc == 1 && 42353a5a1b3Sopenharmony_ci m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) { 42453a5a1b3Sopenharmony_ci 42553a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON stereo to mono remapping"); 42653a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon, 42753a5a1b3Sopenharmony_ci (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon, 42853a5a1b3Sopenharmony_ci (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon); 42953a5a1b3Sopenharmony_ci } else if (n_ic == 4 && n_oc == 1 && 43053a5a1b3Sopenharmony_ci m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 && 43153a5a1b3Sopenharmony_ci m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) { 43253a5a1b3Sopenharmony_ci 43353a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 43453a5a1b3Sopenharmony_ci return; 43553a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON 4-channel to mono remapping"); 43653a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon, 43753a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon); 43853a5a1b3Sopenharmony_ci } else if (pa_setup_remap_arrange(m, arrange) && 43953a5a1b3Sopenharmony_ci ((n_ic == 2 && n_oc == 2) || 44053a5a1b3Sopenharmony_ci (n_ic == 2 && n_oc == 4) || 44153a5a1b3Sopenharmony_ci (n_ic == 4 && n_oc == 4))) { 44253a5a1b3Sopenharmony_ci unsigned o; 44353a5a1b3Sopenharmony_ci 44453a5a1b3Sopenharmony_ci if (n_ic == 2 && n_oc == 2) { 44553a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 44653a5a1b3Sopenharmony_ci return; 44753a5a1b3Sopenharmony_ci pa_log_info("Using NEON stereo arrange remapping"); 44853a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon, 44953a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon); 45053a5a1b3Sopenharmony_ci } else if (n_ic == 2 && n_oc == 4) { 45153a5a1b3Sopenharmony_ci pa_log_info("Using NEON 2-channel to 4-channel arrange remapping"); 45253a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon, 45353a5a1b3Sopenharmony_ci (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon, 45453a5a1b3Sopenharmony_ci (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon); 45553a5a1b3Sopenharmony_ci } else if (n_ic == 4 && n_oc == 4) { 45653a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 45753a5a1b3Sopenharmony_ci return; 45853a5a1b3Sopenharmony_ci pa_log_info("Using NEON 4-channel arrange remapping"); 45953a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon, 46053a5a1b3Sopenharmony_ci NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon); 46153a5a1b3Sopenharmony_ci } 46253a5a1b3Sopenharmony_ci 46353a5a1b3Sopenharmony_ci /* setup state */ 46453a5a1b3Sopenharmony_ci switch (m->format) { 46553a5a1b3Sopenharmony_ci case PA_SAMPLE_S16NE: { 46653a5a1b3Sopenharmony_ci uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1); 46753a5a1b3Sopenharmony_ci for (o = 0; o < 4; o++) { 46853a5a1b3Sopenharmony_ci if (arrange[o % n_oc] >= 0) { 46953a5a1b3Sopenharmony_ci /* convert channel index to vtbl indices */ 47053a5a1b3Sopenharmony_ci unsigned frame = o / n_oc; 47153a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0; 47253a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1; 47353a5a1b3Sopenharmony_ci } else { 47453a5a1b3Sopenharmony_ci /* use invalid table indices to map to 0 */ 47553a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 2 + 0] = 0xff; 47653a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 2 + 1] = 0xff; 47753a5a1b3Sopenharmony_ci } 47853a5a1b3Sopenharmony_ci } 47953a5a1b3Sopenharmony_ci break; 48053a5a1b3Sopenharmony_ci } 48153a5a1b3Sopenharmony_ci case PA_SAMPLE_S32NE: 48253a5a1b3Sopenharmony_ci /* fall-through */ 48353a5a1b3Sopenharmony_ci case PA_SAMPLE_FLOAT32NE: { 48453a5a1b3Sopenharmony_ci uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2); 48553a5a1b3Sopenharmony_ci for (o = 0; o < n_oc; o++) { 48653a5a1b3Sopenharmony_ci if (arrange[o] >= 0) { 48753a5a1b3Sopenharmony_ci /* convert channel index to vtbl indices */ 48853a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0; 48953a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1; 49053a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2; 49153a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3; 49253a5a1b3Sopenharmony_ci } else { 49353a5a1b3Sopenharmony_ci /* use invalid table indices to map to 0 */ 49453a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 0] = 0xff; 49553a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 1] = 0xff; 49653a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 2] = 0xff; 49753a5a1b3Sopenharmony_ci ((uint8_t *) t)[o * 4 + 3] = 0xff; 49853a5a1b3Sopenharmony_ci } 49953a5a1b3Sopenharmony_ci } 50053a5a1b3Sopenharmony_ci break; 50153a5a1b3Sopenharmony_ci } 50253a5a1b3Sopenharmony_ci default: 50353a5a1b3Sopenharmony_ci pa_assert_not_reached(); 50453a5a1b3Sopenharmony_ci } 50553a5a1b3Sopenharmony_ci } else if (n_ic == 4 && n_oc == 4) { 50653a5a1b3Sopenharmony_ci unsigned i, o; 50753a5a1b3Sopenharmony_ci 50853a5a1b3Sopenharmony_ci if (m->format == PA_SAMPLE_S32NE) 50953a5a1b3Sopenharmony_ci return; 51053a5a1b3Sopenharmony_ci pa_log_info("Using ARM NEON 4-channel remapping"); 51153a5a1b3Sopenharmony_ci pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon, 51253a5a1b3Sopenharmony_ci (pa_do_remap_func_t) NULL, 51353a5a1b3Sopenharmony_ci (pa_do_remap_func_t) remap_ch4_float32ne_neon); 51453a5a1b3Sopenharmony_ci 51553a5a1b3Sopenharmony_ci /* setup state */ 51653a5a1b3Sopenharmony_ci switch (m->format) { 51753a5a1b3Sopenharmony_ci case PA_SAMPLE_S16NE: { 51853a5a1b3Sopenharmony_ci int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4); 51953a5a1b3Sopenharmony_ci for (o = 0; o < 4; o++) { 52053a5a1b3Sopenharmony_ci for (i = 0; i < 4; i++) { 52153a5a1b3Sopenharmony_ci ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000); 52253a5a1b3Sopenharmony_ci } 52353a5a1b3Sopenharmony_ci } 52453a5a1b3Sopenharmony_ci break; 52553a5a1b3Sopenharmony_ci } 52653a5a1b3Sopenharmony_ci case PA_SAMPLE_FLOAT32NE: { 52753a5a1b3Sopenharmony_ci float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4); 52853a5a1b3Sopenharmony_ci for (o = 0; o < 4; o++) { 52953a5a1b3Sopenharmony_ci for (i = 0; i < 4; i++) { 53053a5a1b3Sopenharmony_ci ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f); 53153a5a1b3Sopenharmony_ci } 53253a5a1b3Sopenharmony_ci } 53353a5a1b3Sopenharmony_ci break; 53453a5a1b3Sopenharmony_ci } 53553a5a1b3Sopenharmony_ci default: 53653a5a1b3Sopenharmony_ci pa_assert_not_reached(); 53753a5a1b3Sopenharmony_ci } 53853a5a1b3Sopenharmony_ci } 53953a5a1b3Sopenharmony_ci} 54053a5a1b3Sopenharmony_ci 54153a5a1b3Sopenharmony_civoid pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) { 54253a5a1b3Sopenharmony_ci pa_log_info("Initialising ARM NEON optimized remappers."); 54353a5a1b3Sopenharmony_ci arm_flags = flags; 54453a5a1b3Sopenharmony_ci pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon); 54553a5a1b3Sopenharmony_ci} 546