1/*** 2 This file is part of PulseAudio. 3 4 Copyright 2013 Peter Meerwald <pmeerw@pmeerw.net> 5 6 PulseAudio is free software; you can redistribute it and/or modify 7 it under the terms of the GNU Lesser General Public License as published 8 by the Free Software Foundation; either version 2.1 of the License, 9 or (at your option) any later version. 10 11 PulseAudio is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15***/ 16 17#ifdef HAVE_CONFIG_H 18#include <config.h> 19#endif 20 21#include <pulsecore/macro.h> 22#include <pulsecore/endianmacros.h> 23#include <pulsecore/sample-util.h> 24 25#include "cpu-arm.h" 26#include "mix.h" 27 28#include <arm_neon.h> 29 30static pa_do_mix_func_t fallback; 31 32/* special case: mix s16ne streams, 2 channels each */ 33static void pa_mix_ch2_s16ne_neon(pa_mix_info streams[], unsigned nstreams, uint8_t *data, unsigned length) { 34 const unsigned mask = sizeof(int16_t) * 8 - 1; 35 const uint8_t *end = data + (length & ~mask); 36 37 while (data < end) { 38 int32x4_t sum0, sum1; 39 unsigned i; 40 41 __asm__ __volatile__ ( 42 "veor.s32 %q[sum0], %q[sum0] \n\t" 43 "veor.s32 %q[sum1], %q[sum1] \n\t" 44 : [sum0] "=w" (sum0), [sum1] "=w" (sum1) 45 : 46 : "cc" /* clobber list */ 47 ); 48 49 for (i = 0; i < nstreams; i++) { 50 pa_mix_info *m = streams + i; 51 int32_t cv0 = m->linear[0].i; 52 int32_t cv1 = m->linear[1].i; 53 54 __asm__ __volatile__ ( 55 "vld2.s16 {d0,d2}, [%[ptr]]! \n\t" 56 "vmov.s32 d4[0], %[cv0] \n\t" 57 "vmov.s32 d4[1], %[cv1] \n\t" 58 "vshll.s16 q0, d0, #15 \n\t" 59 "vshll.s16 q1, d2, #15 \n\t" 60 "vqdmulh.s32 q0, q0, d4[0] \n\t" 61 "vqdmulh.s32 q1, q1, d4[1] \n\t" 62 "vqadd.s32 %q[sum0], %q[sum0], q0 \n\t" 63 "vqadd.s32 %q[sum1], %q[sum1], q1 \n\t" 64 : [ptr] "+r" (m->ptr), [sum0] "+w" (sum0), [sum1] "+w" (sum1) 65 : [cv0] "r" (cv0), [cv1] "r" (cv1) 66 : "memory", "cc", "q0", "q1", "d4" /* clobber list */ 67 ); 68 } 69 70 __asm__ __volatile__ ( 71 "vqmovn.s32 d0, %q[sum0] \n\t" 72 "vqmovn.s32 d1, %q[sum1] \n\t" 73 "vst2.s16 {d0,d1}, [%[data]]! \n\t" 74 : [data] "+r" (data) 75 : [sum0] "w" (sum0), [sum1] "w" (sum1) 76 : "memory", "cc", "q0" /* clobber list */ 77 ); 78 } 79 80 fallback(streams, nstreams, 2, data, length & mask); 81} 82 83/* special case: mix 2 s16ne streams, 1 channel each */ 84static void pa_mix2_ch1_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 85 const int16_t *ptr0 = streams[0].ptr; 86 const int16_t *ptr1 = streams[1].ptr; 87 88 int32x4_t sv0, sv1; 89 __asm__ __volatile__ ( 90 "vdup.s32 %q[sv0], %[lin0] \n\t" 91 "vdup.s32 %q[sv1], %[lin1] \n\t" 92 : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 93 : [lin0] "r" (streams[0].linear[0]), [lin1] "r" (streams[1].linear[0]) 94 : /* clobber list */ 95 ); 96 97 length /= sizeof(int16_t); 98 for (; length >= 4; length -= 4) { 99 __asm__ __volatile__ ( 100 "vld1.s16 d0, [%[ptr0]]! \n\t" 101 "vld1.s16 d2, [%[ptr1]]! \n\t" 102 "vshll.s16 q0, d0, #15 \n\t" 103 "vshll.s16 q1, d2, #15 \n\t" 104 "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 105 "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 106 "vqadd.s32 q0, q0, q1 \n\t" 107 "vqmovn.s32 d0, q0 \n\t" 108 "vst1.s16 d0, [%[data]]! \n\t" 109 : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 110 : [sv0] "w" (sv0), [sv1] "w" (sv1) 111 : "memory", "cc", "q0", "q1" /* clobber list */ 112 ); 113 } 114 115 for (; length > 0; length--) { 116 int32_t sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i); 117 sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i); 118 *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 119 } 120} 121 122/* special case: mix 2 s16ne streams, 2 channel each */ 123static void pa_mix2_ch2_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 124 const int16_t *ptr0 = streams[0].ptr; 125 const int16_t *ptr1 = streams[1].ptr; 126 127 int32x4_t sv0, sv1; 128 __asm__ __volatile__ ( 129 "vld1.s32 d0, [%[lin0]] \n\t" 130 "vmov.s32 d1, d0 \n\t" 131 "vmov.s32 %q[sv0], q0 \n\t" 132 "vld1.s32 d0, [%[lin1]] \n\t" 133 "vmov.s32 d1, d0 \n\t" 134 "vmov.s32 %q[sv1], q0 \n\t" 135 : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 136 : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear) 137 : "q0" /* clobber list */ 138 ); 139 140 length /= sizeof(int16_t); 141 for (; length >= 4; length -= 4) { 142 __asm__ __volatile__ ( 143 "vld1.s16 d0, [%[ptr0]]! \n\t" 144 "vld1.s16 d2, [%[ptr1]]! \n\t" 145 "vshll.s16 q0, d0, #15 \n\t" 146 "vshll.s16 q1, d2, #15 \n\t" 147 "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 148 "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 149 "vqadd.s32 q0, q0, q1 \n\t" 150 "vqmovn.s32 d0, q0 \n\t" 151 "vst1.s16 d0, [%[data]]! \n\t" 152 : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 153 : [sv0] "w" (sv0), [sv1] "w" (sv1) 154 : "memory", "cc", "q0", "q1" /* clobber list */ 155 ); 156 } 157 158 if (length > 0) { 159 int32_t sum; 160 161 sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i); 162 sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i); 163 *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 164 165 sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[1].i); 166 sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[1].i); 167 *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); 168 } 169} 170 171/* special case: mix 2 s16ne streams, 4 channels each */ 172static void pa_mix2_ch4_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) { 173 const int16_t *ptr0 = streams[0].ptr; 174 const int16_t *ptr1 = streams[1].ptr; 175 176 int32x4_t sv0, sv1; 177 178 __asm__ __volatile__ ( 179 "vld1.s32 %h[sv0], [%[lin0]] \n\t" 180 "vld1.s32 %h[sv1], [%[lin1]] \n\t" 181 : [sv0] "=w" (sv0), [sv1] "=w" (sv1) 182 : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear) 183 : /* clobber list */ 184 ); 185 186 length /= sizeof(int16_t); 187 for (; length >= 4; length -= 4) { 188 __asm__ __volatile__ ( 189 "vld1.s16 d0, [%[ptr0]]! \n\t" 190 "vld1.s16 d2, [%[ptr1]]! \n\t" 191 "vshll.s16 q0, d0, #15 \n\t" 192 "vshll.s16 q1, d2, #15 \n\t" 193 "vqdmulh.s32 q0, q0, %q[sv0] \n\t" 194 "vqdmulh.s32 q1, q1, %q[sv1] \n\t" 195 "vqadd.s32 q0, q0, q1 \n\t" 196 "vqmovn.s32 d0, q0 \n\t" 197 "vst1.s16 d0, [%[data]]! \n\t" 198 : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data) 199 : [sv0] "w" (sv0), [sv1] "w" (sv1) 200 : "memory", "cc", "q0", "q1" /* clobber list */ 201 ); 202 } 203} 204 205static void pa_mix_s16ne_neon(pa_mix_info streams[], unsigned nstreams, unsigned nchannels, void *data, unsigned length) { 206 if (nstreams == 2 && nchannels == 2) 207 pa_mix2_ch2_s16ne_neon(streams, data, length); 208 else if (nstreams == 2 && nchannels == 4) 209 pa_mix2_ch4_s16ne_neon(streams, data, length); 210 else if (nstreams == 2 && nchannels == 1) 211 pa_mix2_ch1_s16ne_neon(streams, data, length); 212 else if (nchannels == 2) 213 pa_mix_ch2_s16ne_neon(streams, nstreams, data, length); 214 else 215 fallback(streams, nstreams, nchannels, data, length); 216} 217 218void pa_mix_func_init_neon(pa_cpu_arm_flag_t flags) { 219 pa_log_info("Initialising ARM NEON optimized mixing functions."); 220 221 fallback = pa_get_mix_func(PA_SAMPLE_S16NE); 222 pa_set_mix_func(PA_SAMPLE_S16NE, (pa_do_mix_func_t) pa_mix_s16ne_neon); 223} 224