/***
  This file is part of PulseAudio.

  Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License,
  or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.
***/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <pulse/sample.h>
#include <pulse/xmalloc.h>
#include <pulsecore/log.h>
#include <pulsecore/macro.h>

#include "cpu-arm.h"
#include "remap.h"

#include <arm_neon.h>

static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld1.32    {q0}, [%[src]]!         \n\t"
            "vmov       q1, q0                  \n\t"
            "vst2.32    {q0,q1}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    for (; n > 0; n--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}

static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "ldm        %[src]!, {r4,r6}        \n\t"
            "mov        r5, r4                  \n\t"

            /* We use r12 instead of r7 here, because r7 is reserved for the
             * frame pointer when using Thumb. */
            "mov        r12, r6                 \n\t"

            "stm        %[dst]!, {r4-r6,r12}    \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "r4", "r5", "r6", "r12" /* clobber list */
        );
    }

    if (n > 0)
        dst[0] = dst[1] = src[0];
}

static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    for (; n >= 8; n -= 8) {
        __asm__ __volatile__ (
            "vld1.16    {q0}, [%[src]]!         \n\t"
            "vmov       q1, q0                  \n\t"
            "vst2.16    {q0,q1}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    for (; n > 0; n--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}

static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld1.32    {d0}, [%[src]]!         \n\t"
            "vdup.f32   q1, d0[0]               \n\t"
            "vdup.f32   q2, d0[1]               \n\t"
            "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1", "q2" /* clobber list */
        );
    }

    if (n--)
        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
}

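/* Mono to 4-channel duplication for S16: four mono samples are loaded into
 * d0, each lane is broadcast into its own d register with vdup, and the four
 * registers are stored back-to-back, producing four output frames with all
 * channels equal to the source sample. */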
\n\t" 111 "vdup.s16 d1, d0[1] \n\t" 112 "vdup.s16 d2, d0[2] \n\t" 113 "vdup.s16 d3, d0[3] \n\t" 114 "vdup.s16 d0, d0[0] \n\t" 115 "vst1.16 {d0,d1,d2,d3}, [%[dst]]!\n\t" 116 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 117 : /* input operands */ 118 : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 119 ); 120 } 121 122 for (; n > 0; n--) { 123 dst[0] = dst[1] = dst[2] = dst[3] = src[0]; 124 src++; 125 dst += 4; 126 } 127} 128 129static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 130 const float32x4_t halve = vdupq_n_f32(0.5f); 131 for (; n >= 4; n -= 4) { 132 __asm__ __volatile__ ( 133 "vld2.32 {q0,q1}, [%[src]]! \n\t" 134 "vadd.f32 q0, q0, q1 \n\t" 135 "vmul.f32 q0, q0, %q[halve] \n\t" 136 "vst1.32 {q0}, [%[dst]]! \n\t" 137 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 138 : [halve] "w" (halve) /* input operands */ 139 : "memory", "q0", "q1" /* clobber list */ 140 ); 141 } 142 143 for (; n > 0; n--) { 144 dst[0] = (src[0] + src[1])*0.5f; 145 src += 2; 146 dst++; 147 } 148} 149 150static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) { 151 for (; n >= 4; n -= 4) { 152 __asm__ __volatile__ ( 153 "vld2.32 {q0,q1}, [%[src]]! \n\t" 154 "vrhadd.s32 q0, q0, q1 \n\t" 155 "vst1.32 {q0}, [%[dst]]! \n\t" 156 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 157 : /* input operands */ 158 : "memory", "q0", "q1" /* clobber list */ 159 ); 160 } 161 162 for (; n > 0; n--) { 163 dst[0] = src[0]/2 + src[1]/2; 164 src += 2; 165 dst++; 166 } 167} 168 169static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 170 for (; n >= 8; n -= 8) { 171 __asm__ __volatile__ ( 172 "vld2.16 {q0,q1}, [%[src]]! \n\t" 173 "vrhadd.s16 q0, q0, q1 \n\t" 174 "vst1.16 {q0}, [%[dst]]! \n\t" 175 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 176 : /* input operands */ 177 : "memory", "q0", "q1" /* clobber list */ 178 ); 179 } 180 181 for (; n > 0; n--) { 182 dst[0] = (src[0] + src[1])/2; 183 src += 2; 184 dst++; 185 } 186} 187 188static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 189 const float32x2_t quart = vdup_n_f32(0.25f); 190 for (; n >= 2; n -= 2) { 191 __asm__ __volatile__ ( 192 "vld4.32 {d0,d1,d2,d3}, [%[src]]!\n\t" 193 "vadd.f32 d0, d0, d1 \n\t" 194 "vadd.f32 d2, d2, d3 \n\t" 195 "vadd.f32 d0, d0, d2 \n\t" 196 "vmul.f32 d0, d0, %P[quart] \n\t" 197 "vst1.32 {d0}, [%[dst]]! \n\t" 198 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 199 : [quart] "w" (quart) /* input operands */ 200 : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 201 ); 202 } 203 204 if (n > 0) 205 dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f; 206} 207 208static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 209 for (; n >= 4; n -= 4) { 210 __asm__ __volatile__ ( 211 "vld4.16 {d0,d1,d2,d3}, [%[src]]!\n\t" 212 "vrhadd.s16 d0, d0, d1 \n\t" 213 "vrhadd.s16 d2, d2, d3 \n\t" 214 "vrhadd.s16 d0, d0, d2 \n\t" 215 "vst1.16 {d0}, [%[dst]]! 
\n\t" 216 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 217 : /* input operands */ 218 : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 219 ); 220 } 221 222 for (; n > 0; n--) { 223 dst[0] = (src[0] + src[1] + src[2] + src[3])/4; 224 src += 4; 225 dst++; 226 } 227} 228 229static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 230 int32x4_t *f = m->state; 231 const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3]; 232 233 for (; n > 0; n--) { 234 __asm__ __volatile__ ( 235 "vld1.16 {d0}, [%[src]]! \n\t" 236 "vmovl.s16 q0, d0 \n\t" 237 "vdup.s32 q1, d0[0] \n\t" 238 "vmul.s32 q1, q1, %q[f0] \n\t" 239 "vdup.s32 q2, d0[1] \n\t" 240 "vmla.s32 q1, q2, %q[f1] \n\t" 241 "vdup.s32 q2, d1[0] \n\t" 242 "vmla.s32 q1, q2, %q[f2] \n\t" 243 "vdup.s32 q2, d1[1] \n\t" 244 "vmla.s32 q1, q2, %q[f3] \n\t" 245 "vqshrn.s32 d2, q1, #16 \n\t" 246 "vst1.32 {d2}, [%[dst]]! \n\t" 247 : [dst] "+r" (dst), [src] "+r" (src) 248 : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3) 249 : "memory", "q0", "q1", "q2" 250 ); 251 } 252} 253 254static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 255 float32x4_t *f = m->state; 256 const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3]; 257 258 for (; n > 0; n--) { 259 __asm__ __volatile__ ( 260 "vld1.32 {d0,d1}, [%[src]]! \n\t" 261 "vdup.f32 q1, d0[0] \n\t" 262 "vmul.f32 q1, q1, %q[f0] \n\t" 263 "vdup.f32 q2, d0[1] \n\t" 264 "vmla.f32 q1, q2, %q[f1] \n\t" 265 "vdup.f32 q2, d1[0] \n\t" 266 "vmla.f32 q1, q2, %q[f2] \n\t" 267 "vdup.f32 q2, d1[1] \n\t" 268 "vmla.f32 q1, q2, %q[f3] \n\t" 269 "vst1.32 {d2,d3}, [%[dst]]! \n\t" 270 : [dst] "+r" (dst), [src] "+r" (src) 271 : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3) 272 : "memory", "q0", "q1", "q2" 273 ); 274 } 275} 276 277static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 278 const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 279 280 for (; n >= 2; n -= 2) { 281 __asm__ __volatile__ ( 282 "vld1.s16 d0, [%[src]]! \n\t" 283 "vtbl.8 d0, {d0}, %P[t] \n\t" 284 "vst1.s16 d0, [%[dst]]! \n\t" 285 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 286 : [t] "w" (t) /* input operands */ 287 : "memory", "d0" /* clobber list */ 288 ); 289 } 290 291 if (n > 0) { 292 __asm__ __volatile__ ( 293 "vld1.32 d0[0], [%[src]]! \n\t" 294 "vtbl.8 d0, {d0}, %P[t] \n\t" 295 "vst1.32 d0[0], [%[dst]]! \n\t" 296 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 297 : [t] "w" (t) /* input operands */ 298 : "memory", "d0" /* clobber list */ 299 ); 300 } 301} 302 303static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 304 const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 305 306 for (; n > 0; n--) { 307 __asm__ __volatile__ ( 308 "vld1.32 d0[0], [%[src]]! \n\t" 309 "vtbl.8 d0, {d0}, %P[t] \n\t" 310 "vst1.s16 d0, [%[dst]]! \n\t" 311 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 312 : [t] "w" (t) /* input operands */ 313 : "memory", "d0" /* clobber list */ 314 ); 315 } 316} 317 318static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) { 319 const uint8x8_t t = ((uint8x8_t *) m->state)[0]; 320 321 for (; n > 0; n--) { 322 __asm__ __volatile__ ( 323 "vld1.s16 d0, [%[src]]! \n\t" 324 "vtbl.8 d0, {d0}, %P[t] \n\t" 325 "vst1.s16 d0, [%[dst]]! 
\n\t" 326 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 327 : [t] "w" (t) /* input operands */ 328 : "memory", "d0" /* clobber list */ 329 ); 330 } 331} 332 333static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 334 const uint8x8_t t = ((uint8x8_t *)m->state)[0]; 335 336 for (; n > 0; n--) { 337 __asm__ __volatile__ ( 338 "vld1.f32 d0, [%[src]]! \n\t" 339 "vtbl.8 d0, {d0}, %P[t] \n\t" 340 "vst1.s16 {d0}, [%[dst]]! \n\t" 341 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 342 : [t] "w" (t) /* input operands */ 343 : "memory", "d0" /* clobber list */ 344 ); 345 } 346} 347 348/* Works for both S32NE and FLOAT32NE */ 349static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 350 const uint8x8_t t0 = ((uint8x8_t *)m->state)[0]; 351 const uint8x8_t t1 = ((uint8x8_t *)m->state)[1]; 352 353 for (; n > 0; n--) { 354 __asm__ __volatile__ ( 355 "vld1.f32 d0, [%[src]]! \n\t" 356 "vtbl.8 d1, {d0}, %P[t0] \n\t" 357 "vtbl.8 d2, {d0}, %P[t1] \n\t" 358 "vst1.s16 {d1,d2}, [%[dst]]! \n\t" 359 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 360 : [t0] "w" (t0), [t1] "w" (t1) /* input operands */ 361 : "memory", "d0", "d1", "d2" /* clobber list */ 362 ); 363 } 364} 365 366static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) { 367 const uint8x8_t t0 = ((uint8x8_t *)m->state)[0]; 368 const uint8x8_t t1 = ((uint8x8_t *)m->state)[1]; 369 370 for (; n > 0; n--) { 371 __asm__ __volatile__ ( 372 "vld1.f32 {d0,d1}, [%[src]]! \n\t" 373 "vtbl.8 d2, {d0,d1}, %P[t0] \n\t" 374 "vtbl.8 d3, {d0,d1}, %P[t1] \n\t" 375 "vst1.s16 {d2,d3}, [%[dst]]! \n\t" 376 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */ 377 : [t0] "w" (t0), [t1] "w" (t1) /* input operands */ 378 : "memory", "d0", "d1", "d2", "d3" /* clobber list */ 379 ); 380 } 381} 382 383static pa_cpu_arm_flag_t arm_flags; 384 385static void init_remap_neon(pa_remap_t *m) { 386 unsigned n_oc, n_ic; 387 int8_t arrange[PA_CHANNELS_MAX]; 388 389 n_oc = m->o_ss.channels; 390 n_ic = m->i_ss.channels; 391 392 /* We short-circuit remap function selection for S32NE in most 393 * cases as the corresponding generic C code is performing 394 * similarly or even better. However there are a few cases where 395 * there actually is a significant improvement from using 396 * hand-crafted NEON assembly so we cannot just bail out for S32NE 397 * here. 
    if (n_ic == 1 && n_oc == 2 &&
            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
        if (m->format == PA_SAMPLE_S32NE)
            return;
        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {

            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
        }
        else {
            pa_log_info("Using ARM NEON mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
        }
    } else if (n_ic == 1 && n_oc == 4 &&
            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {

        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON mono to 4-channel remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
            NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
    } else if (n_ic == 2 && n_oc == 1 &&
            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {

        pa_log_info("Using ARM NEON stereo to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
            (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
    } else if (n_ic == 4 && n_oc == 1 &&
            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {

        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON 4-channel to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
            NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
    } else if (pa_setup_remap_arrange(m, arrange) &&
            ((n_ic == 2 && n_oc == 2) ||
             (n_ic == 2 && n_oc == 4) ||
             (n_ic == 4 && n_oc == 4))) {
        unsigned o;

        if (n_ic == 2 && n_oc == 2) {
            if (m->format == PA_SAMPLE_S32NE)
                return;
            pa_log_info("Using NEON stereo arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
        } else if (n_ic == 2 && n_oc == 4) {
            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
        } else if (n_ic == 4 && n_oc == 4) {
            if (m->format == PA_SAMPLE_S32NE)
                return;
            pa_log_info("Using NEON 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
        }

        /* setup state */
        switch (m->format) {
            case PA_SAMPLE_S16NE: {
                uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
                for (o = 0; o < 4; o++) {
                    if (arrange[o % n_oc] >= 0) {
                        /* convert channel index to vtbl indices */
                        unsigned frame = o / n_oc;
                        ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
                        ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
                    } else {
                        /* use invalid table indices to map to 0 */
                        ((uint8_t *) t)[o * 2 + 0] = 0xff;
                        ((uint8_t *) t)[o * 2 + 1] = 0xff;
                    }
                }
                break;
            }
            case PA_SAMPLE_S32NE:
                /* fall-through */
            case PA_SAMPLE_FLOAT32NE: {
                uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
                for (o = 0; o < n_oc; o++) {
                    if (arrange[o] >= 0) {
                        /* convert channel index to vtbl indices */
                        ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
                        ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
                        ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
                        ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
                    } else {
                        /* use invalid table indices to map to 0 */
                        ((uint8_t *) t)[o * 4 + 0] = 0xff;
                        ((uint8_t *) t)[o * 4 + 1] = 0xff;
                        ((uint8_t *) t)[o * 4 + 2] = 0xff;
                        ((uint8_t *) t)[o * 4 + 3] = 0xff;
                    }
                }
                break;
            }
            default:
                pa_assert_not_reached();
        }
    } else if (n_ic == 4 && n_oc == 4) {
        unsigned i, o;

        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON 4-channel remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
            (pa_do_remap_func_t) NULL,
            (pa_do_remap_func_t) remap_ch4_float32ne_neon);

        /* setup state */
        switch (m->format) {
            case PA_SAMPLE_S16NE: {
                int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
                for (o = 0; o < 4; o++) {
                    for (i = 0; i < 4; i++) {
                        ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
                    }
                }
                break;
            }
            case PA_SAMPLE_FLOAT32NE: {
                float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
                for (o = 0; o < 4; o++) {
                    for (i = 0; i < 4; i++) {
                        ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
                    }
                }
                break;
            }
            default:
                pa_assert_not_reached();
        }
    }
}

void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
    pa_log_info("Initialising ARM NEON optimized remappers.");
    arm_flags = flags;
    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
}