1/***
2  This file is part of PulseAudio.
3
4  Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>
5
6  PulseAudio is free software; you can redistribute it and/or modify
7  it under the terms of the GNU Lesser General Public License as published
8  by the Free Software Foundation; either version 2.1 of the License,
9  or (at your option) any later version.
10
11  PulseAudio is distributed in the hope that it will be useful, but
12  WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  General Public License for more details.
15***/
16
17#ifdef HAVE_CONFIG_H
18#include <config.h>
19#endif
20
21#include <pulse/sample.h>
22#include <pulse/xmalloc.h>
23#include <pulsecore/log.h>
24#include <pulsecore/macro.h>
25
26#include "cpu-arm.h"
27#include "remap.h"
28
29#include <arm_neon.h>
30
31static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
32    for (; n >= 4; n -= 4) {
33        __asm__ __volatile__ (
34            "vld1.32    {q0}, [%[src]]!         \n\t"
35            "vmov       q1, q0                  \n\t"
36            "vst2.32    {q0,q1}, [%[dst]]!      \n\t"
37            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
38            : /* input operands */
39            : "memory", "q0", "q1" /* clobber list */
40        );
41    }
42
43    for (; n > 0; n--) {
44        dst[0] = dst[1] = src[0];
45        src++;
46        dst += 2;
47    }
48}
49
50static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
51    for (; n >= 2; n -= 2) {
52        __asm__ __volatile__ (
53            "ldm        %[src]!, {r4,r6}        \n\t"
54            "mov        r5, r4                  \n\t"
55
56            /* We use r12 instead of r7 here, because r7 is reserved for the
57             * frame pointer when using Thumb. */
58            "mov        r12, r6                 \n\t"
59
60            "stm        %[dst]!, {r4-r6,r12}    \n\t"
61            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
62            : /* input operands */
63            : "memory", "r4", "r5", "r6", "r12" /* clobber list */
64        );
65    }
66
67    if (n > 0)
68        dst[0] = dst[1] = src[0];
69}
70
71static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
72    for (; n >= 8; n -= 8) {
73        __asm__ __volatile__ (
74            "vld1.16    {q0}, [%[src]]!         \n\t"
75            "vmov       q1, q0                  \n\t"
76            "vst2.16    {q0,q1}, [%[dst]]!      \n\t"
77            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
78            : /* input operands */
79            : "memory", "q0", "q1" /* clobber list */
80        );
81    }
82
83    for (; n > 0; n--) {
84        dst[0] = dst[1] = src[0];
85        src++;
86        dst += 2;
87    }
88}
89
90static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
91    for (; n >= 2; n -= 2) {
92        __asm__ __volatile__ (
93            "vld1.32    {d0}, [%[src]]!         \n\t"
94            "vdup.f32   q1, d0[0]               \n\t"
95            "vdup.f32   q2, d0[1]               \n\t"
96            "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
97            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
98            : /* input operands */
99            : "memory", "q0", "q1", "q2" /* clobber list */
100        );
101    }
102
103    if (n--)
104        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
105}
106
107static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
108    for (; n >= 4; n -= 4) {
109        __asm__ __volatile__ (
110            "vld1.16    {d0}, [%[src]]!         \n\t"
111            "vdup.s16   d1, d0[1]               \n\t"
112            "vdup.s16   d2, d0[2]               \n\t"
113            "vdup.s16   d3, d0[3]               \n\t"
114            "vdup.s16   d0, d0[0]               \n\t"
115            "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t"
116            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
117            : /* input operands */
118            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
119        );
120    }
121
122    for (; n > 0; n--) {
123        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
124        src++;
125        dst += 4;
126    }
127}
128
129static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
130    const float32x4_t halve = vdupq_n_f32(0.5f);
131    for (; n >= 4; n -= 4) {
132        __asm__ __volatile__ (
133            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
134            "vadd.f32   q0, q0, q1              \n\t"
135            "vmul.f32   q0, q0, %q[halve]       \n\t"
136            "vst1.32    {q0}, [%[dst]]!         \n\t"
137            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
138            : [halve] "w" (halve) /* input operands */
139            : "memory", "q0", "q1" /* clobber list */
140        );
141    }
142
143    for (; n > 0; n--) {
144        dst[0] = (src[0] + src[1])*0.5f;
145        src += 2;
146        dst++;
147    }
148}
149
150static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
151    for (; n >= 4; n -= 4) {
152        __asm__ __volatile__ (
153            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
154            "vrhadd.s32 q0, q0, q1              \n\t"
155            "vst1.32    {q0}, [%[dst]]!         \n\t"
156            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
157            : /* input operands */
158            : "memory", "q0", "q1" /* clobber list */
159        );
160    }
161
162    for (; n > 0; n--) {
163        dst[0] = src[0]/2 + src[1]/2;
164        src += 2;
165        dst++;
166    }
167}
168
169static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
170    for (; n >= 8; n -= 8) {
171        __asm__ __volatile__ (
172            "vld2.16    {q0,q1}, [%[src]]!      \n\t"
173            "vrhadd.s16 q0, q0, q1              \n\t"
174            "vst1.16    {q0}, [%[dst]]!         \n\t"
175            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
176            : /* input operands */
177            : "memory", "q0", "q1" /* clobber list */
178        );
179    }
180
181    for (; n > 0; n--) {
182        dst[0] = (src[0] + src[1])/2;
183        src += 2;
184        dst++;
185    }
186}
187
188static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
189    const float32x2_t quart = vdup_n_f32(0.25f);
190    for (; n >= 2; n -= 2) {
191        __asm__ __volatile__ (
192            "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t"
193            "vadd.f32   d0, d0, d1              \n\t"
194            "vadd.f32   d2, d2, d3              \n\t"
195            "vadd.f32   d0, d0, d2              \n\t"
196            "vmul.f32   d0, d0, %P[quart]       \n\t"
197            "vst1.32    {d0}, [%[dst]]!         \n\t"
198            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
199            : [quart] "w" (quart) /* input operands */
200            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
201        );
202    }
203
204    if (n > 0)
205        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
206}
207
208static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
209    for (; n >= 4; n -= 4) {
210        __asm__ __volatile__ (
211            "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t"
212            "vrhadd.s16 d0, d0, d1              \n\t"
213            "vrhadd.s16 d2, d2, d3              \n\t"
214            "vrhadd.s16 d0, d0, d2              \n\t"
215            "vst1.16    {d0}, [%[dst]]!         \n\t"
216            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
217            : /* input operands */
218            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
219        );
220    }
221
222    for (; n > 0; n--) {
223        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
224        src += 4;
225        dst++;
226    }
227}
228
229static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
230    int32x4_t *f = m->state;
231    const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
232
233    for (; n > 0; n--) {
234        __asm__ __volatile__ (
235            "vld1.16    {d0}, [%[src]]!         \n\t"
236            "vmovl.s16  q0, d0                  \n\t"
237            "vdup.s32   q1, d0[0]               \n\t"
238            "vmul.s32   q1, q1, %q[f0]          \n\t"
239            "vdup.s32   q2, d0[1]               \n\t"
240            "vmla.s32   q1, q2, %q[f1]          \n\t"
241            "vdup.s32   q2, d1[0]               \n\t"
242            "vmla.s32   q1, q2, %q[f2]          \n\t"
243            "vdup.s32   q2, d1[1]               \n\t"
244            "vmla.s32   q1, q2, %q[f3]          \n\t"
245            "vqshrn.s32  d2, q1, #16            \n\t"
246            "vst1.32    {d2}, [%[dst]]!         \n\t"
247            : [dst] "+r" (dst), [src] "+r" (src)
248            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
249            : "memory", "q0", "q1", "q2"
250        );
251    }
252}
253
254static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
255    float32x4_t *f = m->state;
256    const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
257
258    for (; n > 0; n--) {
259        __asm__ __volatile__ (
260            "vld1.32    {d0,d1}, [%[src]]!      \n\t"
261            "vdup.f32   q1, d0[0]               \n\t"
262            "vmul.f32   q1, q1, %q[f0]          \n\t"
263            "vdup.f32   q2, d0[1]               \n\t"
264            "vmla.f32   q1, q2, %q[f1]          \n\t"
265            "vdup.f32   q2, d1[0]               \n\t"
266            "vmla.f32   q1, q2, %q[f2]          \n\t"
267            "vdup.f32   q2, d1[1]               \n\t"
268            "vmla.f32   q1, q2, %q[f3]          \n\t"
269            "vst1.32    {d2,d3}, [%[dst]]!      \n\t"
270            : [dst] "+r" (dst), [src] "+r" (src)
271            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
272            : "memory", "q0", "q1", "q2"
273        );
274    }
275}
276
277static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
278    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
279
280    for (; n >= 2; n -= 2) {
281        __asm__ __volatile__ (
282            "vld1.s16   d0, [%[src]]!           \n\t"
283            "vtbl.8     d0, {d0}, %P[t]         \n\t"
284            "vst1.s16   d0, [%[dst]]!           \n\t"
285            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
286            : [t] "w" (t) /* input operands */
287            : "memory", "d0" /* clobber list */
288        );
289    }
290
291    if (n > 0) {
292        __asm__ __volatile__ (
293            "vld1.32   d0[0], [%[src]]!         \n\t"
294            "vtbl.8    d0, {d0}, %P[t]          \n\t"
295            "vst1.32   d0[0], [%[dst]]!         \n\t"
296            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
297            : [t] "w" (t) /* input operands */
298            : "memory", "d0" /* clobber list */
299        );
300    }
301}
302
303static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
304    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
305
306    for (; n > 0; n--) {
307        __asm__ __volatile__ (
308            "vld1.32    d0[0], [%[src]]!           \n\t"
309            "vtbl.8     d0, {d0}, %P[t]            \n\t"
310            "vst1.s16   d0, [%[dst]]!              \n\t"
311            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
312            : [t] "w" (t) /* input operands */
313            : "memory", "d0" /* clobber list */
314        );
315    }
316}
317
318static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
319    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
320
321    for (; n > 0; n--) {
322        __asm__ __volatile__ (
323            "vld1.s16   d0, [%[src]]!           \n\t"
324            "vtbl.8     d0, {d0}, %P[t]         \n\t"
325            "vst1.s16   d0, [%[dst]]!           \n\t"
326            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
327            : [t] "w" (t) /* input operands */
328            : "memory", "d0" /* clobber list */
329        );
330    }
331}
332
333static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
334    const uint8x8_t t = ((uint8x8_t *)m->state)[0];
335
336    for (; n > 0; n--) {
337        __asm__ __volatile__ (
338            "vld1.f32   d0, [%[src]]!           \n\t"
339            "vtbl.8     d0, {d0}, %P[t]         \n\t"
340            "vst1.s16   {d0}, [%[dst]]!         \n\t"
341            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
342            : [t] "w" (t) /* input operands */
343            : "memory", "d0" /* clobber list */
344        );
345    }
346}
347
348/* Works for both S32NE and FLOAT32NE */
349static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
350    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
351    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
352
353    for (; n > 0; n--) {
354        __asm__ __volatile__ (
355            "vld1.f32   d0, [%[src]]!           \n\t"
356            "vtbl.8     d1, {d0}, %P[t0]        \n\t"
357            "vtbl.8     d2, {d0}, %P[t1]        \n\t"
358            "vst1.s16   {d1,d2}, [%[dst]]!      \n\t"
359            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
360            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
361            : "memory", "d0", "d1", "d2" /* clobber list */
362        );
363    }
364}
365
366static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
367    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
368    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
369
370    for (; n > 0; n--) {
371        __asm__ __volatile__ (
372            "vld1.f32   {d0,d1}, [%[src]]!      \n\t"
373            "vtbl.8     d2, {d0,d1}, %P[t0]     \n\t"
374            "vtbl.8     d3, {d0,d1}, %P[t1]     \n\t"
375            "vst1.s16   {d2,d3}, [%[dst]]!      \n\t"
376            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
377            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
378            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
379        );
380    }
381}
382
383static pa_cpu_arm_flag_t arm_flags;
384
385static void init_remap_neon(pa_remap_t *m) {
386    unsigned n_oc, n_ic;
387    int8_t arrange[PA_CHANNELS_MAX];
388
389    n_oc = m->o_ss.channels;
390    n_ic = m->i_ss.channels;
391
392    /* We short-circuit remap function selection for S32NE in most
393     * cases as the corresponding generic C code is performing
394     * similarly or even better. However there are a few cases where
395     * there actually is a significant improvement from using
396     * hand-crafted NEON assembly so we cannot just bail out for S32NE
397     * here. */
398    if (n_ic == 1 && n_oc == 2 &&
399            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
400        if (m->format == PA_SAMPLE_S32NE)
401            return;
402        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
403
404            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
405            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
406                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
407        }
408        else {
409            pa_log_info("Using ARM NEON mono to stereo remapping");
410            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
411                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
412        }
413    } else if (n_ic == 1 && n_oc == 4 &&
414            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
415            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
416
417        if (m->format == PA_SAMPLE_S32NE)
418            return;
419        pa_log_info("Using ARM NEON mono to 4-channel remapping");
420        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
421            NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
422    } else if (n_ic == 2 && n_oc == 1 &&
423            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
424
425        pa_log_info("Using ARM NEON stereo to mono remapping");
426        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
427            (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
428            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
429    } else if (n_ic == 4 && n_oc == 1 &&
430            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
431            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
432
433        if (m->format == PA_SAMPLE_S32NE)
434            return;
435        pa_log_info("Using ARM NEON 4-channel to mono remapping");
436        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
437            NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
438    } else if (pa_setup_remap_arrange(m, arrange) &&
439        ((n_ic == 2 && n_oc == 2) ||
440         (n_ic == 2 && n_oc == 4) ||
441         (n_ic == 4 && n_oc == 4))) {
442        unsigned o;
443
444        if (n_ic == 2 && n_oc == 2) {
445            if (m->format == PA_SAMPLE_S32NE)
446                return;
447            pa_log_info("Using NEON stereo arrange remapping");
448            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
449                NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
450        } else if (n_ic == 2 && n_oc == 4) {
451            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
452            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
453                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
454                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
455        } else if (n_ic == 4 && n_oc == 4) {
456            if (m->format == PA_SAMPLE_S32NE)
457                return;
458            pa_log_info("Using NEON 4-channel arrange remapping");
459            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
460                NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
461        }
462
463        /* setup state */
464        switch (m->format) {
465        case PA_SAMPLE_S16NE: {
466            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
467            for (o = 0; o < 4; o++) {
468                if (arrange[o % n_oc] >= 0) {
469                    /* convert channel index to vtbl indices */
470                    unsigned frame = o / n_oc;
471                    ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
472                    ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
473                } else {
474                    /* use invalid table indices to map to 0 */
475                    ((uint8_t *) t)[o * 2 + 0] = 0xff;
476                    ((uint8_t *) t)[o * 2 + 1] = 0xff;
477                }
478            }
479            break;
480        }
481        case PA_SAMPLE_S32NE:
482                /* fall-through */
483        case PA_SAMPLE_FLOAT32NE: {
484            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
485            for (o = 0; o < n_oc; o++) {
486                if (arrange[o] >= 0) {
487                    /* convert channel index to vtbl indices */
488                    ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
489                    ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
490                    ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
491                    ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
492                } else {
493                    /* use invalid table indices to map to 0 */
494                    ((uint8_t *) t)[o * 4 + 0] = 0xff;
495                    ((uint8_t *) t)[o * 4 + 1] = 0xff;
496                    ((uint8_t *) t)[o * 4 + 2] = 0xff;
497                    ((uint8_t *) t)[o * 4 + 3] = 0xff;
498                }
499            }
500            break;
501        }
502        default:
503            pa_assert_not_reached();
504        }
505    } else if (n_ic == 4 && n_oc == 4) {
506        unsigned i, o;
507
508        if (m->format == PA_SAMPLE_S32NE)
509            return;
510        pa_log_info("Using ARM NEON 4-channel remapping");
511        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
512            (pa_do_remap_func_t) NULL,
513            (pa_do_remap_func_t) remap_ch4_float32ne_neon);
514
515        /* setup state */
516        switch (m->format) {
517        case PA_SAMPLE_S16NE: {
518            int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
519            for (o = 0; o < 4; o++) {
520                for (i = 0; i < 4; i++) {
521                    ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
522                }
523            }
524            break;
525        }
526        case PA_SAMPLE_FLOAT32NE: {
527            float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
528            for (o = 0; o < 4; o++) {
529                for (i = 0; i < 4; i++) {
530                    ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
531                }
532            }
533            break;
534        }
535        default:
536            pa_assert_not_reached();
537        }
538    }
539}
540
541void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
542    pa_log_info("Initialising ARM NEON optimized remappers.");
543    arm_flags = flags;
544    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
545}
546