/***
  This file is part of PulseAudio.

  Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License,
  or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.
***/
1653a5a1b3Sopenharmony_ci
1753a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H
1853a5a1b3Sopenharmony_ci#include <config.h>
1953a5a1b3Sopenharmony_ci#endif
2053a5a1b3Sopenharmony_ci
2153a5a1b3Sopenharmony_ci#include <pulse/sample.h>
2253a5a1b3Sopenharmony_ci#include <pulse/xmalloc.h>
2353a5a1b3Sopenharmony_ci#include <pulsecore/log.h>
2453a5a1b3Sopenharmony_ci#include <pulsecore/macro.h>
2553a5a1b3Sopenharmony_ci
2653a5a1b3Sopenharmony_ci#include "cpu-arm.h"
2753a5a1b3Sopenharmony_ci#include "remap.h"
2853a5a1b3Sopenharmony_ci
2953a5a1b3Sopenharmony_ci#include <arm_neon.h>
3053a5a1b3Sopenharmony_ci
3153a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
3253a5a1b3Sopenharmony_ci    for (; n >= 4; n -= 4) {
3353a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
3453a5a1b3Sopenharmony_ci            "vld1.32    {q0}, [%[src]]!         \n\t"
3553a5a1b3Sopenharmony_ci            "vmov       q1, q0                  \n\t"
3653a5a1b3Sopenharmony_ci            "vst2.32    {q0,q1}, [%[dst]]!      \n\t"
3753a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
3853a5a1b3Sopenharmony_ci            : /* input operands */
3953a5a1b3Sopenharmony_ci            : "memory", "q0", "q1" /* clobber list */
4053a5a1b3Sopenharmony_ci        );
4153a5a1b3Sopenharmony_ci    }
4253a5a1b3Sopenharmony_ci
4353a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
4453a5a1b3Sopenharmony_ci        dst[0] = dst[1] = src[0];
4553a5a1b3Sopenharmony_ci        src++;
4653a5a1b3Sopenharmony_ci        dst += 2;
4753a5a1b3Sopenharmony_ci    }
4853a5a1b3Sopenharmony_ci}
4953a5a1b3Sopenharmony_ci
5053a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
5153a5a1b3Sopenharmony_ci    for (; n >= 2; n -= 2) {
5253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
5353a5a1b3Sopenharmony_ci            "ldm        %[src]!, {r4,r6}        \n\t"
5453a5a1b3Sopenharmony_ci            "mov        r5, r4                  \n\t"
5553a5a1b3Sopenharmony_ci
5653a5a1b3Sopenharmony_ci            /* We use r12 instead of r7 here, because r7 is reserved for the
5753a5a1b3Sopenharmony_ci             * frame pointer when using Thumb. */
5853a5a1b3Sopenharmony_ci            "mov        r12, r6                 \n\t"
5953a5a1b3Sopenharmony_ci
6053a5a1b3Sopenharmony_ci            "stm        %[dst]!, {r4-r6,r12}    \n\t"
6153a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
6253a5a1b3Sopenharmony_ci            : /* input operands */
6353a5a1b3Sopenharmony_ci            : "memory", "r4", "r5", "r6", "r12" /* clobber list */
6453a5a1b3Sopenharmony_ci        );
6553a5a1b3Sopenharmony_ci    }
6653a5a1b3Sopenharmony_ci
6753a5a1b3Sopenharmony_ci    if (n > 0)
6853a5a1b3Sopenharmony_ci        dst[0] = dst[1] = src[0];
6953a5a1b3Sopenharmony_ci}
7053a5a1b3Sopenharmony_ci
7153a5a1b3Sopenharmony_cistatic void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
7253a5a1b3Sopenharmony_ci    for (; n >= 8; n -= 8) {
7353a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
7453a5a1b3Sopenharmony_ci            "vld1.16    {q0}, [%[src]]!         \n\t"
7553a5a1b3Sopenharmony_ci            "vmov       q1, q0                  \n\t"
7653a5a1b3Sopenharmony_ci            "vst2.16    {q0,q1}, [%[dst]]!      \n\t"
7753a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
7853a5a1b3Sopenharmony_ci            : /* input operands */
7953a5a1b3Sopenharmony_ci            : "memory", "q0", "q1" /* clobber list */
8053a5a1b3Sopenharmony_ci        );
8153a5a1b3Sopenharmony_ci    }
8253a5a1b3Sopenharmony_ci
8353a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
8453a5a1b3Sopenharmony_ci        dst[0] = dst[1] = src[0];
8553a5a1b3Sopenharmony_ci        src++;
8653a5a1b3Sopenharmony_ci        dst += 2;
8753a5a1b3Sopenharmony_ci    }
8853a5a1b3Sopenharmony_ci}
8953a5a1b3Sopenharmony_ci
9053a5a1b3Sopenharmony_cistatic void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
9153a5a1b3Sopenharmony_ci    for (; n >= 2; n -= 2) {
9253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
9353a5a1b3Sopenharmony_ci            "vld1.32    {d0}, [%[src]]!         \n\t"
9453a5a1b3Sopenharmony_ci            "vdup.f32   q1, d0[0]               \n\t"
9553a5a1b3Sopenharmony_ci            "vdup.f32   q2, d0[1]               \n\t"
9653a5a1b3Sopenharmony_ci            "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
9753a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
9853a5a1b3Sopenharmony_ci            : /* input operands */
9953a5a1b3Sopenharmony_ci            : "memory", "q0", "q1", "q2" /* clobber list */
10053a5a1b3Sopenharmony_ci        );
10153a5a1b3Sopenharmony_ci    }
10253a5a1b3Sopenharmony_ci
10353a5a1b3Sopenharmony_ci    if (n--)
10453a5a1b3Sopenharmony_ci        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
10553a5a1b3Sopenharmony_ci}
10653a5a1b3Sopenharmony_ci
10753a5a1b3Sopenharmony_cistatic void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
10853a5a1b3Sopenharmony_ci    for (; n >= 4; n -= 4) {
10953a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
11053a5a1b3Sopenharmony_ci            "vld1.16    {d0}, [%[src]]!         \n\t"
11153a5a1b3Sopenharmony_ci            "vdup.s16   d1, d0[1]               \n\t"
11253a5a1b3Sopenharmony_ci            "vdup.s16   d2, d0[2]               \n\t"
11353a5a1b3Sopenharmony_ci            "vdup.s16   d3, d0[3]               \n\t"
11453a5a1b3Sopenharmony_ci            "vdup.s16   d0, d0[0]               \n\t"
11553a5a1b3Sopenharmony_ci            "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t"
11653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
11753a5a1b3Sopenharmony_ci            : /* input operands */
11853a5a1b3Sopenharmony_ci            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
11953a5a1b3Sopenharmony_ci        );
12053a5a1b3Sopenharmony_ci    }
12153a5a1b3Sopenharmony_ci
12253a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
12353a5a1b3Sopenharmony_ci        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
12453a5a1b3Sopenharmony_ci        src++;
12553a5a1b3Sopenharmony_ci        dst += 4;
12653a5a1b3Sopenharmony_ci    }
12753a5a1b3Sopenharmony_ci}
12853a5a1b3Sopenharmony_ci
12953a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
13053a5a1b3Sopenharmony_ci    const float32x4_t halve = vdupq_n_f32(0.5f);
13153a5a1b3Sopenharmony_ci    for (; n >= 4; n -= 4) {
13253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
13353a5a1b3Sopenharmony_ci            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
13453a5a1b3Sopenharmony_ci            "vadd.f32   q0, q0, q1              \n\t"
13553a5a1b3Sopenharmony_ci            "vmul.f32   q0, q0, %q[halve]       \n\t"
13653a5a1b3Sopenharmony_ci            "vst1.32    {q0}, [%[dst]]!         \n\t"
13753a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
13853a5a1b3Sopenharmony_ci            : [halve] "w" (halve) /* input operands */
13953a5a1b3Sopenharmony_ci            : "memory", "q0", "q1" /* clobber list */
14053a5a1b3Sopenharmony_ci        );
14153a5a1b3Sopenharmony_ci    }
14253a5a1b3Sopenharmony_ci
14353a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
14453a5a1b3Sopenharmony_ci        dst[0] = (src[0] + src[1])*0.5f;
14553a5a1b3Sopenharmony_ci        src += 2;
14653a5a1b3Sopenharmony_ci        dst++;
14753a5a1b3Sopenharmony_ci    }
14853a5a1b3Sopenharmony_ci}
14953a5a1b3Sopenharmony_ci
15053a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
15153a5a1b3Sopenharmony_ci    for (; n >= 4; n -= 4) {
15253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
15353a5a1b3Sopenharmony_ci            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
15453a5a1b3Sopenharmony_ci            "vrhadd.s32 q0, q0, q1              \n\t"
15553a5a1b3Sopenharmony_ci            "vst1.32    {q0}, [%[dst]]!         \n\t"
15653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
15753a5a1b3Sopenharmony_ci            : /* input operands */
15853a5a1b3Sopenharmony_ci            : "memory", "q0", "q1" /* clobber list */
15953a5a1b3Sopenharmony_ci        );
16053a5a1b3Sopenharmony_ci    }
16153a5a1b3Sopenharmony_ci
16253a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
16353a5a1b3Sopenharmony_ci        dst[0] = src[0]/2 + src[1]/2;
16453a5a1b3Sopenharmony_ci        src += 2;
16553a5a1b3Sopenharmony_ci        dst++;
16653a5a1b3Sopenharmony_ci    }
16753a5a1b3Sopenharmony_ci}
16853a5a1b3Sopenharmony_ci
16953a5a1b3Sopenharmony_cistatic void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
17053a5a1b3Sopenharmony_ci    for (; n >= 8; n -= 8) {
17153a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
17253a5a1b3Sopenharmony_ci            "vld2.16    {q0,q1}, [%[src]]!      \n\t"
17353a5a1b3Sopenharmony_ci            "vrhadd.s16 q0, q0, q1              \n\t"
17453a5a1b3Sopenharmony_ci            "vst1.16    {q0}, [%[dst]]!         \n\t"
17553a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
17653a5a1b3Sopenharmony_ci            : /* input operands */
17753a5a1b3Sopenharmony_ci            : "memory", "q0", "q1" /* clobber list */
17853a5a1b3Sopenharmony_ci        );
17953a5a1b3Sopenharmony_ci    }
18053a5a1b3Sopenharmony_ci
18153a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
18253a5a1b3Sopenharmony_ci        dst[0] = (src[0] + src[1])/2;
18353a5a1b3Sopenharmony_ci        src += 2;
18453a5a1b3Sopenharmony_ci        dst++;
18553a5a1b3Sopenharmony_ci    }
18653a5a1b3Sopenharmony_ci}
18753a5a1b3Sopenharmony_ci
18853a5a1b3Sopenharmony_cistatic void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
18953a5a1b3Sopenharmony_ci    const float32x2_t quart = vdup_n_f32(0.25f);
19053a5a1b3Sopenharmony_ci    for (; n >= 2; n -= 2) {
19153a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
19253a5a1b3Sopenharmony_ci            "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t"
19353a5a1b3Sopenharmony_ci            "vadd.f32   d0, d0, d1              \n\t"
19453a5a1b3Sopenharmony_ci            "vadd.f32   d2, d2, d3              \n\t"
19553a5a1b3Sopenharmony_ci            "vadd.f32   d0, d0, d2              \n\t"
19653a5a1b3Sopenharmony_ci            "vmul.f32   d0, d0, %P[quart]       \n\t"
19753a5a1b3Sopenharmony_ci            "vst1.32    {d0}, [%[dst]]!         \n\t"
19853a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
19953a5a1b3Sopenharmony_ci            : [quart] "w" (quart) /* input operands */
20053a5a1b3Sopenharmony_ci            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
20153a5a1b3Sopenharmony_ci        );
20253a5a1b3Sopenharmony_ci    }
20353a5a1b3Sopenharmony_ci
20453a5a1b3Sopenharmony_ci    if (n > 0)
20553a5a1b3Sopenharmony_ci        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
20653a5a1b3Sopenharmony_ci}
20753a5a1b3Sopenharmony_ci
20853a5a1b3Sopenharmony_cistatic void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
20953a5a1b3Sopenharmony_ci    for (; n >= 4; n -= 4) {
21053a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
21153a5a1b3Sopenharmony_ci            "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t"
21253a5a1b3Sopenharmony_ci            "vrhadd.s16 d0, d0, d1              \n\t"
21353a5a1b3Sopenharmony_ci            "vrhadd.s16 d2, d2, d3              \n\t"
21453a5a1b3Sopenharmony_ci            "vrhadd.s16 d0, d0, d2              \n\t"
21553a5a1b3Sopenharmony_ci            "vst1.16    {d0}, [%[dst]]!         \n\t"
21653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
21753a5a1b3Sopenharmony_ci            : /* input operands */
21853a5a1b3Sopenharmony_ci            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
21953a5a1b3Sopenharmony_ci        );
22053a5a1b3Sopenharmony_ci    }
22153a5a1b3Sopenharmony_ci
22253a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
22353a5a1b3Sopenharmony_ci        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
22453a5a1b3Sopenharmony_ci        src += 4;
22553a5a1b3Sopenharmony_ci        dst++;
22653a5a1b3Sopenharmony_ci    }
22753a5a1b3Sopenharmony_ci}
22853a5a1b3Sopenharmony_ci
22953a5a1b3Sopenharmony_cistatic void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
23053a5a1b3Sopenharmony_ci    int32x4_t *f = m->state;
23153a5a1b3Sopenharmony_ci    const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
23253a5a1b3Sopenharmony_ci
23353a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
23453a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
23553a5a1b3Sopenharmony_ci            "vld1.16    {d0}, [%[src]]!         \n\t"
23653a5a1b3Sopenharmony_ci            "vmovl.s16  q0, d0                  \n\t"
23753a5a1b3Sopenharmony_ci            "vdup.s32   q1, d0[0]               \n\t"
23853a5a1b3Sopenharmony_ci            "vmul.s32   q1, q1, %q[f0]          \n\t"
23953a5a1b3Sopenharmony_ci            "vdup.s32   q2, d0[1]               \n\t"
24053a5a1b3Sopenharmony_ci            "vmla.s32   q1, q2, %q[f1]          \n\t"
24153a5a1b3Sopenharmony_ci            "vdup.s32   q2, d1[0]               \n\t"
24253a5a1b3Sopenharmony_ci            "vmla.s32   q1, q2, %q[f2]          \n\t"
24353a5a1b3Sopenharmony_ci            "vdup.s32   q2, d1[1]               \n\t"
24453a5a1b3Sopenharmony_ci            "vmla.s32   q1, q2, %q[f3]          \n\t"
24553a5a1b3Sopenharmony_ci            "vqshrn.s32  d2, q1, #16            \n\t"
24653a5a1b3Sopenharmony_ci            "vst1.32    {d2}, [%[dst]]!         \n\t"
24753a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src)
24853a5a1b3Sopenharmony_ci            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
24953a5a1b3Sopenharmony_ci            : "memory", "q0", "q1", "q2"
25053a5a1b3Sopenharmony_ci        );
25153a5a1b3Sopenharmony_ci    }
25253a5a1b3Sopenharmony_ci}
25353a5a1b3Sopenharmony_ci
25453a5a1b3Sopenharmony_cistatic void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
25553a5a1b3Sopenharmony_ci    float32x4_t *f = m->state;
25653a5a1b3Sopenharmony_ci    const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
25753a5a1b3Sopenharmony_ci
25853a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
25953a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
26053a5a1b3Sopenharmony_ci            "vld1.32    {d0,d1}, [%[src]]!      \n\t"
26153a5a1b3Sopenharmony_ci            "vdup.f32   q1, d0[0]               \n\t"
26253a5a1b3Sopenharmony_ci            "vmul.f32   q1, q1, %q[f0]          \n\t"
26353a5a1b3Sopenharmony_ci            "vdup.f32   q2, d0[1]               \n\t"
26453a5a1b3Sopenharmony_ci            "vmla.f32   q1, q2, %q[f1]          \n\t"
26553a5a1b3Sopenharmony_ci            "vdup.f32   q2, d1[0]               \n\t"
26653a5a1b3Sopenharmony_ci            "vmla.f32   q1, q2, %q[f2]          \n\t"
26753a5a1b3Sopenharmony_ci            "vdup.f32   q2, d1[1]               \n\t"
26853a5a1b3Sopenharmony_ci            "vmla.f32   q1, q2, %q[f3]          \n\t"
26953a5a1b3Sopenharmony_ci            "vst1.32    {d2,d3}, [%[dst]]!      \n\t"
27053a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src)
27153a5a1b3Sopenharmony_ci            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
27253a5a1b3Sopenharmony_ci            : "memory", "q0", "q1", "q2"
27353a5a1b3Sopenharmony_ci        );
27453a5a1b3Sopenharmony_ci    }
27553a5a1b3Sopenharmony_ci}
27653a5a1b3Sopenharmony_ci
27753a5a1b3Sopenharmony_cistatic void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
27853a5a1b3Sopenharmony_ci    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
27953a5a1b3Sopenharmony_ci
28053a5a1b3Sopenharmony_ci    for (; n >= 2; n -= 2) {
28153a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
28253a5a1b3Sopenharmony_ci            "vld1.s16   d0, [%[src]]!           \n\t"
28353a5a1b3Sopenharmony_ci            "vtbl.8     d0, {d0}, %P[t]         \n\t"
28453a5a1b3Sopenharmony_ci            "vst1.s16   d0, [%[dst]]!           \n\t"
28553a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
28653a5a1b3Sopenharmony_ci            : [t] "w" (t) /* input operands */
28753a5a1b3Sopenharmony_ci            : "memory", "d0" /* clobber list */
28853a5a1b3Sopenharmony_ci        );
28953a5a1b3Sopenharmony_ci    }
29053a5a1b3Sopenharmony_ci
29153a5a1b3Sopenharmony_ci    if (n > 0) {
29253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
29353a5a1b3Sopenharmony_ci            "vld1.32   d0[0], [%[src]]!         \n\t"
29453a5a1b3Sopenharmony_ci            "vtbl.8    d0, {d0}, %P[t]          \n\t"
29553a5a1b3Sopenharmony_ci            "vst1.32   d0[0], [%[dst]]!         \n\t"
29653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
29753a5a1b3Sopenharmony_ci            : [t] "w" (t) /* input operands */
29853a5a1b3Sopenharmony_ci            : "memory", "d0" /* clobber list */
29953a5a1b3Sopenharmony_ci        );
30053a5a1b3Sopenharmony_ci    }
30153a5a1b3Sopenharmony_ci}
30253a5a1b3Sopenharmony_ci
30353a5a1b3Sopenharmony_cistatic void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
30453a5a1b3Sopenharmony_ci    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
30553a5a1b3Sopenharmony_ci
30653a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
30753a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
30853a5a1b3Sopenharmony_ci            "vld1.32    d0[0], [%[src]]!           \n\t"
30953a5a1b3Sopenharmony_ci            "vtbl.8     d0, {d0}, %P[t]            \n\t"
31053a5a1b3Sopenharmony_ci            "vst1.s16   d0, [%[dst]]!              \n\t"
31153a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
31253a5a1b3Sopenharmony_ci            : [t] "w" (t) /* input operands */
31353a5a1b3Sopenharmony_ci            : "memory", "d0" /* clobber list */
31453a5a1b3Sopenharmony_ci        );
31553a5a1b3Sopenharmony_ci    }
31653a5a1b3Sopenharmony_ci}
31753a5a1b3Sopenharmony_ci
31853a5a1b3Sopenharmony_cistatic void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
31953a5a1b3Sopenharmony_ci    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
32053a5a1b3Sopenharmony_ci
32153a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
32253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
32353a5a1b3Sopenharmony_ci            "vld1.s16   d0, [%[src]]!           \n\t"
32453a5a1b3Sopenharmony_ci            "vtbl.8     d0, {d0}, %P[t]         \n\t"
32553a5a1b3Sopenharmony_ci            "vst1.s16   d0, [%[dst]]!           \n\t"
32653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
32753a5a1b3Sopenharmony_ci            : [t] "w" (t) /* input operands */
32853a5a1b3Sopenharmony_ci            : "memory", "d0" /* clobber list */
32953a5a1b3Sopenharmony_ci        );
33053a5a1b3Sopenharmony_ci    }
33153a5a1b3Sopenharmony_ci}
33253a5a1b3Sopenharmony_ci
33353a5a1b3Sopenharmony_cistatic void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
33453a5a1b3Sopenharmony_ci    const uint8x8_t t = ((uint8x8_t *)m->state)[0];
33553a5a1b3Sopenharmony_ci
33653a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
33753a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
33853a5a1b3Sopenharmony_ci            "vld1.f32   d0, [%[src]]!           \n\t"
33953a5a1b3Sopenharmony_ci            "vtbl.8     d0, {d0}, %P[t]         \n\t"
34053a5a1b3Sopenharmony_ci            "vst1.s16   {d0}, [%[dst]]!         \n\t"
34153a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
34253a5a1b3Sopenharmony_ci            : [t] "w" (t) /* input operands */
34353a5a1b3Sopenharmony_ci            : "memory", "d0" /* clobber list */
34453a5a1b3Sopenharmony_ci        );
34553a5a1b3Sopenharmony_ci    }
34653a5a1b3Sopenharmony_ci}
34753a5a1b3Sopenharmony_ci
34853a5a1b3Sopenharmony_ci/* Works for both S32NE and FLOAT32NE */
34953a5a1b3Sopenharmony_cistatic void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
35053a5a1b3Sopenharmony_ci    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
35153a5a1b3Sopenharmony_ci    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
35253a5a1b3Sopenharmony_ci
35353a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
35453a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
35553a5a1b3Sopenharmony_ci            "vld1.f32   d0, [%[src]]!           \n\t"
35653a5a1b3Sopenharmony_ci            "vtbl.8     d1, {d0}, %P[t0]        \n\t"
35753a5a1b3Sopenharmony_ci            "vtbl.8     d2, {d0}, %P[t1]        \n\t"
35853a5a1b3Sopenharmony_ci            "vst1.s16   {d1,d2}, [%[dst]]!      \n\t"
35953a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
36053a5a1b3Sopenharmony_ci            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
36153a5a1b3Sopenharmony_ci            : "memory", "d0", "d1", "d2" /* clobber list */
36253a5a1b3Sopenharmony_ci        );
36353a5a1b3Sopenharmony_ci    }
36453a5a1b3Sopenharmony_ci}
36553a5a1b3Sopenharmony_ci
36653a5a1b3Sopenharmony_cistatic void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
36753a5a1b3Sopenharmony_ci    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
36853a5a1b3Sopenharmony_ci    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
36953a5a1b3Sopenharmony_ci
37053a5a1b3Sopenharmony_ci    for (; n > 0; n--) {
37153a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
37253a5a1b3Sopenharmony_ci            "vld1.f32   {d0,d1}, [%[src]]!      \n\t"
37353a5a1b3Sopenharmony_ci            "vtbl.8     d2, {d0,d1}, %P[t0]     \n\t"
37453a5a1b3Sopenharmony_ci            "vtbl.8     d3, {d0,d1}, %P[t1]     \n\t"
37553a5a1b3Sopenharmony_ci            "vst1.s16   {d2,d3}, [%[dst]]!      \n\t"
37653a5a1b3Sopenharmony_ci            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
37753a5a1b3Sopenharmony_ci            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
37853a5a1b3Sopenharmony_ci            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
37953a5a1b3Sopenharmony_ci        );
38053a5a1b3Sopenharmony_ci    }
38153a5a1b3Sopenharmony_ci}
38253a5a1b3Sopenharmony_ci
/* ARM CPU feature flags stored by pa_remap_func_init_neon(); init_remap_neon()
 * tests PA_CPU_ARM_CORTEX_A8 in them to choose the mono-to-stereo float
 * kernel. */
static pa_cpu_arm_flag_t arm_flags;
38453a5a1b3Sopenharmony_ci
38553a5a1b3Sopenharmony_cistatic void init_remap_neon(pa_remap_t *m) {
38653a5a1b3Sopenharmony_ci    unsigned n_oc, n_ic;
38753a5a1b3Sopenharmony_ci    int8_t arrange[PA_CHANNELS_MAX];
38853a5a1b3Sopenharmony_ci
38953a5a1b3Sopenharmony_ci    n_oc = m->o_ss.channels;
39053a5a1b3Sopenharmony_ci    n_ic = m->i_ss.channels;
39153a5a1b3Sopenharmony_ci
39253a5a1b3Sopenharmony_ci    /* We short-circuit remap function selection for S32NE in most
39353a5a1b3Sopenharmony_ci     * cases as the corresponding generic C code is performing
39453a5a1b3Sopenharmony_ci     * similarly or even better. However there are a few cases where
39553a5a1b3Sopenharmony_ci     * there actually is a significant improvement from using
39653a5a1b3Sopenharmony_ci     * hand-crafted NEON assembly so we cannot just bail out for S32NE
39753a5a1b3Sopenharmony_ci     * here. */
39853a5a1b3Sopenharmony_ci    if (n_ic == 1 && n_oc == 2 &&
39953a5a1b3Sopenharmony_ci            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
40053a5a1b3Sopenharmony_ci        if (m->format == PA_SAMPLE_S32NE)
40153a5a1b3Sopenharmony_ci            return;
40253a5a1b3Sopenharmony_ci        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
40353a5a1b3Sopenharmony_ci
40453a5a1b3Sopenharmony_ci            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
40553a5a1b3Sopenharmony_ci            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
40653a5a1b3Sopenharmony_ci                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
40753a5a1b3Sopenharmony_ci        }
40853a5a1b3Sopenharmony_ci        else {
40953a5a1b3Sopenharmony_ci            pa_log_info("Using ARM NEON mono to stereo remapping");
41053a5a1b3Sopenharmony_ci            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
41153a5a1b3Sopenharmony_ci                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
41253a5a1b3Sopenharmony_ci        }
41353a5a1b3Sopenharmony_ci    } else if (n_ic == 1 && n_oc == 4 &&
41453a5a1b3Sopenharmony_ci            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
41553a5a1b3Sopenharmony_ci            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
41653a5a1b3Sopenharmony_ci
41753a5a1b3Sopenharmony_ci        if (m->format == PA_SAMPLE_S32NE)
41853a5a1b3Sopenharmony_ci            return;
41953a5a1b3Sopenharmony_ci        pa_log_info("Using ARM NEON mono to 4-channel remapping");
42053a5a1b3Sopenharmony_ci        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
42153a5a1b3Sopenharmony_ci            NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
42253a5a1b3Sopenharmony_ci    } else if (n_ic == 2 && n_oc == 1 &&
42353a5a1b3Sopenharmony_ci            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
42453a5a1b3Sopenharmony_ci
42553a5a1b3Sopenharmony_ci        pa_log_info("Using ARM NEON stereo to mono remapping");
42653a5a1b3Sopenharmony_ci        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
42753a5a1b3Sopenharmony_ci            (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
42853a5a1b3Sopenharmony_ci            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
42953a5a1b3Sopenharmony_ci    } else if (n_ic == 4 && n_oc == 1 &&
43053a5a1b3Sopenharmony_ci            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
43153a5a1b3Sopenharmony_ci            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
43253a5a1b3Sopenharmony_ci
43353a5a1b3Sopenharmony_ci        if (m->format == PA_SAMPLE_S32NE)
43453a5a1b3Sopenharmony_ci            return;
43553a5a1b3Sopenharmony_ci        pa_log_info("Using ARM NEON 4-channel to mono remapping");
43653a5a1b3Sopenharmony_ci        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
43753a5a1b3Sopenharmony_ci            NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
43853a5a1b3Sopenharmony_ci    } else if (pa_setup_remap_arrange(m, arrange) &&
43953a5a1b3Sopenharmony_ci        ((n_ic == 2 && n_oc == 2) ||
44053a5a1b3Sopenharmony_ci         (n_ic == 2 && n_oc == 4) ||
44153a5a1b3Sopenharmony_ci         (n_ic == 4 && n_oc == 4))) {
44253a5a1b3Sopenharmony_ci        unsigned o;
44353a5a1b3Sopenharmony_ci
44453a5a1b3Sopenharmony_ci        if (n_ic == 2 && n_oc == 2) {
44553a5a1b3Sopenharmony_ci            if (m->format == PA_SAMPLE_S32NE)
44653a5a1b3Sopenharmony_ci                return;
44753a5a1b3Sopenharmony_ci            pa_log_info("Using NEON stereo arrange remapping");
44853a5a1b3Sopenharmony_ci            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
44953a5a1b3Sopenharmony_ci                NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
45053a5a1b3Sopenharmony_ci        } else if (n_ic == 2 && n_oc == 4) {
45153a5a1b3Sopenharmony_ci            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
45253a5a1b3Sopenharmony_ci            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
45353a5a1b3Sopenharmony_ci                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
45453a5a1b3Sopenharmony_ci                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
45553a5a1b3Sopenharmony_ci        } else if (n_ic == 4 && n_oc == 4) {
45653a5a1b3Sopenharmony_ci            if (m->format == PA_SAMPLE_S32NE)
45753a5a1b3Sopenharmony_ci                return;
45853a5a1b3Sopenharmony_ci            pa_log_info("Using NEON 4-channel arrange remapping");
45953a5a1b3Sopenharmony_ci            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
46053a5a1b3Sopenharmony_ci                NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
46153a5a1b3Sopenharmony_ci        }
46253a5a1b3Sopenharmony_ci
46353a5a1b3Sopenharmony_ci        /* setup state */
46453a5a1b3Sopenharmony_ci        switch (m->format) {
46553a5a1b3Sopenharmony_ci        case PA_SAMPLE_S16NE: {
46653a5a1b3Sopenharmony_ci            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
46753a5a1b3Sopenharmony_ci            for (o = 0; o < 4; o++) {
46853a5a1b3Sopenharmony_ci                if (arrange[o % n_oc] >= 0) {
46953a5a1b3Sopenharmony_ci                    /* convert channel index to vtbl indices */
47053a5a1b3Sopenharmony_ci                    unsigned frame = o / n_oc;
47153a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
47253a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
47353a5a1b3Sopenharmony_ci                } else {
47453a5a1b3Sopenharmony_ci                    /* use invalid table indices to map to 0 */
47553a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 2 + 0] = 0xff;
47653a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 2 + 1] = 0xff;
47753a5a1b3Sopenharmony_ci                }
47853a5a1b3Sopenharmony_ci            }
47953a5a1b3Sopenharmony_ci            break;
48053a5a1b3Sopenharmony_ci        }
48153a5a1b3Sopenharmony_ci        case PA_SAMPLE_S32NE:
48253a5a1b3Sopenharmony_ci                /* fall-through */
48353a5a1b3Sopenharmony_ci        case PA_SAMPLE_FLOAT32NE: {
48453a5a1b3Sopenharmony_ci            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
48553a5a1b3Sopenharmony_ci            for (o = 0; o < n_oc; o++) {
48653a5a1b3Sopenharmony_ci                if (arrange[o] >= 0) {
48753a5a1b3Sopenharmony_ci                    /* convert channel index to vtbl indices */
48853a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
48953a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
49053a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
49153a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
49253a5a1b3Sopenharmony_ci                } else {
49353a5a1b3Sopenharmony_ci                    /* use invalid table indices to map to 0 */
49453a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 0] = 0xff;
49553a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 1] = 0xff;
49653a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 2] = 0xff;
49753a5a1b3Sopenharmony_ci                    ((uint8_t *) t)[o * 4 + 3] = 0xff;
49853a5a1b3Sopenharmony_ci                }
49953a5a1b3Sopenharmony_ci            }
50053a5a1b3Sopenharmony_ci            break;
50153a5a1b3Sopenharmony_ci        }
50253a5a1b3Sopenharmony_ci        default:
50353a5a1b3Sopenharmony_ci            pa_assert_not_reached();
50453a5a1b3Sopenharmony_ci        }
50553a5a1b3Sopenharmony_ci    } else if (n_ic == 4 && n_oc == 4) {
50653a5a1b3Sopenharmony_ci        unsigned i, o;
50753a5a1b3Sopenharmony_ci
50853a5a1b3Sopenharmony_ci        if (m->format == PA_SAMPLE_S32NE)
50953a5a1b3Sopenharmony_ci            return;
51053a5a1b3Sopenharmony_ci        pa_log_info("Using ARM NEON 4-channel remapping");
51153a5a1b3Sopenharmony_ci        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
51253a5a1b3Sopenharmony_ci            (pa_do_remap_func_t) NULL,
51353a5a1b3Sopenharmony_ci            (pa_do_remap_func_t) remap_ch4_float32ne_neon);
51453a5a1b3Sopenharmony_ci
51553a5a1b3Sopenharmony_ci        /* setup state */
51653a5a1b3Sopenharmony_ci        switch (m->format) {
51753a5a1b3Sopenharmony_ci        case PA_SAMPLE_S16NE: {
51853a5a1b3Sopenharmony_ci            int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
51953a5a1b3Sopenharmony_ci            for (o = 0; o < 4; o++) {
52053a5a1b3Sopenharmony_ci                for (i = 0; i < 4; i++) {
52153a5a1b3Sopenharmony_ci                    ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
52253a5a1b3Sopenharmony_ci                }
52353a5a1b3Sopenharmony_ci            }
52453a5a1b3Sopenharmony_ci            break;
52553a5a1b3Sopenharmony_ci        }
52653a5a1b3Sopenharmony_ci        case PA_SAMPLE_FLOAT32NE: {
52753a5a1b3Sopenharmony_ci            float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
52853a5a1b3Sopenharmony_ci            for (o = 0; o < 4; o++) {
52953a5a1b3Sopenharmony_ci                for (i = 0; i < 4; i++) {
53053a5a1b3Sopenharmony_ci                    ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
53153a5a1b3Sopenharmony_ci                }
53253a5a1b3Sopenharmony_ci            }
53353a5a1b3Sopenharmony_ci            break;
53453a5a1b3Sopenharmony_ci        }
53553a5a1b3Sopenharmony_ci        default:
53653a5a1b3Sopenharmony_ci            pa_assert_not_reached();
53753a5a1b3Sopenharmony_ci        }
53853a5a1b3Sopenharmony_ci    }
53953a5a1b3Sopenharmony_ci}
54053a5a1b3Sopenharmony_ci
/* Register the NEON remap initialiser.
 *
 * flags - ARM CPU feature flags; they are saved in arm_flags, where
 *         init_remap_neon() later reads them.
 *
 * Logs a message, then installs init_remap_neon() through
 * pa_set_init_remap_func(). */
void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
    pa_log_info("Initialising ARM NEON optimized remappers.");
    /* Saved before registration so the flags are already in place when
     * init_remap_neon() runs. */
    arm_flags = flags;
    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
}
546