153a5a1b3Sopenharmony_ci/***
253a5a1b3Sopenharmony_ci  This file is part of PulseAudio.
353a5a1b3Sopenharmony_ci
453a5a1b3Sopenharmony_ci  Copyright 2013 Peter Meerwald <pmeerw@pmeerw.net>
553a5a1b3Sopenharmony_ci
653a5a1b3Sopenharmony_ci  PulseAudio is free software; you can redistribute it and/or modify
753a5a1b3Sopenharmony_ci  it under the terms of the GNU Lesser General Public License as published
853a5a1b3Sopenharmony_ci  by the Free Software Foundation; either version 2.1 of the License,
953a5a1b3Sopenharmony_ci  or (at your option) any later version.
1053a5a1b3Sopenharmony_ci
1153a5a1b3Sopenharmony_ci  PulseAudio is distributed in the hope that it will be useful, but
1253a5a1b3Sopenharmony_ci  WITHOUT ANY WARRANTY; without even the implied warranty of
1353a5a1b3Sopenharmony_ci  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1453a5a1b3Sopenharmony_ci  General Public License for more details.
1553a5a1b3Sopenharmony_ci***/
1653a5a1b3Sopenharmony_ci
1753a5a1b3Sopenharmony_ci#ifdef HAVE_CONFIG_H
1853a5a1b3Sopenharmony_ci#include <config.h>
1953a5a1b3Sopenharmony_ci#endif
2053a5a1b3Sopenharmony_ci
2153a5a1b3Sopenharmony_ci#include <pulsecore/macro.h>
2253a5a1b3Sopenharmony_ci#include <pulsecore/endianmacros.h>
2353a5a1b3Sopenharmony_ci#include <pulsecore/sample-util.h>
2453a5a1b3Sopenharmony_ci
2553a5a1b3Sopenharmony_ci#include "cpu-arm.h"
2653a5a1b3Sopenharmony_ci#include "mix.h"
2753a5a1b3Sopenharmony_ci
2853a5a1b3Sopenharmony_ci#include <arm_neon.h>
2953a5a1b3Sopenharmony_ci
/* Mix function that was registered for PA_SAMPLE_S16NE before the NEON
 * version was installed (saved by pa_mix_func_init_neon()). It is called for
 * the stream/channel combinations, and for the leftover bytes, that the NEON
 * routines in this file do not handle themselves. */
static pa_do_mix_func_t fallback;
3153a5a1b3Sopenharmony_ci
/* special case: mix s16ne streams, 2 channels each
 *
 * Mixes nstreams interleaved 2-channel signed 16-bit streams into data.
 * length is the size of the output in bytes. The bulk of the buffer is
 * processed 16 bytes (4 frames) per iteration with NEON; the remaining
 * (length & 15) bytes are handed to the saved fallback mixer.
 *
 * Each stream's ptr is advanced past the samples consumed here, so the
 * fallback call continues where the NEON loop stopped. */
static void pa_mix_ch2_s16ne_neon(pa_mix_info streams[], unsigned nstreams, uint8_t *data, unsigned length) {
    /* mask == 15: one NEON iteration covers 16 bytes
     * (4 frames x 2 channels x 2 bytes per sample). */
    const unsigned mask = sizeof(int16_t) * 8 - 1;
    /* End of the part of the buffer that is a whole multiple of 16 bytes. */
    const uint8_t *end = data + (length & ~mask);

    while (data < end) {
        /* 32-bit accumulators: sum0 for channel 0, sum1 for channel 1. */
        int32x4_t sum0, sum1;
        unsigned i;

        /* Zero both accumulators (x ^ x == 0 whatever the previous content). */
        __asm__ __volatile__ (
            "veor.s32 %q[sum0], %q[sum0]     \n\t"
            "veor.s32 %q[sum1], %q[sum1]     \n\t"
            : [sum0] "=w" (sum0), [sum1] "=w" (sum1)
            :
            : "cc" /* clobber list */
        );

        for (i = 0; i < nstreams; i++) {
            pa_mix_info *m = streams + i;
            /* Per-channel integer volume factors of this stream. */
            int32_t cv0 = m->linear[0].i;
            int32_t cv1 = m->linear[1].i;

            /* vld2 de-interleaves 4 frames: channel 0 samples go to d0,
             * channel 1 samples to d2, and m->ptr is post-incremented by the
             * 16 bytes read. Each sample is widened to 32 bits and shifted
             * left by 15, then multiplied by its channel volume with vqdmulh
             * (saturating doubling multiply, high half), which amounts to
             * (sample * cv) >> 16. The products are added to the accumulators
             * with saturation. d0-d4 are used as scratch registers. */
            __asm__ __volatile__ (
                "vld2.s16    {d0,d2}, [%[ptr]]!      \n\t"
                "vmov.s32    d4[0], %[cv0]           \n\t"
                "vmov.s32    d4[1], %[cv1]           \n\t"
                "vshll.s16   q0, d0, #15             \n\t"
                "vshll.s16   q1, d2, #15             \n\t"
                "vqdmulh.s32 q0, q0, d4[0]           \n\t"
                "vqdmulh.s32 q1, q1, d4[1]           \n\t"
                "vqadd.s32   %q[sum0], %q[sum0], q0  \n\t"
                "vqadd.s32   %q[sum1], %q[sum1], q1  \n\t"
                : [ptr] "+r" (m->ptr), [sum0] "+w" (sum0), [sum1] "+w" (sum1)
                : [cv0] "r" (cv0), [cv1] "r" (cv1)
                : "memory", "cc", "q0", "q1", "d4" /* clobber list */
            );
        }

        /* Narrow both accumulators to 16 bits with saturation and store them
         * re-interleaved; data is post-incremented by the 16 bytes written. */
        __asm__ __volatile__ (
            "vqmovn.s32 d0, %q[sum0]         \n\t"
            "vqmovn.s32 d1, %q[sum1]         \n\t"
            "vst2.s16   {d0,d1}, [%[data]]!  \n\t"
            : [data] "+r" (data)
            : [sum0] "w" (sum0), [sum1] "w" (sum1)
            : "memory", "cc", "q0" /* clobber list */
        );
    }

    /* Let the fallback mixer handle the trailing bytes (fewer than 16). */
    fallback(streams, nstreams, 2, data, length & mask);
}
8253a5a1b3Sopenharmony_ci
8353a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 1 channel each */
8453a5a1b3Sopenharmony_cistatic void pa_mix2_ch1_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
8553a5a1b3Sopenharmony_ci    const int16_t *ptr0 = streams[0].ptr;
8653a5a1b3Sopenharmony_ci    const int16_t *ptr1 = streams[1].ptr;
8753a5a1b3Sopenharmony_ci
8853a5a1b3Sopenharmony_ci    int32x4_t sv0, sv1;
8953a5a1b3Sopenharmony_ci    __asm__ __volatile__ (
9053a5a1b3Sopenharmony_ci        "vdup.s32    %q[sv0], %[lin0]        \n\t"
9153a5a1b3Sopenharmony_ci        "vdup.s32    %q[sv1], %[lin1]        \n\t"
9253a5a1b3Sopenharmony_ci        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
9353a5a1b3Sopenharmony_ci        : [lin0] "r" (streams[0].linear[0]), [lin1] "r" (streams[1].linear[0])
9453a5a1b3Sopenharmony_ci        : /* clobber list */
9553a5a1b3Sopenharmony_ci    );
9653a5a1b3Sopenharmony_ci
9753a5a1b3Sopenharmony_ci    length /= sizeof(int16_t);
9853a5a1b3Sopenharmony_ci    for (; length >= 4; length -= 4) {
9953a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
10053a5a1b3Sopenharmony_ci            "vld1.s16    d0, [%[ptr0]]!      \n\t"
10153a5a1b3Sopenharmony_ci            "vld1.s16    d2, [%[ptr1]]!      \n\t"
10253a5a1b3Sopenharmony_ci            "vshll.s16   q0, d0, #15         \n\t"
10353a5a1b3Sopenharmony_ci            "vshll.s16   q1, d2, #15         \n\t"
10453a5a1b3Sopenharmony_ci            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
10553a5a1b3Sopenharmony_ci            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
10653a5a1b3Sopenharmony_ci            "vqadd.s32   q0, q0, q1          \n\t"
10753a5a1b3Sopenharmony_ci            "vqmovn.s32  d0, q0              \n\t"
10853a5a1b3Sopenharmony_ci            "vst1.s16    d0, [%[data]]!      \n\t"
10953a5a1b3Sopenharmony_ci            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
11053a5a1b3Sopenharmony_ci            : [sv0] "w" (sv0), [sv1] "w" (sv1)
11153a5a1b3Sopenharmony_ci            : "memory", "cc", "q0", "q1" /* clobber list */
11253a5a1b3Sopenharmony_ci        );
11353a5a1b3Sopenharmony_ci    }
11453a5a1b3Sopenharmony_ci
11553a5a1b3Sopenharmony_ci    for (; length > 0; length--) {
11653a5a1b3Sopenharmony_ci        int32_t sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i);
11753a5a1b3Sopenharmony_ci        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
11853a5a1b3Sopenharmony_ci        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
11953a5a1b3Sopenharmony_ci    }
12053a5a1b3Sopenharmony_ci}
12153a5a1b3Sopenharmony_ci
12253a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 2 channel each */
12353a5a1b3Sopenharmony_cistatic void pa_mix2_ch2_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
12453a5a1b3Sopenharmony_ci    const int16_t *ptr0 = streams[0].ptr;
12553a5a1b3Sopenharmony_ci    const int16_t *ptr1 = streams[1].ptr;
12653a5a1b3Sopenharmony_ci
12753a5a1b3Sopenharmony_ci    int32x4_t sv0, sv1;
12853a5a1b3Sopenharmony_ci    __asm__ __volatile__ (
12953a5a1b3Sopenharmony_ci        "vld1.s32 d0, [%[lin0]]              \n\t"
13053a5a1b3Sopenharmony_ci        "vmov.s32 d1, d0                     \n\t"
13153a5a1b3Sopenharmony_ci        "vmov.s32 %q[sv0], q0                \n\t"
13253a5a1b3Sopenharmony_ci        "vld1.s32 d0, [%[lin1]]              \n\t"
13353a5a1b3Sopenharmony_ci        "vmov.s32 d1, d0                     \n\t"
13453a5a1b3Sopenharmony_ci        "vmov.s32 %q[sv1], q0                \n\t"
13553a5a1b3Sopenharmony_ci        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
13653a5a1b3Sopenharmony_ci        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
13753a5a1b3Sopenharmony_ci        : "q0" /* clobber list */
13853a5a1b3Sopenharmony_ci    );
13953a5a1b3Sopenharmony_ci
14053a5a1b3Sopenharmony_ci    length /= sizeof(int16_t);
14153a5a1b3Sopenharmony_ci    for (; length >= 4; length -= 4) {
14253a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
14353a5a1b3Sopenharmony_ci            "vld1.s16    d0, [%[ptr0]]!      \n\t"
14453a5a1b3Sopenharmony_ci            "vld1.s16    d2, [%[ptr1]]!      \n\t"
14553a5a1b3Sopenharmony_ci            "vshll.s16   q0, d0, #15         \n\t"
14653a5a1b3Sopenharmony_ci            "vshll.s16   q1, d2, #15         \n\t"
14753a5a1b3Sopenharmony_ci            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
14853a5a1b3Sopenharmony_ci            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
14953a5a1b3Sopenharmony_ci            "vqadd.s32   q0, q0, q1          \n\t"
15053a5a1b3Sopenharmony_ci            "vqmovn.s32  d0, q0              \n\t"
15153a5a1b3Sopenharmony_ci            "vst1.s16    d0, [%[data]]!      \n\t"
15253a5a1b3Sopenharmony_ci            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
15353a5a1b3Sopenharmony_ci            : [sv0] "w" (sv0), [sv1] "w" (sv1)
15453a5a1b3Sopenharmony_ci            : "memory", "cc", "q0", "q1" /* clobber list */
15553a5a1b3Sopenharmony_ci        );
15653a5a1b3Sopenharmony_ci    }
15753a5a1b3Sopenharmony_ci
15853a5a1b3Sopenharmony_ci    if (length > 0) {
15953a5a1b3Sopenharmony_ci        int32_t sum;
16053a5a1b3Sopenharmony_ci
16153a5a1b3Sopenharmony_ci        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i);
16253a5a1b3Sopenharmony_ci        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
16353a5a1b3Sopenharmony_ci        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
16453a5a1b3Sopenharmony_ci
16553a5a1b3Sopenharmony_ci        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[1].i);
16653a5a1b3Sopenharmony_ci        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[1].i);
16753a5a1b3Sopenharmony_ci        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
16853a5a1b3Sopenharmony_ci    }
16953a5a1b3Sopenharmony_ci}
17053a5a1b3Sopenharmony_ci
17153a5a1b3Sopenharmony_ci/* special case: mix 2 s16ne streams, 4 channels each */
17253a5a1b3Sopenharmony_cistatic void pa_mix2_ch4_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
17353a5a1b3Sopenharmony_ci    const int16_t *ptr0 = streams[0].ptr;
17453a5a1b3Sopenharmony_ci    const int16_t *ptr1 = streams[1].ptr;
17553a5a1b3Sopenharmony_ci
17653a5a1b3Sopenharmony_ci    int32x4_t sv0, sv1;
17753a5a1b3Sopenharmony_ci
17853a5a1b3Sopenharmony_ci    __asm__ __volatile__ (
17953a5a1b3Sopenharmony_ci        "vld1.s32 %h[sv0], [%[lin0]]         \n\t"
18053a5a1b3Sopenharmony_ci        "vld1.s32 %h[sv1], [%[lin1]]         \n\t"
18153a5a1b3Sopenharmony_ci        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
18253a5a1b3Sopenharmony_ci        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
18353a5a1b3Sopenharmony_ci        : /* clobber list */
18453a5a1b3Sopenharmony_ci    );
18553a5a1b3Sopenharmony_ci
18653a5a1b3Sopenharmony_ci    length /= sizeof(int16_t);
18753a5a1b3Sopenharmony_ci    for (; length >= 4; length -= 4) {
18853a5a1b3Sopenharmony_ci        __asm__ __volatile__ (
18953a5a1b3Sopenharmony_ci            "vld1.s16    d0, [%[ptr0]]!      \n\t"
19053a5a1b3Sopenharmony_ci            "vld1.s16    d2, [%[ptr1]]!      \n\t"
19153a5a1b3Sopenharmony_ci            "vshll.s16   q0, d0, #15         \n\t"
19253a5a1b3Sopenharmony_ci            "vshll.s16   q1, d2, #15         \n\t"
19353a5a1b3Sopenharmony_ci            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
19453a5a1b3Sopenharmony_ci            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
19553a5a1b3Sopenharmony_ci            "vqadd.s32   q0, q0, q1          \n\t"
19653a5a1b3Sopenharmony_ci            "vqmovn.s32  d0, q0              \n\t"
19753a5a1b3Sopenharmony_ci            "vst1.s16    d0, [%[data]]!      \n\t"
19853a5a1b3Sopenharmony_ci            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
19953a5a1b3Sopenharmony_ci            : [sv0] "w" (sv0), [sv1] "w" (sv1)
20053a5a1b3Sopenharmony_ci            : "memory", "cc", "q0", "q1" /* clobber list */
20153a5a1b3Sopenharmony_ci        );
20253a5a1b3Sopenharmony_ci    }
20353a5a1b3Sopenharmony_ci}
20453a5a1b3Sopenharmony_ci
20553a5a1b3Sopenharmony_cistatic void pa_mix_s16ne_neon(pa_mix_info streams[], unsigned nstreams, unsigned nchannels, void *data, unsigned length) {
20653a5a1b3Sopenharmony_ci    if (nstreams == 2 && nchannels == 2)
20753a5a1b3Sopenharmony_ci        pa_mix2_ch2_s16ne_neon(streams, data, length);
20853a5a1b3Sopenharmony_ci    else if (nstreams == 2 && nchannels == 4)
20953a5a1b3Sopenharmony_ci        pa_mix2_ch4_s16ne_neon(streams, data, length);
21053a5a1b3Sopenharmony_ci    else if (nstreams == 2 && nchannels == 1)
21153a5a1b3Sopenharmony_ci        pa_mix2_ch1_s16ne_neon(streams, data, length);
21253a5a1b3Sopenharmony_ci    else if (nchannels == 2)
21353a5a1b3Sopenharmony_ci        pa_mix_ch2_s16ne_neon(streams, nstreams, data, length);
21453a5a1b3Sopenharmony_ci    else
21553a5a1b3Sopenharmony_ci        fallback(streams, nstreams, nchannels, data, length);
21653a5a1b3Sopenharmony_ci}
21753a5a1b3Sopenharmony_ci
/* Installs the NEON mixer for PA_SAMPLE_S16NE.
 *
 * The mix function registered so far is saved in fallback first, because
 * the NEON code calls it for the cases it does not handle itself.
 * flags is not used by this function. */
void pa_mix_func_init_neon(pa_cpu_arm_flag_t flags) {
    pa_log_info("Initialising ARM NEON optimized mixing functions.");

    /* Must be read before pa_set_mix_func() replaces it. */
    fallback = pa_get_mix_func(PA_SAMPLE_S16NE);
    pa_set_mix_func(PA_SAMPLE_S16NE, (pa_do_mix_func_t) pa_mix_s16ne_neon);
}
224