1/***
2  This file is part of PulseAudio.
3
4  Copyright 2013 Peter Meerwald <pmeerw@pmeerw.net>
5
6  PulseAudio is free software; you can redistribute it and/or modify
7  it under the terms of the GNU Lesser General Public License as published
8  by the Free Software Foundation; either version 2.1 of the License,
9  or (at your option) any later version.
10
11  PulseAudio is distributed in the hope that it will be useful, but
12  WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  General Public License for more details.
15***/
16
17#ifdef HAVE_CONFIG_H
18#include <config.h>
19#endif
20
21#include <pulsecore/macro.h>
22#include <pulsecore/endianmacros.h>
23#include <pulsecore/sample-util.h>
24
25#include "cpu-arm.h"
26#include "mix.h"
27
28#include <arm_neon.h>
29
/* Mixer that was registered for PA_SAMPLE_S16NE before the NEON version was
 * installed. Set by pa_mix_func_init_neon() and called for every stream and
 * channel layout (and every buffer tail) the NEON code does not handle. */
static pa_do_mix_func_t fallback;
31
32/* special case: mix s16ne streams, 2 channels each */
33static void pa_mix_ch2_s16ne_neon(pa_mix_info streams[], unsigned nstreams, uint8_t *data, unsigned length) {
34    const unsigned mask = sizeof(int16_t) * 8 - 1;
35    const uint8_t *end = data + (length & ~mask);
36
37    while (data < end) {
38        int32x4_t sum0, sum1;
39        unsigned i;
40
41        __asm__ __volatile__ (
42            "veor.s32 %q[sum0], %q[sum0]     \n\t"
43            "veor.s32 %q[sum1], %q[sum1]     \n\t"
44            : [sum0] "=w" (sum0), [sum1] "=w" (sum1)
45            :
46            : "cc" /* clobber list */
47        );
48
49        for (i = 0; i < nstreams; i++) {
50            pa_mix_info *m = streams + i;
51            int32_t cv0 = m->linear[0].i;
52            int32_t cv1 = m->linear[1].i;
53
54            __asm__ __volatile__ (
55                "vld2.s16    {d0,d2}, [%[ptr]]!      \n\t"
56                "vmov.s32    d4[0], %[cv0]           \n\t"
57                "vmov.s32    d4[1], %[cv1]           \n\t"
58                "vshll.s16   q0, d0, #15             \n\t"
59                "vshll.s16   q1, d2, #15             \n\t"
60                "vqdmulh.s32 q0, q0, d4[0]           \n\t"
61                "vqdmulh.s32 q1, q1, d4[1]           \n\t"
62                "vqadd.s32   %q[sum0], %q[sum0], q0  \n\t"
63                "vqadd.s32   %q[sum1], %q[sum1], q1  \n\t"
64                : [ptr] "+r" (m->ptr), [sum0] "+w" (sum0), [sum1] "+w" (sum1)
65                : [cv0] "r" (cv0), [cv1] "r" (cv1)
66                : "memory", "cc", "q0", "q1", "d4" /* clobber list */
67            );
68        }
69
70        __asm__ __volatile__ (
71            "vqmovn.s32 d0, %q[sum0]         \n\t"
72            "vqmovn.s32 d1, %q[sum1]         \n\t"
73            "vst2.s16   {d0,d1}, [%[data]]!  \n\t"
74            : [data] "+r" (data)
75            : [sum0] "w" (sum0), [sum1] "w" (sum1)
76            : "memory", "cc", "q0" /* clobber list */
77        );
78    }
79
80    fallback(streams, nstreams, 2, data, length & mask);
81}
82
83/* special case: mix 2 s16ne streams, 1 channel each */
84static void pa_mix2_ch1_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
85    const int16_t *ptr0 = streams[0].ptr;
86    const int16_t *ptr1 = streams[1].ptr;
87
88    int32x4_t sv0, sv1;
89    __asm__ __volatile__ (
90        "vdup.s32    %q[sv0], %[lin0]        \n\t"
91        "vdup.s32    %q[sv1], %[lin1]        \n\t"
92        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
93        : [lin0] "r" (streams[0].linear[0]), [lin1] "r" (streams[1].linear[0])
94        : /* clobber list */
95    );
96
97    length /= sizeof(int16_t);
98    for (; length >= 4; length -= 4) {
99        __asm__ __volatile__ (
100            "vld1.s16    d0, [%[ptr0]]!      \n\t"
101            "vld1.s16    d2, [%[ptr1]]!      \n\t"
102            "vshll.s16   q0, d0, #15         \n\t"
103            "vshll.s16   q1, d2, #15         \n\t"
104            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
105            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
106            "vqadd.s32   q0, q0, q1          \n\t"
107            "vqmovn.s32  d0, q0              \n\t"
108            "vst1.s16    d0, [%[data]]!      \n\t"
109            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
110            : [sv0] "w" (sv0), [sv1] "w" (sv1)
111            : "memory", "cc", "q0", "q1" /* clobber list */
112        );
113    }
114
115    for (; length > 0; length--) {
116        int32_t sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i);
117        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
118        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
119    }
120}
121
122/* special case: mix 2 s16ne streams, 2 channel each */
123static void pa_mix2_ch2_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
124    const int16_t *ptr0 = streams[0].ptr;
125    const int16_t *ptr1 = streams[1].ptr;
126
127    int32x4_t sv0, sv1;
128    __asm__ __volatile__ (
129        "vld1.s32 d0, [%[lin0]]              \n\t"
130        "vmov.s32 d1, d0                     \n\t"
131        "vmov.s32 %q[sv0], q0                \n\t"
132        "vld1.s32 d0, [%[lin1]]              \n\t"
133        "vmov.s32 d1, d0                     \n\t"
134        "vmov.s32 %q[sv1], q0                \n\t"
135        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
136        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
137        : "q0" /* clobber list */
138    );
139
140    length /= sizeof(int16_t);
141    for (; length >= 4; length -= 4) {
142        __asm__ __volatile__ (
143            "vld1.s16    d0, [%[ptr0]]!      \n\t"
144            "vld1.s16    d2, [%[ptr1]]!      \n\t"
145            "vshll.s16   q0, d0, #15         \n\t"
146            "vshll.s16   q1, d2, #15         \n\t"
147            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
148            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
149            "vqadd.s32   q0, q0, q1          \n\t"
150            "vqmovn.s32  d0, q0              \n\t"
151            "vst1.s16    d0, [%[data]]!      \n\t"
152            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
153            : [sv0] "w" (sv0), [sv1] "w" (sv1)
154            : "memory", "cc", "q0", "q1" /* clobber list */
155        );
156    }
157
158    if (length > 0) {
159        int32_t sum;
160
161        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i);
162        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
163        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
164
165        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[1].i);
166        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[1].i);
167        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
168    }
169}
170
171/* special case: mix 2 s16ne streams, 4 channels each */
172static void pa_mix2_ch4_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
173    const int16_t *ptr0 = streams[0].ptr;
174    const int16_t *ptr1 = streams[1].ptr;
175
176    int32x4_t sv0, sv1;
177
178    __asm__ __volatile__ (
179        "vld1.s32 %h[sv0], [%[lin0]]         \n\t"
180        "vld1.s32 %h[sv1], [%[lin1]]         \n\t"
181        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
182        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
183        : /* clobber list */
184    );
185
186    length /= sizeof(int16_t);
187    for (; length >= 4; length -= 4) {
188        __asm__ __volatile__ (
189            "vld1.s16    d0, [%[ptr0]]!      \n\t"
190            "vld1.s16    d2, [%[ptr1]]!      \n\t"
191            "vshll.s16   q0, d0, #15         \n\t"
192            "vshll.s16   q1, d2, #15         \n\t"
193            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t"
194            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
195            "vqadd.s32   q0, q0, q1          \n\t"
196            "vqmovn.s32  d0, q0              \n\t"
197            "vst1.s16    d0, [%[data]]!      \n\t"
198            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
199            : [sv0] "w" (sv0), [sv1] "w" (sv1)
200            : "memory", "cc", "q0", "q1" /* clobber list */
201        );
202    }
203}
204
205static void pa_mix_s16ne_neon(pa_mix_info streams[], unsigned nstreams, unsigned nchannels, void *data, unsigned length) {
206    if (nstreams == 2 && nchannels == 2)
207        pa_mix2_ch2_s16ne_neon(streams, data, length);
208    else if (nstreams == 2 && nchannels == 4)
209        pa_mix2_ch4_s16ne_neon(streams, data, length);
210    else if (nstreams == 2 && nchannels == 1)
211        pa_mix2_ch1_s16ne_neon(streams, data, length);
212    else if (nchannels == 2)
213        pa_mix_ch2_s16ne_neon(streams, nstreams, data, length);
214    else
215        fallback(streams, nstreams, nchannels, data, length);
216}
217
218void pa_mix_func_init_neon(pa_cpu_arm_flag_t flags) {
219    pa_log_info("Initialising ARM NEON optimized mixing functions.");
220
221    fallback = pa_get_mix_func(PA_SAMPLE_S16NE);
222    pa_set_mix_func(PA_SAMPLE_S16NE, (pa_do_mix_func_t) pa_mix_s16ne_neon);
223}
224