1/*
2 * Copyright (c) 2012
3 *      MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * Authors:  Darko Laus      (darko@mips.com)
30 *           Djordje Pesut   (djordje@mips.com)
31 *           Mirjana Vulin   (mvulin@mips.com)
32 *
33 * AAC Spectral Band Replication decoding functions optimized for MIPS
34 *
35 * This file is part of FFmpeg.
36 *
37 * FFmpeg is free software; you can redistribute it and/or
38 * modify it under the terms of the GNU Lesser General Public
39 * License as published by the Free Software Foundation; either
40 * version 2.1 of the License, or (at your option) any later version.
41 *
42 * FFmpeg is distributed in the hope that it will be useful,
43 * but WITHOUT ANY WARRANTY; without even the implied warranty of
44 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
45 * Lesser General Public License for more details.
46 *
47 * You should have received a copy of the GNU Lesser General Public
48 * License along with FFmpeg; if not, write to the Free Software
49 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
50 */
51
52/**
53 * @file
54 * Reference: libavcodec/sbrdsp.c
55 */
56
57#include "config.h"
58#include "libavcodec/sbrdsp.h"
59#include "libavutil/mips/asmdefs.h"
60
61#if HAVE_INLINE_ASM
62#if HAVE_MIPSFPU
/**
 * MIPS inline-asm QMF pre-shuffle: fills z[64..127] from z[0..63].
 *
 * Data is moved with integer loads/stores; a value is negated by XOR-ing its
 * 32-bit word with 0x80000000 (the sign bit), so the FPU is never involved.
 *
 * Result, with indices counted in floats:
 *   - main loop, j = 0..29:
 *         z[66 + 2*j] = -z[63 - j]
 *         z[67 + 2*j] =  z[ 2 + j]
 *   - tail after the loop:
 *         z[126] = -z[33],  z[127] = z[32],
 *         z[64]  =  z[0],   z[65]  = z[1]
 *
 * @param z buffer; z[0..63] are read and z[64..127] are written
 */
63static void sbr_qmf_pre_shuffle_mips(float *z)
64{
65    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6;
66    float *z1 = &z[66];     /* write cursor, advances 10 floats per pass */
67    float *z2 = &z[59];     /* read cursor for the negated values, moves down 5 floats per pass */
68    float *z3 = &z[2];      /* read cursor for the plain copies, moves up 5 floats per pass */
69    float *z4 = z1 + 60;    /* loop end marker (&z[126]): 6 passes of 10 floats */
70
71    /* loop unrolled 5 times: each pass writes 5 (negated, plain) pairs */
72    __asm__ volatile (
        /* Temp6 = 0x80000000, the sign-bit mask */
73        "lui    %[Temp6],   0x8000                  \n\t"
74    "1:                                             \n\t"
        /* fetch five words from z2 and flip their sign bits */
75        "lw     %[Temp1],   0(%[z2])                \n\t"
76        "lw     %[Temp2],   4(%[z2])                \n\t"
77        "lw     %[Temp3],   8(%[z2])                \n\t"
78        "lw     %[Temp4],   12(%[z2])               \n\t"
79        "lw     %[Temp5],   16(%[z2])               \n\t"
80        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
81        "xor    %[Temp2],   %[Temp2],   %[Temp6]    \n\t"
82        "xor    %[Temp3],   %[Temp3],   %[Temp6]    \n\t"
83        "xor    %[Temp4],   %[Temp4],   %[Temp6]    \n\t"
84        "xor    %[Temp5],   %[Temp5],   %[Temp6]    \n\t"
85        PTR_ADDIU "%[z2],   %[z2],      -20         \n\t"
        /* store them in reverse order into the even slots of z1 */
86        "sw     %[Temp1],   32(%[z1])               \n\t"
87        "sw     %[Temp2],   24(%[z1])               \n\t"
88        "sw     %[Temp3],   16(%[z1])               \n\t"
89        "sw     %[Temp4],   8(%[z1])                \n\t"
90        "sw     %[Temp5],   0(%[z1])                \n\t"
        /* copy five words from z3 unchanged into the odd slots of z1 */
91        "lw     %[Temp1],   0(%[z3])                \n\t"
92        "lw     %[Temp2],   4(%[z3])                \n\t"
93        "lw     %[Temp3],   8(%[z3])                \n\t"
94        "lw     %[Temp4],   12(%[z3])               \n\t"
95        "lw     %[Temp5],   16(%[z3])               \n\t"
96        "sw     %[Temp1],   4(%[z1])                \n\t"
97        "sw     %[Temp2],   12(%[z1])               \n\t"
98        "sw     %[Temp3],   20(%[z1])               \n\t"
99        "sw     %[Temp4],   28(%[z1])               \n\t"
100        "sw     %[Temp5],   36(%[z1])               \n\t"
101        PTR_ADDIU "%[z3],   %[z3],      20          \n\t"
102        PTR_ADDIU "%[z1],   %[z1],      40          \n\t"
103        "bne    %[z1],      %[z4],      1b          \n\t"
        /* tail: byte offsets 132/128/0/4 are z[33]/z[32]/z[0]/z[1];
         * 504/508/256/260 are z[126]/z[127]/z[64]/z[65]; only z[33] is negated */
104        "lw     %[Temp1],   132(%[z])               \n\t"
105        "lw     %[Temp2],   128(%[z])               \n\t"
106        "lw     %[Temp3],   0(%[z])                 \n\t"
107        "lw     %[Temp4],   4(%[z])                 \n\t"
108        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
109        "sw     %[Temp1],   504(%[z])               \n\t"
110        "sw     %[Temp2],   508(%[z])               \n\t"
111        "sw     %[Temp3],   256(%[z])               \n\t"
112        "sw     %[Temp4],   260(%[z])               \n\t"
113
        /* scratch registers are early-clobber outputs; the three cursors are
         * read-write; "memory" covers the loads/stores done through them */
114        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
115          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
116          [Temp5]"=&r"(Temp5), [Temp6]"=&r"(Temp6),
117          [z1]"+r"(z1), [z2]"+r"(z2), [z3]"+r"(z3)
118        : [z4]"r"(z4), [z]"r"(z)
119        : "memory"
120    );
121}
122
/**
 * MIPS inline-asm QMF post-shuffle: interleaves the sign-flipped, reversed
 * upper half of z with its lower half into W.
 *
 * With W viewed as 64 consecutive floats, the loop produces for k = 0..31:
 *     W[2*k]     = -z[63 - k]
 *     W[2*k + 1] =  z[k]
 * Negation is done on the raw 32-bit word by XOR with 0x80000000.
 *
 * @param W output, 32 pairs of floats (all 64 floats are written)
 * @param z input; z[0..63] are read
 */
123static void sbr_qmf_post_shuffle_mips(float W[32][2], const float *z)
124{
125    int Temp1, Temp2, Temp3, Temp4, Temp5;
126    float *W_ptr = (float *)W;          /* write cursor, advances 8 floats per pass */
127    float *z1    = (float *)z;          /* forward read cursor; const dropped because the asm advances it */
128    float *z2    = (float *)&z[60];     /* backward read cursor for the negated values */
129    float *z_end = z1 + 32;             /* loop end marker: 8 passes of 4 floats */
130
131     /* loop unrolled 4 times: each pass writes 4 (negated, plain) pairs */
132    __asm__ volatile (
        /* Temp5 = 0x80000000, the sign-bit mask */
133        "lui    %[Temp5],   0x8000                  \n\t"
134    "1:                                             \n\t"
        /* fetch four words from z2 and flip their sign bits */
135        "lw     %[Temp1],   0(%[z2])                \n\t"
136        "lw     %[Temp2],   4(%[z2])                \n\t"
137        "lw     %[Temp3],   8(%[z2])                \n\t"
138        "lw     %[Temp4],   12(%[z2])               \n\t"
139        "xor    %[Temp1],   %[Temp1],   %[Temp5]    \n\t"
140        "xor    %[Temp2],   %[Temp2],   %[Temp5]    \n\t"
141        "xor    %[Temp3],   %[Temp3],   %[Temp5]    \n\t"
142        "xor    %[Temp4],   %[Temp4],   %[Temp5]    \n\t"
143        PTR_ADDIU "%[z2],   %[z2],      -16         \n\t"
        /* store them in reverse order into the even slots of W_ptr */
144        "sw     %[Temp1],   24(%[W_ptr])            \n\t"
145        "sw     %[Temp2],   16(%[W_ptr])            \n\t"
146        "sw     %[Temp3],   8(%[W_ptr])             \n\t"
147        "sw     %[Temp4],   0(%[W_ptr])             \n\t"
        /* copy four words from z1 unchanged into the odd slots of W_ptr */
148        "lw     %[Temp1],   0(%[z1])                \n\t"
149        "lw     %[Temp2],   4(%[z1])                \n\t"
150        "lw     %[Temp3],   8(%[z1])                \n\t"
151        "lw     %[Temp4],   12(%[z1])               \n\t"
152        "sw     %[Temp1],   4(%[W_ptr])             \n\t"
153        "sw     %[Temp2],   12(%[W_ptr])            \n\t"
154        "sw     %[Temp3],   20(%[W_ptr])            \n\t"
155        "sw     %[Temp4],   28(%[W_ptr])            \n\t"
156        PTR_ADDIU "%[z1],   %[z1],      16          \n\t"
157        PTR_ADDIU "%[W_ptr],%[W_ptr],   32          \n\t"
158        "bne    %[z1],      %[z_end],   1b          \n\t"
159
        /* scratch registers are early-clobber outputs; the cursors are
         * read-write; "memory" covers the loads/stores done through them */
160        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
161          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
162          [Temp5]"=&r"(Temp5), [z1]"+r"(z1),
163          [z2]"+r"(z2), [W_ptr]"+r"(W_ptr)
164        : [z_end]"r"(z_end)
165        : "memory"
166    );
167}
168
169#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * In-place sum of five 64-float rows of z, using the MIPS FPU:
 *     z[k] = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256],  k = 0..63
 * The terms are accumulated left to right, in the order written above.
 *
 * The C loop walks z in groups of 8 floats; one asm statement handles a whole
 * group. Byte offsets 256/512/768/1024 in the asm are rows 1..4 (64 floats
 * apart). Loads are interleaved with the additions of previously loaded values.
 *
 * @param z buffer; z[0..319] are read, z[0..63] are overwritten with the sums
 */
170static void sbr_sum64x5_mips(float *z)
171{
172    int k;
173    float *z1;
    /* f1..f8 are never read from C; they only give the asm eight
     * compiler-allocated accumulator registers */
174    float f1, f2, f3, f4, f5, f6, f7, f8;
175    for (k = 0; k < 64; k += 8) {
176
177        z1 = &z[k];
178
179         /* loop unrolled 8 times: one asm statement sums 8 consecutive columns */
180        __asm__ volatile (
            /* rows 0 and 1: f1..f8 = z1[0..7] + z1[64..71] */
181            "lwc1   $f0,    0(%[z1])        \n\t"
182            "lwc1   $f1,    256(%[z1])      \n\t"
183            "lwc1   $f2,    4(%[z1])        \n\t"
184            "lwc1   $f3,    260(%[z1])      \n\t"
185            "lwc1   $f4,    8(%[z1])        \n\t"
186            "add.s  %[f1],  $f0,    $f1     \n\t"
187            "lwc1   $f5,    264(%[z1])      \n\t"
188            "add.s  %[f2],  $f2,    $f3     \n\t"
189            "lwc1   $f6,    12(%[z1])       \n\t"
190            "lwc1   $f7,    268(%[z1])      \n\t"
191            "add.s  %[f3],  $f4,    $f5     \n\t"
192            "lwc1   $f8,    16(%[z1])       \n\t"
193            "lwc1   $f9,    272(%[z1])      \n\t"
194            "add.s  %[f4],  $f6,    $f7     \n\t"
195            "lwc1   $f10,   20(%[z1])       \n\t"
196            "lwc1   $f11,   276(%[z1])      \n\t"
197            "add.s  %[f5],  $f8,    $f9     \n\t"
198            "lwc1   $f12,   24(%[z1])       \n\t"
199            "lwc1   $f13,   280(%[z1])      \n\t"
200            "add.s  %[f6],  $f10,   $f11    \n\t"
201            "lwc1   $f14,   28(%[z1])       \n\t"
202            "lwc1   $f15,   284(%[z1])      \n\t"
203            "add.s  %[f7],  $f12,   $f13    \n\t"
            /* row 2 (z1[128..135]) is loaded into $f0..$f7 and added to f1..f8 */
204            "lwc1   $f0,    512(%[z1])      \n\t"
205            "lwc1   $f1,    516(%[z1])      \n\t"
206            "add.s  %[f8],  $f14,   $f15    \n\t"
207            "lwc1   $f2,    520(%[z1])      \n\t"
208            "add.s  %[f1],  %[f1],  $f0     \n\t"
209            "add.s  %[f2],  %[f2],  $f1     \n\t"
210            "lwc1   $f3,    524(%[z1])      \n\t"
211            "add.s  %[f3],  %[f3],  $f2     \n\t"
212            "lwc1   $f4,    528(%[z1])      \n\t"
213            "lwc1   $f5,    532(%[z1])      \n\t"
214            "add.s  %[f4],  %[f4],  $f3     \n\t"
215            "lwc1   $f6,    536(%[z1])      \n\t"
216            "add.s  %[f5],  %[f5],  $f4     \n\t"
217            "add.s  %[f6],  %[f6],  $f5     \n\t"
218            "lwc1   $f7,    540(%[z1])      \n\t"
219            "add.s  %[f7],  %[f7],  $f6     \n\t"
            /* row 3 (z1[192..199]); the f8 add below still uses row 2's $f7 */
220            "lwc1   $f0,    768(%[z1])      \n\t"
221            "lwc1   $f1,    772(%[z1])      \n\t"
222            "add.s  %[f8],  %[f8],  $f7     \n\t"
223            "lwc1   $f2,    776(%[z1])      \n\t"
224            "add.s  %[f1],  %[f1],  $f0     \n\t"
225            "add.s  %[f2],  %[f2],  $f1     \n\t"
226            "lwc1   $f3,    780(%[z1])      \n\t"
227            "add.s  %[f3],  %[f3],  $f2     \n\t"
228            "lwc1   $f4,    784(%[z1])      \n\t"
229            "lwc1   $f5,    788(%[z1])      \n\t"
230            "add.s  %[f4],  %[f4],  $f3     \n\t"
231            "lwc1   $f6,    792(%[z1])      \n\t"
232            "add.s  %[f5],  %[f5],  $f4     \n\t"
233            "add.s  %[f6],  %[f6],  $f5     \n\t"
234            "lwc1   $f7,    796(%[z1])      \n\t"
235            "add.s  %[f7],  %[f7],  $f6     \n\t"
            /* row 4 (z1[256..263]); the f8 add below still uses row 3's $f7 */
236            "lwc1   $f0,    1024(%[z1])     \n\t"
237            "lwc1   $f1,    1028(%[z1])     \n\t"
238            "add.s  %[f8],  %[f8],  $f7     \n\t"
239            "lwc1   $f2,    1032(%[z1])     \n\t"
240            "add.s  %[f1],  %[f1],  $f0     \n\t"
241            "add.s  %[f2],  %[f2],  $f1     \n\t"
242            "lwc1   $f3,    1036(%[z1])     \n\t"
243            "add.s  %[f3],  %[f3],  $f2     \n\t"
244            "lwc1   $f4,    1040(%[z1])     \n\t"
245            "lwc1   $f5,    1044(%[z1])     \n\t"
246            "add.s  %[f4],  %[f4],  $f3     \n\t"
247            "lwc1   $f6,    1048(%[z1])     \n\t"
248            "add.s  %[f5],  %[f5],  $f4     \n\t"
249            "add.s  %[f6],  %[f6],  $f5     \n\t"
250            "lwc1   $f7,    1052(%[z1])     \n\t"
251            "add.s  %[f7],  %[f7],  $f6     \n\t"
            /* write the eight sums back over row 0 */
252            "swc1   %[f1],  0(%[z1])        \n\t"
253            "swc1   %[f2],  4(%[z1])        \n\t"
254            "add.s  %[f8],  %[f8],  $f7     \n\t"
255            "swc1   %[f3],  8(%[z1])        \n\t"
256            "swc1   %[f4],  12(%[z1])       \n\t"
257            "swc1   %[f5],  16(%[z1])       \n\t"
258            "swc1   %[f6],  20(%[z1])       \n\t"
259            "swc1   %[f7],  24(%[z1])       \n\t"
260            "swc1   %[f8],  28(%[z1])       \n\t"
261
            /* $f0..$f15 are named directly in the template, so they are listed
             * as clobbers; "memory" covers the loads/stores through z1 */
262            : [f1]"=&f"(f1), [f2]"=&f"(f2), [f3]"=&f"(f3),
263              [f4]"=&f"(f4), [f5]"=&f"(f5), [f6]"=&f"(f6),
264              [f7]"=&f"(f7), [f8]"=&f"(f8)
265            : [z1]"r"(z1)
266            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
267              "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
268              "$f12", "$f13", "$f14", "$f15",
269              "memory"
270        );
271    }
272}
273
274static float sbr_sum_square_mips(float (*x)[2], int n)
275{
276    float sum0 = 0.0f, sum1 = 0.0f;
277    float *p_x;
278    float temp0, temp1, temp2, temp3;
279    float *loop_end;
280    p_x = &x[0][0];
281    loop_end = p_x + (n >> 1)*4 - 4;
282
283    __asm__ volatile (
284        ".set      push                                             \n\t"
285        ".set      noreorder                                        \n\t"
286        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
287        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
288        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
289        "lwc1      %[temp3],   12(%[p_x])                           \n\t"
290    "1:                                                             \n\t"
291        PTR_ADDIU "%[p_x],     %[p_x],       16                     \n\t"
292        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
293        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
294        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
295        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
296        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
297        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
298        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
299        "bne       %[p_x],     %[loop_end],  1b                     \n\t"
300        " lwc1     %[temp3],   12(%[p_x])                           \n\t"
301        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
302        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
303        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
304        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
305        ".set      pop                                              \n\t"
306
307        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
308          [temp3]"=&f"(temp3), [sum0]"+f"(sum0), [sum1]"+f"(sum1),
309          [p_x]"+r"(p_x)
310        : [loop_end]"r"(loop_end)
311        : "memory"
312    );
313    return sum0 + sum1;
314}
315
/**
 * Butterfly of src0 with src1 read in reverse, using the MIPS FPU:
 *     v[i]       = src0[i] - src1[63 - i]
 *     v[127 - i] = src0[i] + src1[63 - i],   i = 0..63
 *
 * The C loop runs 4 times; each asm statement handles 16 values as four
 * groups of four (load 4+4, add, subtract, store 4+4) and then moves the
 * forward cursors (src0, v0) up and the backward cursors (src1, v1) down by
 * 64 bytes.
 *
 * @param v    output; v[0..127] are written
 * @param src0 first input; src0[0..63] are read
 * @param src1 second input; src1[0..63] are read, from the end backwards
 */
316static void sbr_qmf_deint_bfly_mips(float *v, const float *src0, const float *src1)
317{
318    int i;
319    float temp0, temp1, temp2, temp3, temp4, temp5;
320    float temp6, temp7, temp8, temp9, temp10, temp11;
321    float *v0 = v;                      /* forward write cursor (differences) */
322    float *v1 = &v[127];                /* backward write cursor (sums) */
    /* const is dropped because the cursors are read-write asm operands;
     * the asm only loads through them */
323    float *psrc0 = (float*)src0;
324    float *psrc1 = (float*)&src1[63];
325
326    for (i = 0; i < 4; i++) {
327
328         /* loop unrolled 16 times: 16 butterflies per asm statement */
329        __asm__ volatile(
            /* group 1: elements 0..3 relative to the cursors */
330            "lwc1       %[temp0],   0(%[src0])             \n\t"
331            "lwc1       %[temp1],   0(%[src1])             \n\t"
332            "lwc1       %[temp3],   4(%[src0])             \n\t"
333            "lwc1       %[temp4],   -4(%[src1])            \n\t"
334            "lwc1       %[temp6],   8(%[src0])             \n\t"
335            "lwc1       %[temp7],   -8(%[src1])            \n\t"
336            "lwc1       %[temp9],   12(%[src0])            \n\t"
337            "lwc1       %[temp10],  -12(%[src1])           \n\t"
            /* sums go to temp2/5/8/11; differences overwrite temp0/3/6/9 */
338            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
339            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
340            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
341            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
342            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
343            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
344            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
345            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
            /* sums are stored downwards from v1, differences upwards from v0 */
346            "swc1       %[temp2],   0(%[v1])               \n\t"
347            "swc1       %[temp0],   0(%[v0])               \n\t"
348            "swc1       %[temp5],   -4(%[v1])              \n\t"
349            "swc1       %[temp3],   4(%[v0])               \n\t"
350            "swc1       %[temp8],   -8(%[v1])              \n\t"
351            "swc1       %[temp6],   8(%[v0])               \n\t"
352            "swc1       %[temp11],  -12(%[v1])             \n\t"
353            "swc1       %[temp9],   12(%[v0])              \n\t"
            /* group 2: elements 4..7 */
354            "lwc1       %[temp0],   16(%[src0])            \n\t"
355            "lwc1       %[temp1],   -16(%[src1])           \n\t"
356            "lwc1       %[temp3],   20(%[src0])            \n\t"
357            "lwc1       %[temp4],   -20(%[src1])           \n\t"
358            "lwc1       %[temp6],   24(%[src0])            \n\t"
359            "lwc1       %[temp7],   -24(%[src1])           \n\t"
360            "lwc1       %[temp9],   28(%[src0])            \n\t"
361            "lwc1       %[temp10],  -28(%[src1])           \n\t"
362            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
363            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
364            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
365            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
366            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
367            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
368            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
369            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
370            "swc1       %[temp2],   -16(%[v1])             \n\t"
371            "swc1       %[temp0],   16(%[v0])              \n\t"
372            "swc1       %[temp5],   -20(%[v1])             \n\t"
373            "swc1       %[temp3],   20(%[v0])              \n\t"
374            "swc1       %[temp8],   -24(%[v1])             \n\t"
375            "swc1       %[temp6],   24(%[v0])              \n\t"
376            "swc1       %[temp11],  -28(%[v1])             \n\t"
377            "swc1       %[temp9],   28(%[v0])              \n\t"
            /* group 3: elements 8..11 */
378            "lwc1       %[temp0],   32(%[src0])            \n\t"
379            "lwc1       %[temp1],   -32(%[src1])           \n\t"
380            "lwc1       %[temp3],   36(%[src0])            \n\t"
381            "lwc1       %[temp4],   -36(%[src1])           \n\t"
382            "lwc1       %[temp6],   40(%[src0])            \n\t"
383            "lwc1       %[temp7],   -40(%[src1])           \n\t"
384            "lwc1       %[temp9],   44(%[src0])            \n\t"
385            "lwc1       %[temp10],  -44(%[src1])           \n\t"
386            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
387            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
388            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
389            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
390            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
391            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
392            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
393            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
394            "swc1       %[temp2],   -32(%[v1])             \n\t"
395            "swc1       %[temp0],   32(%[v0])              \n\t"
396            "swc1       %[temp5],   -36(%[v1])             \n\t"
397            "swc1       %[temp3],   36(%[v0])              \n\t"
398            "swc1       %[temp8],   -40(%[v1])             \n\t"
399            "swc1       %[temp6],   40(%[v0])              \n\t"
400            "swc1       %[temp11],  -44(%[v1])             \n\t"
401            "swc1       %[temp9],   44(%[v0])              \n\t"
            /* group 4: elements 12..15 */
402            "lwc1       %[temp0],   48(%[src0])            \n\t"
403            "lwc1       %[temp1],   -48(%[src1])           \n\t"
404            "lwc1       %[temp3],   52(%[src0])            \n\t"
405            "lwc1       %[temp4],   -52(%[src1])           \n\t"
406            "lwc1       %[temp6],   56(%[src0])            \n\t"
407            "lwc1       %[temp7],   -56(%[src1])           \n\t"
408            "lwc1       %[temp9],   60(%[src0])            \n\t"
409            "lwc1       %[temp10],  -60(%[src1])           \n\t"
410            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
411            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
412            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
413            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
414            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
415            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
416            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
417            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
418            "swc1       %[temp2],   -48(%[v1])             \n\t"
419            "swc1       %[temp0],   48(%[v0])              \n\t"
420            "swc1       %[temp5],   -52(%[v1])             \n\t"
421            "swc1       %[temp3],   52(%[v0])              \n\t"
422            "swc1       %[temp8],   -56(%[v1])             \n\t"
423            "swc1       %[temp6],   56(%[v0])              \n\t"
424            "swc1       %[temp11],  -60(%[v1])             \n\t"
425            "swc1       %[temp9],   60(%[v0])              \n\t"
            /* step all four cursors by 16 floats for the next pass of the C loop */
426            PTR_ADDIU " %[src0],    %[src0],    64         \n\t"
427            PTR_ADDIU " %[src1],    %[src1],    -64        \n\t"
428            PTR_ADDIU " %[v0],      %[v0],      64         \n\t"
429            PTR_ADDIU " %[v1],      %[v1],      -64        \n\t"
430
            /* cursors are read-write so their updated values carry over to the
             * next pass; "memory" covers the loads/stores done through them */
431            : [v0]"+r"(v0), [v1]"+r"(v1), [src0]"+r"(psrc0), [src1]"+r"(psrc1),
432              [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
433              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
434              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
435              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
436            :
437            :"memory"
438        );
439    }
440}
441
442static void sbr_autocorrelate_mips(const float x[40][2], float phi[3][2][2])
443{
444    int i;
445    float real_sum_0 = 0.0f;
446    float real_sum_1 = 0.0f;
447    float real_sum_2 = 0.0f;
448    float imag_sum_1 = 0.0f;
449    float imag_sum_2 = 0.0f;
450    float *p_x, *p_phi;
451    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
452    float temp7, temp_r, temp_r1, temp_r2, temp_r3, temp_r4;
453    p_x = (float*)&x[0][0];
454    p_phi = &phi[0][0][0];
455
456    __asm__ volatile (
457        "lwc1    %[temp0],      8(%[p_x])                           \n\t"
458        "lwc1    %[temp1],      12(%[p_x])                          \n\t"
459        "lwc1    %[temp2],      16(%[p_x])                          \n\t"
460        "lwc1    %[temp3],      20(%[p_x])                          \n\t"
461        "lwc1    %[temp4],      24(%[p_x])                          \n\t"
462        "lwc1    %[temp5],      28(%[p_x])                          \n\t"
463        "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
464        "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
465        "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
466        "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
467        "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
468        "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
469        "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
470        "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
471        "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
472        "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
473        "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
474        "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
475        "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
476        "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
477        "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
478        PTR_ADDIU "%[p_x],      %[p_x],        8                    \n\t"
479
480        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
481          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
482          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
483          [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
484          [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), [temp_r2]"=&f"(temp_r2),
485          [temp_r3]"=&f"(temp_r3), [temp_r4]"=&f"(temp_r4),
486          [p_x]"+r"(p_x), [imag_sum_2]"+f"(imag_sum_2)
487        :
488        : "memory"
489    );
490
491    for (i = 0; i < 12; i++) {
492        __asm__ volatile (
493            "lwc1    %[temp0],      8(%[p_x])                           \n\t"
494            "lwc1    %[temp1],      12(%[p_x])                          \n\t"
495            "lwc1    %[temp2],      16(%[p_x])                          \n\t"
496            "lwc1    %[temp3],      20(%[p_x])                          \n\t"
497            "lwc1    %[temp4],      24(%[p_x])                          \n\t"
498            "lwc1    %[temp5],      28(%[p_x])                          \n\t"
499            "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
500            "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
501            "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
502            "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
503            "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
504            "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
505            "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
506            "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
507            "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
508            "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
509            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
510            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
511            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
512            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
513            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
514            "lwc1    %[temp0],      32(%[p_x])                          \n\t"
515            "lwc1    %[temp1],      36(%[p_x])                          \n\t"
516            "mul.s   %[temp_r],     %[temp3],      %[temp3]             \n\t"
517            "mul.s   %[temp_r1],    %[temp3],      %[temp5]             \n\t"
518            "mul.s   %[temp_r2],    %[temp3],      %[temp4]             \n\t"
519            "mul.s   %[temp_r3],    %[temp3],      %[temp1]             \n\t"
520            "mul.s   %[temp_r4],    %[temp3],      %[temp0]             \n\t"
521            "madd.s  %[temp_r],     %[temp_r],     %[temp2],  %[temp2]  \n\t"
522            "madd.s  %[temp_r1],    %[temp_r1],    %[temp2],  %[temp4]  \n\t"
523            "msub.s  %[temp_r2],    %[temp_r2],    %[temp2],  %[temp5]  \n\t"
524            "madd.s  %[temp_r3],    %[temp_r3],    %[temp2],  %[temp0]  \n\t"
525            "msub.s  %[temp_r4],    %[temp_r4],    %[temp2],  %[temp1]  \n\t"
526            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
527            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
528            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
529            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
530            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
531            "lwc1    %[temp2],      40(%[p_x])                          \n\t"
532            "lwc1    %[temp3],      44(%[p_x])                          \n\t"
533            "mul.s   %[temp_r],     %[temp5],      %[temp5]             \n\t"
534            "mul.s   %[temp_r1],    %[temp5],      %[temp1]             \n\t"
535            "mul.s   %[temp_r2],    %[temp5],      %[temp0]             \n\t"
536            "mul.s   %[temp_r3],    %[temp5],      %[temp3]             \n\t"
537            "mul.s   %[temp_r4],    %[temp5],      %[temp2]             \n\t"
538            "madd.s  %[temp_r],     %[temp_r],     %[temp4],  %[temp4]  \n\t"
539            "madd.s  %[temp_r1],    %[temp_r1],    %[temp4],  %[temp0]  \n\t"
540            "msub.s  %[temp_r2],    %[temp_r2],    %[temp4],  %[temp1]  \n\t"
541            "madd.s  %[temp_r3],    %[temp_r3],    %[temp4],  %[temp2]  \n\t"
542            "msub.s  %[temp_r4],    %[temp_r4],    %[temp4],  %[temp3]  \n\t"
543            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
544            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
545            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
546            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
547            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
548            PTR_ADDIU "%[p_x],      %[p_x],        24                   \n\t"
549
550            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
551              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
552              [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
553              [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
554              [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1),
555              [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
556              [temp_r4]"=&f"(temp_r4), [p_x]"+r"(p_x),
557              [imag_sum_2]"+f"(imag_sum_2)
558            :
559            : "memory"
560        );
561    }
562    __asm__ volatile (
563        "lwc1    %[temp0],    -296(%[p_x])                        \n\t"
564        "lwc1    %[temp1],    -292(%[p_x])                        \n\t"
565        "lwc1    %[temp2],    8(%[p_x])                           \n\t"
566        "lwc1    %[temp3],    12(%[p_x])                          \n\t"
567        "lwc1    %[temp4],    -288(%[p_x])                        \n\t"
568        "lwc1    %[temp5],    -284(%[p_x])                        \n\t"
569        "lwc1    %[temp6],    -280(%[p_x])                        \n\t"
570        "lwc1    %[temp7],    -276(%[p_x])                        \n\t"
571        "madd.s  %[temp_r],   %[real_sum_0], %[temp0],  %[temp0]  \n\t"
572        "madd.s  %[temp_r1],  %[real_sum_0], %[temp2],  %[temp2]  \n\t"
573        "madd.s  %[temp_r2],  %[real_sum_1], %[temp0],  %[temp4]  \n\t"
574        "madd.s  %[temp_r3],  %[imag_sum_1], %[temp0],  %[temp5]  \n\t"
575        "madd.s  %[temp_r],   %[temp_r],     %[temp1],  %[temp1]  \n\t"
576        "madd.s  %[temp_r1],  %[temp_r1],    %[temp3],  %[temp3]  \n\t"
577        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp5]  \n\t"
578        "nmsub.s  %[temp_r3], %[temp_r3],    %[temp1],  %[temp4]  \n\t"
579        "lwc1    %[temp4],    16(%[p_x])                          \n\t"
580        "lwc1    %[temp5],    20(%[p_x])                          \n\t"
581        "swc1    %[temp_r],   40(%[p_phi])                        \n\t"
582        "swc1    %[temp_r1],  16(%[p_phi])                        \n\t"
583        "swc1    %[temp_r2],  24(%[p_phi])                        \n\t"
584        "swc1    %[temp_r3],  28(%[p_phi])                        \n\t"
585        "madd.s  %[temp_r],   %[real_sum_1], %[temp2],  %[temp4]  \n\t"
586        "madd.s  %[temp_r1],  %[imag_sum_1], %[temp2],  %[temp5]  \n\t"
587        "madd.s  %[temp_r2],  %[real_sum_2], %[temp0],  %[temp6]  \n\t"
588        "madd.s  %[temp_r3],  %[imag_sum_2], %[temp0],  %[temp7]  \n\t"
589        "madd.s  %[temp_r],   %[temp_r],     %[temp3],  %[temp5]  \n\t"
590        "nmsub.s %[temp_r1],  %[temp_r1],    %[temp3],  %[temp4]  \n\t"
591        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp7]  \n\t"
592        "nmsub.s %[temp_r3],  %[temp_r3],    %[temp1],  %[temp6]  \n\t"
593        "swc1    %[temp_r],   0(%[p_phi])                         \n\t"
594        "swc1    %[temp_r1],  4(%[p_phi])                         \n\t"
595        "swc1    %[temp_r2],  8(%[p_phi])                         \n\t"
596        "swc1    %[temp_r3],  12(%[p_phi])                        \n\t"
597
598        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
599          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
600          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp_r]"=&f"(temp_r),
601          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
602          [real_sum_2]"+f"(real_sum_2), [imag_sum_1]"+f"(imag_sum_1),
603          [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
604          [temp_r1]"=&f"(temp_r1), [p_phi]"+r"(p_phi),
605          [imag_sum_2]"+f"(imag_sum_2)
606        : [p_x]"r"(p_x)
607        : "memory"
608    );
609}
610
611static void sbr_hf_gen_mips(float (*X_high)[2], const float (*X_low)[2],
612                         const float alpha0[2], const float alpha1[2],
613                         float bw, int start, int end)
614{
615    float alpha[4];
616    int i;
617    float *p_x_low = (float*)&X_low[0][0] + 2*start;
618    float *p_x_high = &X_high[0][0] + 2*start;
619    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
620    float temp7, temp8, temp9, temp10, temp11, temp12;
621
622    alpha[0] = alpha1[0] * bw * bw;
623    alpha[1] = alpha1[1] * bw * bw;
624    alpha[2] = alpha0[0] * bw;
625    alpha[3] = alpha0[1] * bw;
626
627    for (i = start; i < end; i++) {
628        __asm__ volatile (
629            "lwc1    %[temp0],    -16(%[p_x_low])                        \n\t"
630            "lwc1    %[temp1],    -12(%[p_x_low])                        \n\t"
631            "lwc1    %[temp2],    -8(%[p_x_low])                         \n\t"
632            "lwc1    %[temp3],    -4(%[p_x_low])                         \n\t"
633            "lwc1    %[temp5],    0(%[p_x_low])                          \n\t"
634            "lwc1    %[temp6],    4(%[p_x_low])                          \n\t"
635            "lwc1    %[temp7],    0(%[alpha])                            \n\t"
636            "lwc1    %[temp8],    4(%[alpha])                            \n\t"
637            "lwc1    %[temp9],    8(%[alpha])                            \n\t"
638            "lwc1    %[temp10],   12(%[alpha])                           \n\t"
639            PTR_ADDIU "%[p_x_high], %[p_x_high],   8                     \n\t"
640            PTR_ADDIU "%[p_x_low],  %[p_x_low],    8                     \n\t"
641            "mul.s   %[temp11],   %[temp1],        %[temp8]              \n\t"
642            "msub.s  %[temp11],   %[temp11],       %[temp0],  %[temp7]   \n\t"
643            "madd.s  %[temp11],   %[temp11],       %[temp2],  %[temp9]   \n\t"
644            "nmsub.s %[temp11],   %[temp11],       %[temp3],  %[temp10]  \n\t"
645            "add.s   %[temp11],   %[temp11],       %[temp5]              \n\t"
646            "swc1    %[temp11],   -8(%[p_x_high])                        \n\t"
647            "mul.s   %[temp12],   %[temp1],        %[temp7]              \n\t"
648            "madd.s  %[temp12],   %[temp12],       %[temp0],  %[temp8]   \n\t"
649            "madd.s  %[temp12],   %[temp12],       %[temp3],  %[temp9]   \n\t"
650            "madd.s  %[temp12],   %[temp12],       %[temp2],  %[temp10]  \n\t"
651            "add.s   %[temp12],   %[temp12],       %[temp6]              \n\t"
652            "swc1    %[temp12],   -4(%[p_x_high])                        \n\t"
653
654            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
655              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
656              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
657              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
658              [temp12]"=&f"(temp12), [p_x_high]"+r"(p_x_high),
659              [p_x_low]"+r"(p_x_low)
660            : [alpha]"r"(alpha)
661            : "memory"
662        );
663    }
664}
665
666static void sbr_hf_g_filt_mips(float (*Y)[2], const float (*X_high)[40][2],
667                            const float *g_filt, int m_max, intptr_t ixh)
668{
669    const float *p_x, *p_g, *loop_end;
670    float *p_y;
671    float temp0, temp1, temp2;
672
673    p_g = &g_filt[0];
674    p_y = &Y[0][0];
675    p_x = &X_high[0][ixh][0];
676    loop_end = p_g + m_max;
677
678    __asm__ volatile(
679        ".set    push                                \n\t"
680        ".set    noreorder                           \n\t"
681    "1:                                              \n\t"
682        "lwc1    %[temp0],   0(%[p_g])               \n\t"
683        "lwc1    %[temp1],   0(%[p_x])               \n\t"
684        "lwc1    %[temp2],   4(%[p_x])               \n\t"
685        "mul.s   %[temp1],   %[temp1],     %[temp0]  \n\t"
686        "mul.s   %[temp2],   %[temp2],     %[temp0]  \n\t"
687        PTR_ADDIU "%[p_g],   %[p_g],       4         \n\t"
688        PTR_ADDIU "%[p_x],   %[p_x],       320       \n\t"
689        "swc1    %[temp1],   0(%[p_y])               \n\t"
690        "swc1    %[temp2],   4(%[p_y])               \n\t"
691        "bne     %[p_g],     %[loop_end],  1b        \n\t"
692        PTR_ADDIU "%[p_y],   %[p_y],       8         \n\t"
693        ".set    pop                                 \n\t"
694
695        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
696          [temp2]"=&f"(temp2), [p_x]"+r"(p_x),
697          [p_y]"+r"(p_y), [p_g]"+r"(p_g)
698        : [loop_end]"r"(loop_end)
699        : "memory"
700    );
701}
702
703static void sbr_hf_apply_noise_0_mips(float (*Y)[2], const float *s_m,
704                                 const float *q_filt, int noise,
705                                 int kx, int m_max)
706{
707    int m;
708
709    for (m = 0; m < m_max; m++){
710
711        float *Y1=&Y[m][0];
712        float *ff_table;
713        float y0,y1, temp1, temp2, temp4, temp5;
714        int temp0, temp3;
715        const float *s_m1=&s_m[m];
716        const float *q_filt1= &q_filt[m];
717
718        __asm__ volatile(
719            "lwc1    %[y0],       0(%[Y1])                                    \n\t"
720            "lwc1    %[temp1],    0(%[s_m1])                                  \n\t"
721            "addiu   %[noise],    %[noise],              1                    \n\t"
722            "andi    %[noise],    %[noise],              0x1ff                \n\t"
723            "sll     %[temp0],    %[noise], 3                                 \n\t"
724            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]             \n\t"
725            "add.s   %[y0],       %[y0],                 %[temp1]             \n\t"
726            "mfc1    %[temp3],    %[temp1]                                    \n\t"
727            "bne     %[temp3],    $0,                    1f                   \n\t"
728            "lwc1    %[y1],       4(%[Y1])                                    \n\t"
729            "lwc1    %[temp2],    0(%[q_filt1])                               \n\t"
730            "lwc1    %[temp4],    0(%[ff_table])                              \n\t"
731            "lwc1    %[temp5],    4(%[ff_table])                              \n\t"
732            "madd.s  %[y0],       %[y0],                 %[temp2],  %[temp4]  \n\t"
733            "madd.s  %[y1],       %[y1],                 %[temp2],  %[temp5]  \n\t"
734            "swc1    %[y1],       4(%[Y1])                                    \n\t"
735        "1:                                                                   \n\t"
736            "swc1    %[y0],       0(%[Y1])                                    \n\t"
737
738            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
739              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
740              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
741            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
742              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
743            : "memory"
744        );
745    }
746}
747
748static void sbr_hf_apply_noise_1_mips(float (*Y)[2], const float *s_m,
749                                 const float *q_filt, int noise,
750                                 int kx, int m_max)
751{
752    float y0,y1,temp1, temp2, temp4, temp5;
753    int temp0, temp3, m;
754    float phi_sign = 1 - 2 * (kx & 1);
755
756    for (m = 0; m < m_max; m++) {
757
758        float *ff_table;
759        float *Y1=&Y[m][0];
760        const float *s_m1=&s_m[m];
761        const float *q_filt1= &q_filt[m];
762
763        __asm__ volatile(
764            "lwc1   %[y1],       4(%[Y1])                                     \n\t"
765            "lwc1   %[temp1],    0(%[s_m1])                                   \n\t"
766            "lw     %[temp3],    0(%[s_m1])                                   \n\t"
767            "addiu  %[noise],    %[noise],               1                    \n\t"
768            "andi   %[noise],    %[noise],               0x1ff                \n\t"
769            "sll    %[temp0],    %[noise],               3                    \n\t"
770            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]              \n\t"
771            "madd.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
772            "bne    %[temp3],    $0,                    1f                    \n\t"
773            "lwc1   %[y0],       0(%[Y1])                                     \n\t"
774            "lwc1   %[temp2],    0(%[q_filt1])                                \n\t"
775            "lwc1   %[temp4],    0(%[ff_table])                               \n\t"
776            "lwc1   %[temp5],    4(%[ff_table])                               \n\t"
777            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
778            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
779            "swc1   %[y0],       0(%[Y1])                                     \n\t"
780        "1:                                                                   \n\t"
781            "swc1   %[y1],       4(%[Y1])                                     \n\t"
782
783            : [ff_table] "=&r" (ff_table), [y0] "=&f" (y0), [y1] "=&f" (y1),
784              [temp0] "=&r" (temp0), [temp1] "=&f" (temp1), [temp2] "=&f" (temp2),
785              [temp3] "=&r" (temp3), [temp4] "=&f" (temp4), [temp5] "=&f" (temp5)
786            : [ff_sbr_noise_table] "r" (ff_sbr_noise_table), [noise] "r" (noise),
787              [Y1] "r" (Y1), [s_m1] "r" (s_m1), [q_filt1] "r" (q_filt1),
788              [phi_sign] "f" (phi_sign)
789            : "memory"
790        );
791        phi_sign = -phi_sign;
792    }
793}
794
795static void sbr_hf_apply_noise_2_mips(float (*Y)[2], const float *s_m,
796                                 const float *q_filt, int noise,
797                                 int kx, int m_max)
798{
799    int m, temp0, temp1;
800    float *ff_table;
801    float y0, y1, temp2, temp3, temp4, temp5;
802
803    for (m = 0; m < m_max; m++) {
804
805        float *Y1=&Y[m][0];
806        const float *s_m1=&s_m[m];
807        const float *q_filt1= &q_filt[m];
808
809        __asm__ volatile(
810            "lwc1   %[y0],       0(%[Y1])                                  \n\t"
811            "lwc1   %[temp3],    0(%[s_m1])                                \n\t"
812            "addiu  %[noise],    %[noise],              1                  \n\t"
813            "andi   %[noise],    %[noise],              0x1ff              \n\t"
814            "sll    %[temp0],    %[noise],              3                  \n\t"
815            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]           \n\t"
816            "sub.s  %[y0],       %[y0],                 %[temp3]           \n\t"
817            "mfc1   %[temp1],    %[temp3]                                  \n\t"
818            "bne    %[temp1],    $0,                    1f                 \n\t"
819            "lwc1   %[y1],       4(%[Y1])                                  \n\t"
820            "lwc1   %[temp2],    0(%[q_filt1])                             \n\t"
821            "lwc1   %[temp4],    0(%[ff_table])                            \n\t"
822            "lwc1   %[temp5],    4(%[ff_table])                            \n\t"
823            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4] \n\t"
824            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5] \n\t"
825            "swc1   %[y1],       4(%[Y1])                                  \n\t"
826        "1:                                                                \n\t"
827            "swc1   %[y0],       0(%[Y1])                                  \n\t"
828
829            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [y0]"=&f"(y0),
830              [y1]"=&f"(y1), [ff_table]"=&r"(ff_table),
831              [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
832              [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
833            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
834              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
835            : "memory"
836        );
837    }
838}
839
840static void sbr_hf_apply_noise_3_mips(float (*Y)[2], const float *s_m,
841                                 const float *q_filt, int noise,
842                                 int kx, int m_max)
843{
844    float phi_sign = 1 - 2 * (kx & 1);
845    int m;
846
847    for (m = 0; m < m_max; m++) {
848
849        float *Y1=&Y[m][0];
850        float *ff_table;
851        float y0,y1, temp1, temp2, temp4, temp5;
852        int temp0, temp3;
853        const float *s_m1=&s_m[m];
854        const float *q_filt1= &q_filt[m];
855
856        __asm__ volatile(
857            "lwc1    %[y1],       4(%[Y1])                                     \n\t"
858            "lwc1    %[temp1],    0(%[s_m1])                                   \n\t"
859            "addiu   %[noise],    %[noise],              1                     \n\t"
860            "andi    %[noise],    %[noise],              0x1ff                 \n\t"
861            "sll     %[temp0],    %[noise],              3                     \n\t"
862            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]              \n\t"
863            "nmsub.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
864            "mfc1    %[temp3],    %[temp1]                                     \n\t"
865            "bne     %[temp3],    $0,                    1f                    \n\t"
866            "lwc1    %[y0],       0(%[Y1])                                     \n\t"
867            "lwc1    %[temp2],    0(%[q_filt1])                                \n\t"
868            "lwc1    %[temp4],    0(%[ff_table])                               \n\t"
869            "lwc1    %[temp5],    4(%[ff_table])                               \n\t"
870            "madd.s  %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
871            "madd.s  %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
872            "swc1    %[y0],       0(%[Y1])                                     \n\t"
873            "1:                                                                \n\t"
874            "swc1    %[y1],       4(%[Y1])                                     \n\t"
875
876            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
877              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
878              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
879            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
880              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1),
881              [phi_sign]"f"(phi_sign)
882            : "memory"
883        );
884       phi_sign = -phi_sign;
885    }
886}
887#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
888#endif /* HAVE_MIPSFPU */
889#endif /* HAVE_INLINE_ASM */
890
891void ff_sbrdsp_init_mips(SBRDSPContext *s)
892{
893#if HAVE_INLINE_ASM
894#if HAVE_MIPSFPU
895    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_mips;
896    s->qmf_post_shuffle = sbr_qmf_post_shuffle_mips;
897#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
898    s->sum64x5 = sbr_sum64x5_mips;
899    s->sum_square = sbr_sum_square_mips;
900    s->qmf_deint_bfly = sbr_qmf_deint_bfly_mips;
901    s->autocorrelate = sbr_autocorrelate_mips;
902    s->hf_gen = sbr_hf_gen_mips;
903    s->hf_g_filt = sbr_hf_g_filt_mips;
904
905    s->hf_apply_noise[0] = sbr_hf_apply_noise_0_mips;
906    s->hf_apply_noise[1] = sbr_hf_apply_noise_1_mips;
907    s->hf_apply_noise[2] = sbr_hf_apply_noise_2_mips;
908    s->hf_apply_noise[3] = sbr_hf_apply_noise_3_mips;
909#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
910#endif /* HAVE_MIPSFPU */
911#endif /* HAVE_INLINE_ASM */
912}
913