1/*
2 * Copyright (c) 2012
3 *      MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * Author:  Stanislav Ocovaj (socovaj@mips.com)
30 *          Szabolcs Pal     (sabolc@mips.com)
31 *
32 * AAC coefficients encoder optimized for MIPS floating-point architecture
33 *
34 * This file is part of FFmpeg.
35 *
36 * FFmpeg is free software; you can redistribute it and/or
37 * modify it under the terms of the GNU Lesser General Public
38 * License as published by the Free Software Foundation; either
39 * version 2.1 of the License, or (at your option) any later version.
40 *
41 * FFmpeg is distributed in the hope that it will be useful,
42 * but WITHOUT ANY WARRANTY; without even the implied warranty of
43 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
44 * Lesser General Public License for more details.
45 *
46 * You should have received a copy of the GNU Lesser General Public
47 * License along with FFmpeg; if not, write to the Free Software
48 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49 */
50
51/**
52 * @file
53 * Reference: libavcodec/aaccoder.c
54 */
55
56#include "libavutil/libm.h"
57
58#include <float.h>
59#include "libavutil/mathematics.h"
60#include "libavcodec/avcodec.h"
61#include "libavcodec/put_bits.h"
62#include "libavcodec/aac.h"
63#include "libavcodec/aacenc.h"
64#include "libavcodec/aactab.h"
65#include "libavcodec/aacenctab.h"
66#include "libavcodec/aacenc_utils.h"
67
68#if HAVE_INLINE_ASM
69#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * Record describing one step of a band coding path.
 *
 * NOTE(review): the consumers of this type are outside the visible part of
 * the file; the field descriptions below are inferred from the names only
 * and should be confirmed against the code that fills them in.
 */
struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of this path     */
    int   run;      /* presumably the run length associated with it     */
};

typedef struct BandCodingPath BandCodingPath;
75
/*
 * Nine consecutive entries that share the same two leading base-3 digits:
 * n is the number of nonzero leading digits, the two trailing digits add
 * 0, 1 or 2 on top of it.
 */
#define UQUAD_SIGN_ROW(n)            \
    (n),     (n) + 1, (n) + 1,       \
    (n) + 1, (n) + 2, (n) + 2,       \
    (n) + 1, (n) + 2, (n) + 2

/**
 * For a, b, c, d in 0..2, entry [((a * 3 + b) * 3 + c) * 3 + d] holds the
 * number of nonzero values among a, b, c and d.
 *
 * Presumably the number of sign bits that accompany a codeword of an
 * unsigned 4-tuple codebook; the users of the table are outside this view.
 */
static const uint8_t uquad_sign_bits[81] = {
    UQUAD_SIGN_ROW(0), UQUAD_SIGN_ROW(1), UQUAD_SIGN_ROW(1),
    UQUAD_SIGN_ROW(1), UQUAD_SIGN_ROW(2), UQUAD_SIGN_ROW(2),
    UQUAD_SIGN_ROW(1), UQUAD_SIGN_ROW(2), UQUAD_SIGN_ROW(2)
};

#undef UQUAD_SIGN_ROW
87
/* One row of eight entries; n is 0 for the first row and 1 for all others. */
#define UPAIR7_SIGN_ROW(n) \
    (n), (n) + 1, (n) + 1, (n) + 1, (n) + 1, (n) + 1, (n) + 1, (n) + 1

/**
 * For a, b in 0..7, entry [a * 8 + b] holds the number of nonzero values
 * among a and b.
 *
 * Presumably the number of sign bits that accompany a codeword of an
 * unsigned pair codebook with magnitudes up to 7; the users of the table
 * are outside this view.
 */
static const uint8_t upair7_sign_bits[64] = {
    UPAIR7_SIGN_ROW(0),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1),
    UPAIR7_SIGN_ROW(1)
};

#undef UPAIR7_SIGN_ROW
98
/* One row of thirteen entries; n is 0 for the first row and 1 for all others. */
#define UPAIR12_SIGN_ROW(n)                  \
    (n),                                     \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1,      \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1,      \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1

/**
 * For a, b in 0..12, entry [a * 13 + b] holds the number of nonzero values
 * among a and b.
 *
 * Presumably the number of sign bits that accompany a codeword of an
 * unsigned pair codebook with magnitudes up to 12; the users of the table
 * are outside this view.
 */
static const uint8_t upair12_sign_bits[169] = {
    UPAIR12_SIGN_ROW(0),
    UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1),
    UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1),
    UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1),
    UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1), UPAIR12_SIGN_ROW(1)
};

#undef UPAIR12_SIGN_ROW
114
/* One row of seventeen entries; n is 0 for the first row and 1 for all others. */
#define ESC_SIGN_ROW(n)                      \
    (n),                                     \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1,      \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1,      \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1,      \
    (n) + 1, (n) + 1, (n) + 1, (n) + 1

/**
 * For a, b in 0..16, entry [a * 17 + b] holds the number of nonzero values
 * among a and b.
 *
 * Presumably the number of sign bits that accompany a codeword of the
 * escape codebook; the users of the table are outside this view.
 */
static const uint8_t esc_sign_bits[289] = {
    ESC_SIGN_ROW(0),
    ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1),
    ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1),
    ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1),
    ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1), ESC_SIGN_ROW(1)
};

#undef ESC_SIGN_ROW
134
135/**
136 * Functions developed from template function and optimized for quantizing and encoding band
137 */
138static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
139                                                     PutBitContext *pb, const float *in, float *out,
140                                                     const float *scaled, int size, int scale_idx,
141                                                     int cb, const float lambda, const float uplim,
142                                                     int *bits, float *energy, const float ROUNDING)
143{
144    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
145    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
146    int i;
147    int qc1, qc2, qc3, qc4;
148    float qenergy = 0.0f;
149
150    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
151    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
152    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
153
154    abs_pow34_v(s->scoefs, in, size);
155    scaled = s->scoefs;
156    for (i = 0; i < size; i += 4) {
157        int curidx;
158        int *in_int = (int *)&in[i];
159        int t0, t1, t2, t3, t4, t5, t6, t7;
160        const float *vec;
161
162        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
163        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
164        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
165        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
166
167        __asm__ volatile (
168            ".set push                      \n\t"
169            ".set noreorder                 \n\t"
170
171            "slt    %[qc1], $zero,  %[qc1]  \n\t"
172            "slt    %[qc2], $zero,  %[qc2]  \n\t"
173            "slt    %[qc3], $zero,  %[qc3]  \n\t"
174            "slt    %[qc4], $zero,  %[qc4]  \n\t"
175            "lw     %[t0],  0(%[in_int])    \n\t"
176            "lw     %[t1],  4(%[in_int])    \n\t"
177            "lw     %[t2],  8(%[in_int])    \n\t"
178            "lw     %[t3],  12(%[in_int])   \n\t"
179            "srl    %[t0],  %[t0],  31      \n\t"
180            "srl    %[t1],  %[t1],  31      \n\t"
181            "srl    %[t2],  %[t2],  31      \n\t"
182            "srl    %[t3],  %[t3],  31      \n\t"
183            "subu   %[t4],  $zero,  %[qc1]  \n\t"
184            "subu   %[t5],  $zero,  %[qc2]  \n\t"
185            "subu   %[t6],  $zero,  %[qc3]  \n\t"
186            "subu   %[t7],  $zero,  %[qc4]  \n\t"
187            "movn   %[qc1], %[t4],  %[t0]   \n\t"
188            "movn   %[qc2], %[t5],  %[t1]   \n\t"
189            "movn   %[qc3], %[t6],  %[t2]   \n\t"
190            "movn   %[qc4], %[t7],  %[t3]   \n\t"
191
192            ".set pop                       \n\t"
193
194            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
195              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
196              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
197              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
198            : [in_int]"r"(in_int)
199            : "memory"
200        );
201
202        curidx = qc1;
203        curidx *= 3;
204        curidx += qc2;
205        curidx *= 3;
206        curidx += qc3;
207        curidx *= 3;
208        curidx += qc4;
209        curidx += 40;
210
211        put_bits(pb, p_bits[curidx], p_codes[curidx]);
212
213        if (out || energy) {
214            float e1,e2,e3,e4;
215            vec = &p_vec[curidx*4];
216            e1 = vec[0] * IQ;
217            e2 = vec[1] * IQ;
218            e3 = vec[2] * IQ;
219            e4 = vec[3] * IQ;
220            if (out) {
221                out[i+0] = e1;
222                out[i+1] = e2;
223                out[i+2] = e3;
224                out[i+3] = e4;
225            }
226            if (energy)
227                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
228        }
229    }
230    if (energy)
231        *energy = qenergy;
232}
233
234static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
235                                                     PutBitContext *pb, const float *in, float *out,
236                                                     const float *scaled, int size, int scale_idx,
237                                                     int cb, const float lambda, const float uplim,
238                                                     int *bits, float *energy, const float ROUNDING)
239{
240    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
241    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
242    int i;
243    int qc1, qc2, qc3, qc4;
244    float qenergy = 0.0f;
245
246    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
247    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
248    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
249
250    abs_pow34_v(s->scoefs, in, size);
251    scaled = s->scoefs;
252    for (i = 0; i < size; i += 4) {
253        int curidx, sign, count;
254        int *in_int = (int *)&in[i];
255        uint8_t v_bits;
256        unsigned int v_codes;
257        int t0, t1, t2, t3, t4;
258        const float *vec;
259
260        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
261        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
262        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
263        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
264
265        __asm__ volatile (
266            ".set push                              \n\t"
267            ".set noreorder                         \n\t"
268
269            "ori    %[t4],      $zero,      2       \n\t"
270            "ori    %[sign],    $zero,      0       \n\t"
271            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
272            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
273            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
274            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
275            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
276            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
277            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
278            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
279            "lw     %[t0],      0(%[in_int])        \n\t"
280            "lw     %[t1],      4(%[in_int])        \n\t"
281            "lw     %[t2],      8(%[in_int])        \n\t"
282            "lw     %[t3],      12(%[in_int])       \n\t"
283            "slt    %[t0],      %[t0],      $zero   \n\t"
284            "movn   %[sign],    %[t0],      %[qc1]  \n\t"
285            "slt    %[t1],      %[t1],      $zero   \n\t"
286            "slt    %[t2],      %[t2],      $zero   \n\t"
287            "slt    %[t3],      %[t3],      $zero   \n\t"
288            "sll    %[t0],      %[sign],    1       \n\t"
289            "or     %[t0],      %[t0],      %[t1]   \n\t"
290            "movn   %[sign],    %[t0],      %[qc2]  \n\t"
291            "slt    %[t4],      $zero,      %[qc1]  \n\t"
292            "slt    %[t1],      $zero,      %[qc2]  \n\t"
293            "slt    %[count],   $zero,      %[qc3]  \n\t"
294            "sll    %[t0],      %[sign],    1       \n\t"
295            "or     %[t0],      %[t0],      %[t2]   \n\t"
296            "movn   %[sign],    %[t0],      %[qc3]  \n\t"
297            "slt    %[t2],      $zero,      %[qc4]  \n\t"
298            "addu   %[count],   %[count],   %[t4]   \n\t"
299            "addu   %[count],   %[count],   %[t1]   \n\t"
300            "sll    %[t0],      %[sign],    1       \n\t"
301            "or     %[t0],      %[t0],      %[t3]   \n\t"
302            "movn   %[sign],    %[t0],      %[qc4]  \n\t"
303            "addu   %[count],   %[count],   %[t2]   \n\t"
304
305            ".set pop                               \n\t"
306
307            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
308              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
309              [sign]"=&r"(sign), [count]"=&r"(count),
310              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
311              [t4]"=&r"(t4)
312            : [in_int]"r"(in_int)
313            : "memory"
314        );
315
316        curidx = qc1;
317        curidx *= 3;
318        curidx += qc2;
319        curidx *= 3;
320        curidx += qc3;
321        curidx *= 3;
322        curidx += qc4;
323
324        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
325        v_bits  = p_bits[curidx] + count;
326        put_bits(pb, v_bits, v_codes);
327
328        if (out || energy) {
329            float e1,e2,e3,e4;
330            vec = &p_vec[curidx*4];
331            e1 = copysignf(vec[0] * IQ, in[i+0]);
332            e2 = copysignf(vec[1] * IQ, in[i+1]);
333            e3 = copysignf(vec[2] * IQ, in[i+2]);
334            e4 = copysignf(vec[3] * IQ, in[i+3]);
335            if (out) {
336                out[i+0] = e1;
337                out[i+1] = e2;
338                out[i+2] = e3;
339                out[i+3] = e4;
340            }
341            if (energy)
342                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
343        }
344    }
345    if (energy)
346        *energy = qenergy;
347}
348
349static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
350                                                     PutBitContext *pb, const float *in, float *out,
351                                                     const float *scaled, int size, int scale_idx,
352                                                     int cb, const float lambda, const float uplim,
353                                                     int *bits, float *energy, const float ROUNDING)
354{
355    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
356    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
357    int i;
358    int qc1, qc2, qc3, qc4;
359    float qenergy = 0.0f;
360
361    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
362    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
363    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
364
365    abs_pow34_v(s->scoefs, in, size);
366    scaled = s->scoefs;
367    for (i = 0; i < size; i += 4) {
368        int curidx, curidx2;
369        int *in_int = (int *)&in[i];
370        uint8_t v_bits;
371        unsigned int v_codes;
372        int t0, t1, t2, t3, t4, t5, t6, t7;
373        const float *vec1, *vec2;
374
375        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
376        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
377        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
378        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
379
380        __asm__ volatile (
381            ".set push                      \n\t"
382            ".set noreorder                 \n\t"
383
384            "ori    %[t4],  $zero,  4       \n\t"
385            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
386            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
387            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
388            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
389            "movn   %[qc1], %[t4],  %[t0]   \n\t"
390            "movn   %[qc2], %[t4],  %[t1]   \n\t"
391            "movn   %[qc3], %[t4],  %[t2]   \n\t"
392            "movn   %[qc4], %[t4],  %[t3]   \n\t"
393            "lw     %[t0],  0(%[in_int])    \n\t"
394            "lw     %[t1],  4(%[in_int])    \n\t"
395            "lw     %[t2],  8(%[in_int])    \n\t"
396            "lw     %[t3],  12(%[in_int])   \n\t"
397            "srl    %[t0],  %[t0],  31      \n\t"
398            "srl    %[t1],  %[t1],  31      \n\t"
399            "srl    %[t2],  %[t2],  31      \n\t"
400            "srl    %[t3],  %[t3],  31      \n\t"
401            "subu   %[t4],  $zero,  %[qc1]  \n\t"
402            "subu   %[t5],  $zero,  %[qc2]  \n\t"
403            "subu   %[t6],  $zero,  %[qc3]  \n\t"
404            "subu   %[t7],  $zero,  %[qc4]  \n\t"
405            "movn   %[qc1], %[t4],  %[t0]   \n\t"
406            "movn   %[qc2], %[t5],  %[t1]   \n\t"
407            "movn   %[qc3], %[t6],  %[t2]   \n\t"
408            "movn   %[qc4], %[t7],  %[t3]   \n\t"
409
410            ".set pop                       \n\t"
411
412            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
413              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
414              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
415              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
416            : [in_int]"r"(in_int)
417            : "memory"
418        );
419
420        curidx = 9 * qc1;
421        curidx += qc2 + 40;
422
423        curidx2 = 9 * qc3;
424        curidx2 += qc4 + 40;
425
426        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
427        v_bits  = p_bits[curidx] + p_bits[curidx2];
428        put_bits(pb, v_bits, v_codes);
429
430        if (out || energy) {
431            float e1,e2,e3,e4;
432            vec1 = &p_vec[curidx*2 ];
433            vec2 = &p_vec[curidx2*2];
434            e1 = vec1[0] * IQ;
435            e2 = vec1[1] * IQ;
436            e3 = vec2[0] * IQ;
437            e4 = vec2[1] * IQ;
438            if (out) {
439                out[i+0] = e1;
440                out[i+1] = e2;
441                out[i+2] = e3;
442                out[i+3] = e4;
443            }
444            if (energy)
445                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
446        }
447    }
448    if (energy)
449        *energy = qenergy;
450}
451
452static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
453                                                      PutBitContext *pb, const float *in, float *out,
454                                                      const float *scaled, int size, int scale_idx,
455                                                      int cb, const float lambda, const float uplim,
456                                                      int *bits, float *energy, const float ROUNDING)
457{
458    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
459    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
460    int i;
461    int qc1, qc2, qc3, qc4;
462    float qenergy = 0.0f;
463
464    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
465    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
466    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
467
468    abs_pow34_v(s->scoefs, in, size);
469    scaled = s->scoefs;
470    for (i = 0; i < size; i += 4) {
471        int curidx1, curidx2, sign1, count1, sign2, count2;
472        int *in_int = (int *)&in[i];
473        uint8_t v_bits;
474        unsigned int v_codes;
475        int t0, t1, t2, t3, t4;
476        const float *vec1, *vec2;
477
478        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
479        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
480        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
481        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
482
483        __asm__ volatile (
484            ".set push                              \n\t"
485            ".set noreorder                         \n\t"
486
487            "ori    %[t4],      $zero,      7       \n\t"
488            "ori    %[sign1],   $zero,      0       \n\t"
489            "ori    %[sign2],   $zero,      0       \n\t"
490            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
491            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
492            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
493            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
494            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
495            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
496            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
497            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
498            "lw     %[t0],      0(%[in_int])        \n\t"
499            "lw     %[t1],      4(%[in_int])        \n\t"
500            "lw     %[t2],      8(%[in_int])        \n\t"
501            "lw     %[t3],      12(%[in_int])       \n\t"
502            "slt    %[t0],      %[t0],      $zero   \n\t"
503            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
504            "slt    %[t2],      %[t2],      $zero   \n\t"
505            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
506            "slt    %[t1],      %[t1],      $zero   \n\t"
507            "sll    %[t0],      %[sign1],   1       \n\t"
508            "or     %[t0],      %[t0],      %[t1]   \n\t"
509            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
510            "slt    %[t3],      %[t3],      $zero   \n\t"
511            "sll    %[t0],      %[sign2],   1       \n\t"
512            "or     %[t0],      %[t0],      %[t3]   \n\t"
513            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
514            "slt    %[count1],  $zero,      %[qc1]  \n\t"
515            "slt    %[t1],      $zero,      %[qc2]  \n\t"
516            "slt    %[count2],  $zero,      %[qc3]  \n\t"
517            "slt    %[t2],      $zero,      %[qc4]  \n\t"
518            "addu   %[count1],  %[count1],  %[t1]   \n\t"
519            "addu   %[count2],  %[count2],  %[t2]   \n\t"
520
521            ".set pop                               \n\t"
522
523            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
524              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
525              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
526              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
527              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
528              [t4]"=&r"(t4)
529            : [in_int]"r"(in_int)
530            : "t0", "t1", "t2", "t3", "t4",
531              "memory"
532        );
533
534        curidx1  = 8 * qc1;
535        curidx1 += qc2;
536
537        v_codes = (p_codes[curidx1] << count1) | sign1;
538        v_bits  = p_bits[curidx1] + count1;
539        put_bits(pb, v_bits, v_codes);
540
541        curidx2  = 8 * qc3;
542        curidx2 += qc4;
543
544        v_codes = (p_codes[curidx2] << count2) | sign2;
545        v_bits  = p_bits[curidx2] + count2;
546        put_bits(pb, v_bits, v_codes);
547
548        if (out || energy) {
549            float e1,e2,e3,e4;
550            vec1 = &p_vec[curidx1*2];
551            vec2 = &p_vec[curidx2*2];
552            e1 = copysignf(vec1[0] * IQ, in[i+0]);
553            e2 = copysignf(vec1[1] * IQ, in[i+1]);
554            e3 = copysignf(vec2[0] * IQ, in[i+2]);
555            e4 = copysignf(vec2[1] * IQ, in[i+3]);
556            if (out) {
557                out[i+0] = e1;
558                out[i+1] = e2;
559                out[i+2] = e3;
560                out[i+3] = e4;
561            }
562            if (energy)
563                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
564        }
565    }
566    if (energy)
567        *energy = qenergy;
568}
569
570static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
571                                                       PutBitContext *pb, const float *in, float *out,
572                                                       const float *scaled, int size, int scale_idx,
573                                                       int cb, const float lambda, const float uplim,
574                                                       int *bits, float *energy, const float ROUNDING)
575{
576    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
577    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
578    int i;
579    int qc1, qc2, qc3, qc4;
580    float qenergy = 0.0f;
581
582    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
583    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
584    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
585
586    abs_pow34_v(s->scoefs, in, size);
587    scaled = s->scoefs;
588    for (i = 0; i < size; i += 4) {
589        int curidx1, curidx2, sign1, count1, sign2, count2;
590        int *in_int = (int *)&in[i];
591        uint8_t v_bits;
592        unsigned int v_codes;
593        int t0, t1, t2, t3, t4;
594        const float *vec1, *vec2;
595
596        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
597        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
598        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
599        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
600
601        __asm__ volatile (
602            ".set push                              \n\t"
603            ".set noreorder                         \n\t"
604
605            "ori    %[t4],      $zero,      12      \n\t"
606            "ori    %[sign1],   $zero,      0       \n\t"
607            "ori    %[sign2],   $zero,      0       \n\t"
608            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
609            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
610            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
611            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
612            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
613            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
614            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
615            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
616            "lw     %[t0],      0(%[in_int])        \n\t"
617            "lw     %[t1],      4(%[in_int])        \n\t"
618            "lw     %[t2],      8(%[in_int])        \n\t"
619            "lw     %[t3],      12(%[in_int])       \n\t"
620            "slt    %[t0],      %[t0],      $zero   \n\t"
621            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
622            "slt    %[t2],      %[t2],      $zero   \n\t"
623            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
624            "slt    %[t1],      %[t1],      $zero   \n\t"
625            "sll    %[t0],      %[sign1],   1       \n\t"
626            "or     %[t0],      %[t0],      %[t1]   \n\t"
627            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
628            "slt    %[t3],      %[t3],      $zero   \n\t"
629            "sll    %[t0],      %[sign2],   1       \n\t"
630            "or     %[t0],      %[t0],      %[t3]   \n\t"
631            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
632            "slt    %[count1],  $zero,      %[qc1]  \n\t"
633            "slt    %[t1],      $zero,      %[qc2]  \n\t"
634            "slt    %[count2],  $zero,      %[qc3]  \n\t"
635            "slt    %[t2],      $zero,      %[qc4]  \n\t"
636            "addu   %[count1],  %[count1],  %[t1]   \n\t"
637            "addu   %[count2],  %[count2],  %[t2]   \n\t"
638
639            ".set pop                               \n\t"
640
641            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
642              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
643              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
644              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
645              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
646              [t4]"=&r"(t4)
647            : [in_int]"r"(in_int)
648            : "memory"
649        );
650
651        curidx1  = 13 * qc1;
652        curidx1 += qc2;
653
654        v_codes = (p_codes[curidx1] << count1) | sign1;
655        v_bits  = p_bits[curidx1] + count1;
656        put_bits(pb, v_bits, v_codes);
657
658        curidx2  = 13 * qc3;
659        curidx2 += qc4;
660
661        v_codes = (p_codes[curidx2] << count2) | sign2;
662        v_bits  = p_bits[curidx2] + count2;
663        put_bits(pb, v_bits, v_codes);
664
665        if (out || energy) {
666            float e1,e2,e3,e4;
667            vec1 = &p_vec[curidx1*2];
668            vec2 = &p_vec[curidx2*2];
669            e1 = copysignf(vec1[0] * IQ, in[i+0]);
670            e2 = copysignf(vec1[1] * IQ, in[i+1]);
671            e3 = copysignf(vec2[0] * IQ, in[i+2]);
672            e4 = copysignf(vec2[1] * IQ, in[i+3]);
673            if (out) {
674                out[i+0] = e1;
675                out[i+1] = e2;
676                out[i+2] = e3;
677                out[i+3] = e4;
678            }
679            if (energy)
680                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
681        }
682    }
683    if (energy)
684        *energy = qenergy;
685}
686
687static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
688                                                   PutBitContext *pb, const float *in, float *out,
689                                                   const float *scaled, int size, int scale_idx,
690                                                   int cb, const float lambda, const float uplim,
691                                                   int *bits, float *energy, const float ROUNDING)
692{
693    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
694    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
695    int i;
696    int qc1, qc2, qc3, qc4;
697    float qenergy = 0.0f;
698
699    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
700    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
701    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
702
703    abs_pow34_v(s->scoefs, in, size);
704    scaled = s->scoefs;
705
706    if (cb < 11) {
707        for (i = 0; i < size; i += 4) {
708            int curidx, curidx2, sign1, count1, sign2, count2;
709            int *in_int = (int *)&in[i];
710            uint8_t v_bits;
711            unsigned int v_codes;
712            int t0, t1, t2, t3, t4;
713            const float *vec1, *vec2;
714
715            qc1 = scaled[i  ] * Q34 + ROUNDING;
716            qc2 = scaled[i+1] * Q34 + ROUNDING;
717            qc3 = scaled[i+2] * Q34 + ROUNDING;
718            qc4 = scaled[i+3] * Q34 + ROUNDING;
719
720            __asm__ volatile (
721                ".set push                                  \n\t"
722                ".set noreorder                             \n\t"
723
724                "ori        %[t4],      $zero,      16      \n\t"
725                "ori        %[sign1],   $zero,      0       \n\t"
726                "ori        %[sign2],   $zero,      0       \n\t"
727                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
728                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
729                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
730                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
731                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
732                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
733                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
734                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
735                "lw         %[t0],      0(%[in_int])        \n\t"
736                "lw         %[t1],      4(%[in_int])        \n\t"
737                "lw         %[t2],      8(%[in_int])        \n\t"
738                "lw         %[t3],      12(%[in_int])       \n\t"
739                "slt        %[t0],      %[t0],      $zero   \n\t"
740                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
741                "slt        %[t2],      %[t2],      $zero   \n\t"
742                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
743                "slt        %[t1],      %[t1],      $zero   \n\t"
744                "sll        %[t0],      %[sign1],   1       \n\t"
745                "or         %[t0],      %[t0],      %[t1]   \n\t"
746                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
747                "slt        %[t3],      %[t3],      $zero   \n\t"
748                "sll        %[t0],      %[sign2],   1       \n\t"
749                "or         %[t0],      %[t0],      %[t3]   \n\t"
750                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
751                "slt        %[count1],  $zero,      %[qc1]  \n\t"
752                "slt        %[t1],      $zero,      %[qc2]  \n\t"
753                "slt        %[count2],  $zero,      %[qc3]  \n\t"
754                "slt        %[t2],      $zero,      %[qc4]  \n\t"
755                "addu       %[count1],  %[count1],  %[t1]   \n\t"
756                "addu       %[count2],  %[count2],  %[t2]   \n\t"
757
758                ".set pop                                   \n\t"
759
760                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
761                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
762                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
763                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
764                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
765                  [t4]"=&r"(t4)
766                : [in_int]"r"(in_int)
767                : "memory"
768            );
769
770            curidx = 17 * qc1;
771            curidx += qc2;
772            curidx2 = 17 * qc3;
773            curidx2 += qc4;
774
775            v_codes = (p_codes[curidx] << count1) | sign1;
776            v_bits  = p_bits[curidx] + count1;
777            put_bits(pb, v_bits, v_codes);
778
779            v_codes = (p_codes[curidx2] << count2) | sign2;
780            v_bits  = p_bits[curidx2] + count2;
781            put_bits(pb, v_bits, v_codes);
782
783            if (out || energy) {
784                float e1,e2,e3,e4;
785                vec1 = &p_vectors[curidx*2 ];
786                vec2 = &p_vectors[curidx2*2];
787                e1 = copysignf(vec1[0] * IQ, in[i+0]);
788                e2 = copysignf(vec1[1] * IQ, in[i+1]);
789                e3 = copysignf(vec2[0] * IQ, in[i+2]);
790                e4 = copysignf(vec2[1] * IQ, in[i+3]);
791                if (out) {
792                    out[i+0] = e1;
793                    out[i+1] = e2;
794                    out[i+2] = e3;
795                    out[i+3] = e4;
796                }
797                if (energy)
798                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
799            }
800        }
801    } else {
802        for (i = 0; i < size; i += 4) {
803            int curidx, curidx2, sign1, count1, sign2, count2;
804            int *in_int = (int *)&in[i];
805            uint8_t v_bits;
806            unsigned int v_codes;
807            int c1, c2, c3, c4;
808            int t0, t1, t2, t3, t4;
809
810            qc1 = scaled[i  ] * Q34 + ROUNDING;
811            qc2 = scaled[i+1] * Q34 + ROUNDING;
812            qc3 = scaled[i+2] * Q34 + ROUNDING;
813            qc4 = scaled[i+3] * Q34 + ROUNDING;
814
815            __asm__ volatile (
816                ".set push                                  \n\t"
817                ".set noreorder                             \n\t"
818
819                "ori        %[t4],      $zero,      16      \n\t"
820                "ori        %[sign1],   $zero,      0       \n\t"
821                "ori        %[sign2],   $zero,      0       \n\t"
822                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
823                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
824                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
825                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
826                "srl        %[c1],      %[c1],      18      \n\t"
827                "srl        %[c2],      %[c2],      18      \n\t"
828                "srl        %[c3],      %[c3],      18      \n\t"
829                "srl        %[c4],      %[c4],      18      \n\t"
830                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
831                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
832                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
833                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
834                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
835                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
836                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
837                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
838                "lw         %[t0],      0(%[in_int])        \n\t"
839                "lw         %[t1],      4(%[in_int])        \n\t"
840                "lw         %[t2],      8(%[in_int])        \n\t"
841                "lw         %[t3],      12(%[in_int])       \n\t"
842                "slt        %[t0],      %[t0],      $zero   \n\t"
843                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
844                "slt        %[t2],      %[t2],      $zero   \n\t"
845                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
846                "slt        %[t1],      %[t1],      $zero   \n\t"
847                "sll        %[t0],      %[sign1],   1       \n\t"
848                "or         %[t0],      %[t0],      %[t1]   \n\t"
849                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
850                "slt        %[t3],      %[t3],      $zero   \n\t"
851                "sll        %[t0],      %[sign2],   1       \n\t"
852                "or         %[t0],      %[t0],      %[t3]   \n\t"
853                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
854                "slt        %[count1],  $zero,      %[qc1]  \n\t"
855                "slt        %[t1],      $zero,      %[qc2]  \n\t"
856                "slt        %[count2],  $zero,      %[qc3]  \n\t"
857                "slt        %[t2],      $zero,      %[qc4]  \n\t"
858                "addu       %[count1],  %[count1],  %[t1]   \n\t"
859                "addu       %[count2],  %[count2],  %[t2]   \n\t"
860
861                ".set pop                                   \n\t"
862
863                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
864                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
865                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
866                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
867                  [c1]"=&r"(c1), [c2]"=&r"(c2),
868                  [c3]"=&r"(c3), [c4]"=&r"(c4),
869                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
870                  [t4]"=&r"(t4)
871                : [in_int]"r"(in_int)
872                : "memory"
873            );
874
875            curidx = 17 * qc1;
876            curidx += qc2;
877
878            curidx2 = 17 * qc3;
879            curidx2 += qc4;
880
881            v_codes = (p_codes[curidx] << count1) | sign1;
882            v_bits  = p_bits[curidx] + count1;
883            put_bits(pb, v_bits, v_codes);
884
885            if (p_vectors[curidx*2  ] == 64.0f) {
886                int len = av_log2(c1);
887                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
888                put_bits(pb, len * 2 - 3, v_codes);
889            }
890            if (p_vectors[curidx*2+1] == 64.0f) {
891                int len = av_log2(c2);
892                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
893                put_bits(pb, len*2-3, v_codes);
894            }
895
896            v_codes = (p_codes[curidx2] << count2) | sign2;
897            v_bits  = p_bits[curidx2] + count2;
898            put_bits(pb, v_bits, v_codes);
899
900            if (p_vectors[curidx2*2  ] == 64.0f) {
901                int len = av_log2(c3);
902                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
903                put_bits(pb, len* 2 - 3, v_codes);
904            }
905            if (p_vectors[curidx2*2+1] == 64.0f) {
906                int len = av_log2(c4);
907                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
908                put_bits(pb, len * 2 - 3, v_codes);
909            }
910
911            if (out || energy) {
912                float e1, e2, e3, e4;
913                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
914                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
915                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
916                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
917                if (out) {
918                    out[i+0] = e1;
919                    out[i+1] = e2;
920                    out[i+2] = e3;
921                    out[i+3] = e4;
922                }
923                if (energy)
924                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
925            }
926        }
927    }
928    if (energy)
929        *energy = qenergy;
930}
931
932static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
933                                                         PutBitContext *pb, const float *in, float *out,
934                                                         const float *scaled, int size, int scale_idx,
935                                                         int cb, const float lambda, const float uplim,
936                                                         int *bits, float *energy, const float ROUNDING) {
937    av_assert0(0);
938}
939
940static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
941                                                         PutBitContext *pb, const float *in, float *out,
942                                                         const float *scaled, int size, int scale_idx,
943                                                         int cb, const float lambda, const float uplim,
944                                                         int *bits, float *energy, const float ROUNDING) {
945    int i;
946    if (bits)
947        *bits = 0;
948    if (out) {
949        for (i = 0; i < size; i += 4) {
950           out[i  ] = 0.0f;
951           out[i+1] = 0.0f;
952           out[i+2] = 0.0f;
953           out[i+3] = 0.0f;
954        }
955    }
956    if (energy)
957        *energy = 0.0f;
958}
959
960static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
961                                                         PutBitContext *pb, const float *in, float *out,
962                                                         const float *scaled, int size, int scale_idx,
963                                                         int cb, const float lambda, const float uplim,
964                                                         int *bits, float *energy, const float ROUNDING) = {
965    quantize_and_encode_band_cost_ZERO_mips,
966    quantize_and_encode_band_cost_SQUAD_mips,
967    quantize_and_encode_band_cost_SQUAD_mips,
968    quantize_and_encode_band_cost_UQUAD_mips,
969    quantize_and_encode_band_cost_UQUAD_mips,
970    quantize_and_encode_band_cost_SPAIR_mips,
971    quantize_and_encode_band_cost_SPAIR_mips,
972    quantize_and_encode_band_cost_UPAIR7_mips,
973    quantize_and_encode_band_cost_UPAIR7_mips,
974    quantize_and_encode_band_cost_UPAIR12_mips,
975    quantize_and_encode_band_cost_UPAIR12_mips,
976    quantize_and_encode_band_cost_ESC_mips,
977    quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
978    quantize_and_encode_band_cost_ZERO_mips,
979    quantize_and_encode_band_cost_ZERO_mips,
980    quantize_and_encode_band_cost_ZERO_mips,
981};
982
/*
 * Call the quantize-and-encode routine stored at index cb of
 * quantize_and_encode_band_cost_arr, forwarding every argument unchanged.
 * cb is expanded twice (table index and call argument), so the expression
 * passed for it must not have side effects.
 */
#define quantize_and_encode_band_cost(s, pb, in, out, scaled, size,          \
                                      scale_idx, cb, lambda, uplim,          \
                                      bits, energy, ROUNDING)                \
    quantize_and_encode_band_cost_arr[cb](s, pb, in, out, scaled, size,      \
                                          scale_idx, cb, lambda, uplim,      \
                                          bits, energy, ROUNDING)
989
990static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
991                                          const float *in, float *out, int size, int scale_idx,
992                                          int cb, const float lambda, int rtz)
993{
994    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
995                                  INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
996}
997
998/**
999 * Functions developed from template function and optimized for getting the number of bits
1000 */
1001static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
1002                                        PutBitContext *pb, const float *in,
1003                                        const float *scaled, int size, int scale_idx,
1004                                        int cb, const float lambda, const float uplim,
1005                                        int *bits)
1006{
1007    return 0;
1008}
1009
1010static float get_band_numbits_NONE_mips(struct AACEncContext *s,
1011                                        PutBitContext *pb, const float *in,
1012                                        const float *scaled, int size, int scale_idx,
1013                                        int cb, const float lambda, const float uplim,
1014                                        int *bits)
1015{
1016    av_assert0(0);
1017    return 0;
1018}
1019
1020static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
1021                                         PutBitContext *pb, const float *in,
1022                                         const float *scaled, int size, int scale_idx,
1023                                         int cb, const float lambda, const float uplim,
1024                                         int *bits)
1025{
1026    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1027    int i;
1028    int qc1, qc2, qc3, qc4;
1029    int curbits = 0;
1030
1031    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1032
1033    for (i = 0; i < size; i += 4) {
1034        int curidx;
1035        int *in_int = (int *)&in[i];
1036        int t0, t1, t2, t3, t4, t5, t6, t7;
1037
1038        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1039        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1040        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1041        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1042
1043        __asm__ volatile (
1044            ".set push                      \n\t"
1045            ".set noreorder                 \n\t"
1046
1047            "slt    %[qc1], $zero,  %[qc1]  \n\t"
1048            "slt    %[qc2], $zero,  %[qc2]  \n\t"
1049            "slt    %[qc3], $zero,  %[qc3]  \n\t"
1050            "slt    %[qc4], $zero,  %[qc4]  \n\t"
1051            "lw     %[t0],  0(%[in_int])    \n\t"
1052            "lw     %[t1],  4(%[in_int])    \n\t"
1053            "lw     %[t2],  8(%[in_int])    \n\t"
1054            "lw     %[t3],  12(%[in_int])   \n\t"
1055            "srl    %[t0],  %[t0],  31      \n\t"
1056            "srl    %[t1],  %[t1],  31      \n\t"
1057            "srl    %[t2],  %[t2],  31      \n\t"
1058            "srl    %[t3],  %[t3],  31      \n\t"
1059            "subu   %[t4],  $zero,  %[qc1]  \n\t"
1060            "subu   %[t5],  $zero,  %[qc2]  \n\t"
1061            "subu   %[t6],  $zero,  %[qc3]  \n\t"
1062            "subu   %[t7],  $zero,  %[qc4]  \n\t"
1063            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1064            "movn   %[qc2], %[t5],  %[t1]   \n\t"
1065            "movn   %[qc3], %[t6],  %[t2]   \n\t"
1066            "movn   %[qc4], %[t7],  %[t3]   \n\t"
1067
1068            ".set pop                       \n\t"
1069
1070            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1071              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1072              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1073              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1074            : [in_int]"r"(in_int)
1075            : "memory"
1076        );
1077
1078        curidx = qc1;
1079        curidx *= 3;
1080        curidx += qc2;
1081        curidx *= 3;
1082        curidx += qc3;
1083        curidx *= 3;
1084        curidx += qc4;
1085        curidx += 40;
1086
1087        curbits += p_bits[curidx];
1088    }
1089    return curbits;
1090}
1091
1092static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
1093                                         PutBitContext *pb, const float *in,
1094                                         const float *scaled, int size, int scale_idx,
1095                                         int cb, const float lambda, const float uplim,
1096                                         int *bits)
1097{
1098    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1099    int i;
1100    int curbits = 0;
1101    int qc1, qc2, qc3, qc4;
1102
1103    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1104
1105    for (i = 0; i < size; i += 4) {
1106        int curidx;
1107        int t0, t1, t2, t3, t4;
1108
1109        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1110        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1111        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1112        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1113
1114        __asm__ volatile (
1115            ".set push                      \n\t"
1116            ".set noreorder                 \n\t"
1117
1118            "ori    %[t4],  $zero,  2       \n\t"
1119            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1120            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1121            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1122            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1123            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1124            "movn   %[qc2], %[t4],  %[t1]   \n\t"
1125            "movn   %[qc3], %[t4],  %[t2]   \n\t"
1126            "movn   %[qc4], %[t4],  %[t3]   \n\t"
1127
1128            ".set pop                       \n\t"
1129
1130            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1131              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1132              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1133              [t4]"=&r"(t4)
1134        );
1135
1136        curidx = qc1;
1137        curidx *= 3;
1138        curidx += qc2;
1139        curidx *= 3;
1140        curidx += qc3;
1141        curidx *= 3;
1142        curidx += qc4;
1143
1144        curbits += p_bits[curidx];
1145        curbits += uquad_sign_bits[curidx];
1146    }
1147    return curbits;
1148}
1149
1150static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1151                                         PutBitContext *pb, const float *in,
1152                                         const float *scaled, int size, int scale_idx,
1153                                         int cb, const float lambda, const float uplim,
1154                                         int *bits)
1155{
1156    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1157    int i;
1158    int qc1, qc2, qc3, qc4;
1159    int curbits = 0;
1160
1161    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1162
1163    for (i = 0; i < size; i += 4) {
1164        int curidx, curidx2;
1165        int *in_int = (int *)&in[i];
1166        int t0, t1, t2, t3, t4, t5, t6, t7;
1167
1168        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1169        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1170        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1171        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1172
1173        __asm__ volatile (
1174            ".set push                      \n\t"
1175            ".set noreorder                 \n\t"
1176
1177            "ori    %[t4],  $zero,  4       \n\t"
1178            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1179            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1180            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1181            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1182            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1183            "movn   %[qc2], %[t4],  %[t1]   \n\t"
1184            "movn   %[qc3], %[t4],  %[t2]   \n\t"
1185            "movn   %[qc4], %[t4],  %[t3]   \n\t"
1186            "lw     %[t0],  0(%[in_int])    \n\t"
1187            "lw     %[t1],  4(%[in_int])    \n\t"
1188            "lw     %[t2],  8(%[in_int])    \n\t"
1189            "lw     %[t3],  12(%[in_int])   \n\t"
1190            "srl    %[t0],  %[t0],  31      \n\t"
1191            "srl    %[t1],  %[t1],  31      \n\t"
1192            "srl    %[t2],  %[t2],  31      \n\t"
1193            "srl    %[t3],  %[t3],  31      \n\t"
1194            "subu   %[t4],  $zero,  %[qc1]  \n\t"
1195            "subu   %[t5],  $zero,  %[qc2]  \n\t"
1196            "subu   %[t6],  $zero,  %[qc3]  \n\t"
1197            "subu   %[t7],  $zero,  %[qc4]  \n\t"
1198            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1199            "movn   %[qc2], %[t5],  %[t1]   \n\t"
1200            "movn   %[qc3], %[t6],  %[t2]   \n\t"
1201            "movn   %[qc4], %[t7],  %[t3]   \n\t"
1202
1203            ".set pop                       \n\t"
1204
1205            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1206              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1207              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1208              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1209            : [in_int]"r"(in_int)
1210            : "memory"
1211        );
1212
1213        curidx  = 9 * qc1;
1214        curidx += qc2 + 40;
1215
1216        curidx2  = 9 * qc3;
1217        curidx2 += qc4 + 40;
1218
1219        curbits += p_bits[curidx] + p_bits[curidx2];
1220    }
1221    return curbits;
1222}
1223
1224static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1225                                          PutBitContext *pb, const float *in,
1226                                          const float *scaled, int size, int scale_idx,
1227                                          int cb, const float lambda, const float uplim,
1228                                          int *bits)
1229{
1230    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1231    int i;
1232    int qc1, qc2, qc3, qc4;
1233    int curbits = 0;
1234
1235    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1236
1237    for (i = 0; i < size; i += 4) {
1238        int curidx, curidx2;
1239        int t0, t1, t2, t3, t4;
1240
1241        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1242        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1243        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1244        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1245
1246        __asm__ volatile (
1247            ".set push                      \n\t"
1248            ".set noreorder                 \n\t"
1249
1250            "ori    %[t4],  $zero,  7       \n\t"
1251            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1252            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1253            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1254            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1255            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1256            "movn   %[qc2], %[t4],  %[t1]   \n\t"
1257            "movn   %[qc3], %[t4],  %[t2]   \n\t"
1258            "movn   %[qc4], %[t4],  %[t3]   \n\t"
1259
1260            ".set pop                       \n\t"
1261
1262            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1263              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1264              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1265              [t4]"=&r"(t4)
1266        );
1267
1268        curidx  = 8 * qc1;
1269        curidx += qc2;
1270
1271        curidx2  = 8 * qc3;
1272        curidx2 += qc4;
1273
1274        curbits += p_bits[curidx] +
1275                   upair7_sign_bits[curidx] +
1276                   p_bits[curidx2] +
1277                   upair7_sign_bits[curidx2];
1278    }
1279    return curbits;
1280}
1281
1282static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1283                                           PutBitContext *pb, const float *in,
1284                                           const float *scaled, int size, int scale_idx,
1285                                           int cb, const float lambda, const float uplim,
1286                                           int *bits)
1287{
1288    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1289    int i;
1290    int qc1, qc2, qc3, qc4;
1291    int curbits = 0;
1292
1293    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1294
1295    for (i = 0; i < size; i += 4) {
1296        int curidx, curidx2;
1297        int t0, t1, t2, t3, t4;
1298
1299        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1300        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1301        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1302        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1303
1304        __asm__ volatile (
1305            ".set push                      \n\t"
1306            ".set noreorder                 \n\t"
1307
1308            "ori    %[t4],  $zero,  12      \n\t"
1309            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1310            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1311            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1312            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1313            "movn   %[qc1], %[t4],  %[t0]   \n\t"
1314            "movn   %[qc2], %[t4],  %[t1]   \n\t"
1315            "movn   %[qc3], %[t4],  %[t2]   \n\t"
1316            "movn   %[qc4], %[t4],  %[t3]   \n\t"
1317
1318            ".set pop                       \n\t"
1319
1320            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1321              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1322              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1323              [t4]"=&r"(t4)
1324        );
1325
1326        curidx  = 13 * qc1;
1327        curidx += qc2;
1328
1329        curidx2  = 13 * qc3;
1330        curidx2 += qc4;
1331
1332        curbits += p_bits[curidx] +
1333                   p_bits[curidx2] +
1334                   upair12_sign_bits[curidx] +
1335                   upair12_sign_bits[curidx2];
1336    }
1337    return curbits;
1338}
1339
1340static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1341                                       PutBitContext *pb, const float *in,
1342                                       const float *scaled, int size, int scale_idx,
1343                                       int cb, const float lambda, const float uplim,
1344                                       int *bits)
1345{
1346    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1347    int i;
1348    int qc1, qc2, qc3, qc4;
1349    int curbits = 0;
1350
1351    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1352
1353    for (i = 0; i < size; i += 4) {
1354        int curidx, curidx2;
1355        int cond0, cond1, cond2, cond3;
1356        int c1, c2, c3, c4;
1357        int t4, t5;
1358
1359        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1360        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1361        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1362        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1363
1364        __asm__ volatile (
1365            ".set push                                  \n\t"
1366            ".set noreorder                             \n\t"
1367
1368            "ori        %[t4],      $zero,  15          \n\t"
1369            "ori        %[t5],      $zero,  16          \n\t"
1370            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1371            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1372            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1373            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1374            "srl        %[c1],      %[c1],  18          \n\t"
1375            "srl        %[c2],      %[c2],  18          \n\t"
1376            "srl        %[c3],      %[c3],  18          \n\t"
1377            "srl        %[c4],      %[c4],  18          \n\t"
1378            "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
1379            "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
1380            "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
1381            "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
1382            "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
1383            "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
1384            "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
1385            "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
1386            "ori        %[t5],      $zero,  31          \n\t"
1387            "clz        %[c1],      %[c1]               \n\t"
1388            "clz        %[c2],      %[c2]               \n\t"
1389            "clz        %[c3],      %[c3]               \n\t"
1390            "clz        %[c4],      %[c4]               \n\t"
1391            "subu       %[c1],      %[t5],  %[c1]       \n\t"
1392            "subu       %[c2],      %[t5],  %[c2]       \n\t"
1393            "subu       %[c3],      %[t5],  %[c3]       \n\t"
1394            "subu       %[c4],      %[t5],  %[c4]       \n\t"
1395            "sll        %[c1],      %[c1],  1           \n\t"
1396            "sll        %[c2],      %[c2],  1           \n\t"
1397            "sll        %[c3],      %[c3],  1           \n\t"
1398            "sll        %[c4],      %[c4],  1           \n\t"
1399            "addiu      %[c1],      %[c1],  -3          \n\t"
1400            "addiu      %[c2],      %[c2],  -3          \n\t"
1401            "addiu      %[c3],      %[c3],  -3          \n\t"
1402            "addiu      %[c4],      %[c4],  -3          \n\t"
1403            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1404            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1405            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1406            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
1407            "and        %[c1],      %[c1],  %[cond0]    \n\t"
1408            "and        %[c2],      %[c2],  %[cond1]    \n\t"
1409            "and        %[c3],      %[c3],  %[cond2]    \n\t"
1410            "and        %[c4],      %[c4],  %[cond3]    \n\t"
1411
1412            ".set pop                                   \n\t"
1413
1414            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1415              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1416              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1417              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1418              [c1]"=&r"(c1), [c2]"=&r"(c2),
1419              [c3]"=&r"(c3), [c4]"=&r"(c4),
1420              [t4]"=&r"(t4), [t5]"=&r"(t5)
1421        );
1422
1423        curidx = 17 * qc1;
1424        curidx += qc2;
1425
1426        curidx2 = 17 * qc3;
1427        curidx2 += qc4;
1428
1429        curbits += p_bits[curidx];
1430        curbits += esc_sign_bits[curidx];
1431        curbits += p_bits[curidx2];
1432        curbits += esc_sign_bits[curidx2];
1433
1434        curbits += c1;
1435        curbits += c2;
1436        curbits += c3;
1437        curbits += c4;
1438    }
1439    return curbits;
1440}
1441
1442static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1443                                             PutBitContext *pb, const float *in,
1444                                             const float *scaled, int size, int scale_idx,
1445                                             int cb, const float lambda, const float uplim,
1446                                             int *bits) = {
1447    get_band_numbits_ZERO_mips,
1448    get_band_numbits_SQUAD_mips,
1449    get_band_numbits_SQUAD_mips,
1450    get_band_numbits_UQUAD_mips,
1451    get_band_numbits_UQUAD_mips,
1452    get_band_numbits_SPAIR_mips,
1453    get_band_numbits_SPAIR_mips,
1454    get_band_numbits_UPAIR7_mips,
1455    get_band_numbits_UPAIR7_mips,
1456    get_band_numbits_UPAIR12_mips,
1457    get_band_numbits_UPAIR12_mips,
1458    get_band_numbits_ESC_mips,
1459    get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
1460    get_band_numbits_ZERO_mips,
1461    get_band_numbits_ZERO_mips,
1462    get_band_numbits_ZERO_mips,
1463};
1464
/*
 * Call the bit-count estimator stored in get_band_numbits_arr for codebook cb.
 * Note that cb appears twice in the expansion (table index and argument).
 */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,              \
                         lambda, uplim, bits)                                 \
    get_band_numbits_arr[(cb)]((s), (pb), (in), (scaled), (size),             \
                               (scale_idx), (cb), (lambda), (uplim), (bits))
1471
1472static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
1473                                     const float *scaled, int size, int scale_idx,
1474                                     int cb, const float lambda, const float uplim,
1475                                     int *bits, float *energy, int rtz)
1476{
1477    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
1478}
1479
1480/**
1481 * Functions developed from template function and optimized for getting the band cost
1482 */
1483#if HAVE_MIPSFPU
1484static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1485                                     PutBitContext *pb, const float *in,
1486                                     const float *scaled, int size, int scale_idx,
1487                                     int cb, const float lambda, const float uplim,
1488                                     int *bits, float *energy)
1489{
1490    int i;
1491    float cost = 0;
1492
1493    for (i = 0; i < size; i += 4) {
1494        cost += in[i  ] * in[i  ];
1495        cost += in[i+1] * in[i+1];
1496        cost += in[i+2] * in[i+2];
1497        cost += in[i+3] * in[i+3];
1498    }
1499    if (bits)
1500        *bits = 0;
1501    if (energy)
1502        *energy = 0.0f;
1503    return cost * lambda;
1504}
1505
1506static float get_band_cost_NONE_mips(struct AACEncContext *s,
1507                                     PutBitContext *pb, const float *in,
1508                                     const float *scaled, int size, int scale_idx,
1509                                     int cb, const float lambda, const float uplim,
1510                                     int *bits, float *energy)
1511{
1512    av_assert0(0);
1513    return 0;
1514}
1515
1516static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1517                                      PutBitContext *pb, const float *in,
1518                                      const float *scaled, int size, int scale_idx,
1519                                      int cb, const float lambda, const float uplim,
1520                                      int *bits, float *energy)
1521{
1522    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1523    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1524    int i;
1525    float cost = 0;
1526    float qenergy = 0.0f;
1527    int qc1, qc2, qc3, qc4;
1528    int curbits = 0;
1529
1530    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1531    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1532
1533    for (i = 0; i < size; i += 4) {
1534        const float *vec;
1535        int curidx;
1536        int   *in_int = (int   *)&in[i];
1537        float *in_pos = (float *)&in[i];
1538        float di0, di1, di2, di3;
1539        int t0, t1, t2, t3, t4, t5, t6, t7;
1540
1541        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1542        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1543        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1544        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1545
1546        __asm__ volatile (
1547            ".set push                                  \n\t"
1548            ".set noreorder                             \n\t"
1549
1550            "slt        %[qc1], $zero,  %[qc1]          \n\t"
1551            "slt        %[qc2], $zero,  %[qc2]          \n\t"
1552            "slt        %[qc3], $zero,  %[qc3]          \n\t"
1553            "slt        %[qc4], $zero,  %[qc4]          \n\t"
1554            "lw         %[t0],  0(%[in_int])            \n\t"
1555            "lw         %[t1],  4(%[in_int])            \n\t"
1556            "lw         %[t2],  8(%[in_int])            \n\t"
1557            "lw         %[t3],  12(%[in_int])           \n\t"
1558            "srl        %[t0],  %[t0],  31              \n\t"
1559            "srl        %[t1],  %[t1],  31              \n\t"
1560            "srl        %[t2],  %[t2],  31              \n\t"
1561            "srl        %[t3],  %[t3],  31              \n\t"
1562            "subu       %[t4],  $zero,  %[qc1]          \n\t"
1563            "subu       %[t5],  $zero,  %[qc2]          \n\t"
1564            "subu       %[t6],  $zero,  %[qc3]          \n\t"
1565            "subu       %[t7],  $zero,  %[qc4]          \n\t"
1566            "movn       %[qc1], %[t4],  %[t0]           \n\t"
1567            "movn       %[qc2], %[t5],  %[t1]           \n\t"
1568            "movn       %[qc3], %[t6],  %[t2]           \n\t"
1569            "movn       %[qc4], %[t7],  %[t3]           \n\t"
1570
1571            ".set pop                                   \n\t"
1572
1573            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1574              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1575              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1576              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1577            : [in_int]"r"(in_int)
1578            : "memory"
1579        );
1580
1581        curidx = qc1;
1582        curidx *= 3;
1583        curidx += qc2;
1584        curidx *= 3;
1585        curidx += qc3;
1586        curidx *= 3;
1587        curidx += qc4;
1588        curidx += 40;
1589
1590        curbits += p_bits[curidx];
1591        vec     = &p_codes[curidx*4];
1592
1593        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1594                +  vec[2]*vec[2] + vec[3]*vec[3];
1595
1596        __asm__ volatile (
1597            ".set push                                  \n\t"
1598            ".set noreorder                             \n\t"
1599
1600            "lwc1       $f0,    0(%[in_pos])            \n\t"
1601            "lwc1       $f1,    0(%[vec])               \n\t"
1602            "lwc1       $f2,    4(%[in_pos])            \n\t"
1603            "lwc1       $f3,    4(%[vec])               \n\t"
1604            "lwc1       $f4,    8(%[in_pos])            \n\t"
1605            "lwc1       $f5,    8(%[vec])               \n\t"
1606            "lwc1       $f6,    12(%[in_pos])           \n\t"
1607            "lwc1       $f7,    12(%[vec])              \n\t"
1608            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1609            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1610            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1611            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1612
1613            ".set pop                                   \n\t"
1614
1615            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1616              [di2]"=&f"(di2), [di3]"=&f"(di3)
1617            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1618              [IQ]"f"(IQ)
1619            : "$f0", "$f1", "$f2", "$f3",
1620              "$f4", "$f5", "$f6", "$f7",
1621              "memory"
1622        );
1623
1624        cost += di0 * di0 + di1 * di1
1625                + di2 * di2 + di3 * di3;
1626    }
1627
1628    if (bits)
1629        *bits = curbits;
1630    if (energy)
1631        *energy = qenergy * (IQ*IQ);
1632    return cost * lambda + curbits;
1633}
1634
1635static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1636                                      PutBitContext *pb, const float *in,
1637                                      const float *scaled, int size, int scale_idx,
1638                                      int cb, const float lambda, const float uplim,
1639                                      int *bits, float *energy)
1640{
1641    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1642    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1643    int i;
1644    float cost = 0;
1645    float qenergy = 0.0f;
1646    int curbits = 0;
1647    int qc1, qc2, qc3, qc4;
1648
1649    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1650    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1651
1652    for (i = 0; i < size; i += 4) {
1653        const float *vec;
1654        int curidx;
1655        float *in_pos = (float *)&in[i];
1656        float di0, di1, di2, di3;
1657        int t0, t1, t2, t3, t4;
1658
1659        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1660        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1661        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1662        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1663
1664        __asm__ volatile (
1665            ".set push                                  \n\t"
1666            ".set noreorder                             \n\t"
1667
1668            "ori        %[t4],  $zero,  2               \n\t"
1669            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1670            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1671            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1672            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1673            "movn       %[qc1], %[t4],  %[t0]           \n\t"
1674            "movn       %[qc2], %[t4],  %[t1]           \n\t"
1675            "movn       %[qc3], %[t4],  %[t2]           \n\t"
1676            "movn       %[qc4], %[t4],  %[t3]           \n\t"
1677
1678            ".set pop                                   \n\t"
1679
1680            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1681              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1682              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1683              [t4]"=&r"(t4)
1684        );
1685
1686        curidx = qc1;
1687        curidx *= 3;
1688        curidx += qc2;
1689        curidx *= 3;
1690        curidx += qc3;
1691        curidx *= 3;
1692        curidx += qc4;
1693
1694        curbits += p_bits[curidx];
1695        curbits += uquad_sign_bits[curidx];
1696        vec     = &p_codes[curidx*4];
1697
1698        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1699                +  vec[2]*vec[2] + vec[3]*vec[3];
1700
1701        __asm__ volatile (
1702            ".set push                                  \n\t"
1703            ".set noreorder                             \n\t"
1704
1705            "lwc1       %[di0], 0(%[in_pos])            \n\t"
1706            "lwc1       %[di1], 4(%[in_pos])            \n\t"
1707            "lwc1       %[di2], 8(%[in_pos])            \n\t"
1708            "lwc1       %[di3], 12(%[in_pos])           \n\t"
1709            "abs.s      %[di0], %[di0]                  \n\t"
1710            "abs.s      %[di1], %[di1]                  \n\t"
1711            "abs.s      %[di2], %[di2]                  \n\t"
1712            "abs.s      %[di3], %[di3]                  \n\t"
1713            "lwc1       $f0,    0(%[vec])               \n\t"
1714            "lwc1       $f1,    4(%[vec])               \n\t"
1715            "lwc1       $f2,    8(%[vec])               \n\t"
1716            "lwc1       $f3,    12(%[vec])              \n\t"
1717            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1718            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1719            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1720            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1721
1722            ".set pop                                   \n\t"
1723
1724            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1725              [di2]"=&f"(di2), [di3]"=&f"(di3)
1726            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1727              [IQ]"f"(IQ)
1728            : "$f0", "$f1", "$f2", "$f3",
1729              "memory"
1730        );
1731
1732        cost += di0 * di0 + di1 * di1
1733                + di2 * di2 + di3 * di3;
1734    }
1735
1736    if (bits)
1737        *bits = curbits;
1738    if (energy)
1739        *energy = qenergy * (IQ*IQ);
1740    return cost * lambda + curbits;
1741}
1742
1743static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1744                                      PutBitContext *pb, const float *in,
1745                                      const float *scaled, int size, int scale_idx,
1746                                      int cb, const float lambda, const float uplim,
1747                                      int *bits, float *energy)
1748{
1749    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1750    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1751    int i;
1752    float cost = 0;
1753    float qenergy = 0.0f;
1754    int qc1, qc2, qc3, qc4;
1755    int curbits = 0;
1756
1757    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1758    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1759
1760    for (i = 0; i < size; i += 4) {
1761        const float *vec, *vec2;
1762        int curidx, curidx2;
1763        int   *in_int = (int   *)&in[i];
1764        float *in_pos = (float *)&in[i];
1765        float di0, di1, di2, di3;
1766        int t0, t1, t2, t3, t4, t5, t6, t7;
1767
1768        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1769        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1770        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1771        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1772
1773        __asm__ volatile (
1774            ".set push                                  \n\t"
1775            ".set noreorder                             \n\t"
1776
1777            "ori        %[t4],  $zero,  4               \n\t"
1778            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1779            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1780            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1781            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1782            "movn       %[qc1], %[t4],  %[t0]           \n\t"
1783            "movn       %[qc2], %[t4],  %[t1]           \n\t"
1784            "movn       %[qc3], %[t4],  %[t2]           \n\t"
1785            "movn       %[qc4], %[t4],  %[t3]           \n\t"
1786            "lw         %[t0],  0(%[in_int])            \n\t"
1787            "lw         %[t1],  4(%[in_int])            \n\t"
1788            "lw         %[t2],  8(%[in_int])            \n\t"
1789            "lw         %[t3],  12(%[in_int])           \n\t"
1790            "srl        %[t0],  %[t0],  31              \n\t"
1791            "srl        %[t1],  %[t1],  31              \n\t"
1792            "srl        %[t2],  %[t2],  31              \n\t"
1793            "srl        %[t3],  %[t3],  31              \n\t"
1794            "subu       %[t4],  $zero,  %[qc1]          \n\t"
1795            "subu       %[t5],  $zero,  %[qc2]          \n\t"
1796            "subu       %[t6],  $zero,  %[qc3]          \n\t"
1797            "subu       %[t7],  $zero,  %[qc4]          \n\t"
1798            "movn       %[qc1], %[t4],  %[t0]           \n\t"
1799            "movn       %[qc2], %[t5],  %[t1]           \n\t"
1800            "movn       %[qc3], %[t6],  %[t2]           \n\t"
1801            "movn       %[qc4], %[t7],  %[t3]           \n\t"
1802
1803            ".set pop                                   \n\t"
1804
1805            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1806              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1807              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1808              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1809            : [in_int]"r"(in_int)
1810            : "memory"
1811        );
1812
1813        curidx = 9 * qc1;
1814        curidx += qc2 + 40;
1815
1816        curidx2 = 9 * qc3;
1817        curidx2 += qc4 + 40;
1818
1819        curbits += p_bits[curidx];
1820        curbits += p_bits[curidx2];
1821
1822        vec     = &p_codes[curidx*2];
1823        vec2    = &p_codes[curidx2*2];
1824
1825        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1826                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
1827
1828        __asm__ volatile (
1829            ".set push                                  \n\t"
1830            ".set noreorder                             \n\t"
1831
1832            "lwc1       $f0,    0(%[in_pos])            \n\t"
1833            "lwc1       $f1,    0(%[vec])               \n\t"
1834            "lwc1       $f2,    4(%[in_pos])            \n\t"
1835            "lwc1       $f3,    4(%[vec])               \n\t"
1836            "lwc1       $f4,    8(%[in_pos])            \n\t"
1837            "lwc1       $f5,    0(%[vec2])              \n\t"
1838            "lwc1       $f6,    12(%[in_pos])           \n\t"
1839            "lwc1       $f7,    4(%[vec2])              \n\t"
1840            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1841            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1842            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1843            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1844
1845            ".set pop                                   \n\t"
1846
1847            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1848              [di2]"=&f"(di2), [di3]"=&f"(di3)
1849            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1850              [vec2]"r"(vec2), [IQ]"f"(IQ)
1851            : "$f0", "$f1", "$f2", "$f3",
1852              "$f4", "$f5", "$f6", "$f7",
1853              "memory"
1854        );
1855
1856        cost += di0 * di0 + di1 * di1
1857                + di2 * di2 + di3 * di3;
1858    }
1859
1860    if (bits)
1861        *bits = curbits;
1862    if (energy)
1863        *energy = qenergy * (IQ*IQ);
1864    return cost * lambda + curbits;
1865}
1866
1867static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1868                                       PutBitContext *pb, const float *in,
1869                                       const float *scaled, int size, int scale_idx,
1870                                       int cb, const float lambda, const float uplim,
1871                                       int *bits, float *energy)
1872{
1873    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1874    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1875    int i;
1876    float cost = 0;
1877    float qenergy = 0.0f;
1878    int qc1, qc2, qc3, qc4;
1879    int curbits = 0;
1880
1881    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1882    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1883
1884    for (i = 0; i < size; i += 4) {
1885        const float *vec, *vec2;
1886        int curidx, curidx2, sign1, count1, sign2, count2;
1887        int   *in_int = (int   *)&in[i];
1888        float *in_pos = (float *)&in[i];
1889        float di0, di1, di2, di3;
1890        int t0, t1, t2, t3, t4;
1891
1892        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1893        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1894        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1895        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1896
1897        __asm__ volatile (
1898            ".set push                                          \n\t"
1899            ".set noreorder                                     \n\t"
1900
1901            "ori        %[t4],      $zero,      7               \n\t"
1902            "ori        %[sign1],   $zero,      0               \n\t"
1903            "ori        %[sign2],   $zero,      0               \n\t"
1904            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
1905            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
1906            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
1907            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
1908            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
1909            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
1910            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
1911            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
1912            "lw         %[t0],      0(%[in_int])                \n\t"
1913            "lw         %[t1],      4(%[in_int])                \n\t"
1914            "lw         %[t2],      8(%[in_int])                \n\t"
1915            "lw         %[t3],      12(%[in_int])               \n\t"
1916            "slt        %[t0],      %[t0],      $zero           \n\t"
1917            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
1918            "slt        %[t2],      %[t2],      $zero           \n\t"
1919            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
1920            "slt        %[t1],      %[t1],      $zero           \n\t"
1921            "sll        %[t0],      %[sign1],   1               \n\t"
1922            "or         %[t0],      %[t0],      %[t1]           \n\t"
1923            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
1924            "slt        %[t3],      %[t3],      $zero           \n\t"
1925            "sll        %[t0],      %[sign2],   1               \n\t"
1926            "or         %[t0],      %[t0],      %[t3]           \n\t"
1927            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
1928            "slt        %[count1],  $zero,      %[qc1]          \n\t"
1929            "slt        %[t1],      $zero,      %[qc2]          \n\t"
1930            "slt        %[count2],  $zero,      %[qc3]          \n\t"
1931            "slt        %[t2],      $zero,      %[qc4]          \n\t"
1932            "addu       %[count1],  %[count1],  %[t1]           \n\t"
1933            "addu       %[count2],  %[count2],  %[t2]           \n\t"
1934
1935            ".set pop                                           \n\t"
1936
1937            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1938              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1939              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1940              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1941              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1942              [t4]"=&r"(t4)
1943            : [in_int]"r"(in_int)
1944            : "memory"
1945        );
1946
1947        curidx = 8 * qc1;
1948        curidx += qc2;
1949
1950        curidx2 = 8 * qc3;
1951        curidx2 += qc4;
1952
1953        curbits += p_bits[curidx];
1954        curbits += upair7_sign_bits[curidx];
1955        vec     = &p_codes[curidx*2];
1956
1957        curbits += p_bits[curidx2];
1958        curbits += upair7_sign_bits[curidx2];
1959        vec2    = &p_codes[curidx2*2];
1960
1961        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1962                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
1963
1964        __asm__ volatile (
1965            ".set push                                          \n\t"
1966            ".set noreorder                                     \n\t"
1967
1968            "lwc1       %[di0],     0(%[in_pos])                \n\t"
1969            "lwc1       %[di1],     4(%[in_pos])                \n\t"
1970            "lwc1       %[di2],     8(%[in_pos])                \n\t"
1971            "lwc1       %[di3],     12(%[in_pos])               \n\t"
1972            "abs.s      %[di0],     %[di0]                      \n\t"
1973            "abs.s      %[di1],     %[di1]                      \n\t"
1974            "abs.s      %[di2],     %[di2]                      \n\t"
1975            "abs.s      %[di3],     %[di3]                      \n\t"
1976            "lwc1       $f0,        0(%[vec])                   \n\t"
1977            "lwc1       $f1,        4(%[vec])                   \n\t"
1978            "lwc1       $f2,        0(%[vec2])                  \n\t"
1979            "lwc1       $f3,        4(%[vec2])                  \n\t"
1980            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1981            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1982            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1983            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1984
1985            ".set pop                                           \n\t"
1986
1987            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1988              [di2]"=&f"(di2), [di3]"=&f"(di3)
1989            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1990              [vec2]"r"(vec2), [IQ]"f"(IQ)
1991            : "$f0", "$f1", "$f2", "$f3",
1992              "memory"
1993        );
1994
1995        cost += di0 * di0 + di1 * di1
1996                + di2 * di2 + di3 * di3;
1997    }
1998
1999    if (bits)
2000        *bits = curbits;
2001    if (energy)
2002        *energy = qenergy * (IQ*IQ);
2003    return cost * lambda + curbits;
2004}
2005
2006static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
2007                                        PutBitContext *pb, const float *in,
2008                                        const float *scaled, int size, int scale_idx,
2009                                        int cb, const float lambda, const float uplim,
2010                                        int *bits, float *energy)
2011{
2012    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
2013    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
2014    int i;
2015    float cost = 0;
2016    float qenergy = 0.0f;
2017    int qc1, qc2, qc3, qc4;
2018    int curbits = 0;
2019
2020    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
2021    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
2022
2023    for (i = 0; i < size; i += 4) {
2024        const float *vec, *vec2;
2025        int curidx, curidx2;
2026        int sign1, count1, sign2, count2;
2027        int   *in_int = (int   *)&in[i];
2028        float *in_pos = (float *)&in[i];
2029        float di0, di1, di2, di3;
2030        int t0, t1, t2, t3, t4;
2031
2032        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
2033        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
2034        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
2035        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
2036
2037        __asm__ volatile (
2038            ".set push                                          \n\t"
2039            ".set noreorder                                     \n\t"
2040
2041            "ori        %[t4],      $zero,      12              \n\t"
2042            "ori        %[sign1],   $zero,      0               \n\t"
2043            "ori        %[sign2],   $zero,      0               \n\t"
2044            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
2045            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
2046            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
2047            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
2048            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
2049            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
2050            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
2051            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
2052            "lw         %[t0],      0(%[in_int])                \n\t"
2053            "lw         %[t1],      4(%[in_int])                \n\t"
2054            "lw         %[t2],      8(%[in_int])                \n\t"
2055            "lw         %[t3],      12(%[in_int])               \n\t"
2056            "slt        %[t0],      %[t0],      $zero           \n\t"
2057            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
2058            "slt        %[t2],      %[t2],      $zero           \n\t"
2059            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
2060            "slt        %[t1],      %[t1],      $zero           \n\t"
2061            "sll        %[t0],      %[sign1],   1               \n\t"
2062            "or         %[t0],      %[t0],      %[t1]           \n\t"
2063            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
2064            "slt        %[t3],      %[t3],      $zero           \n\t"
2065            "sll        %[t0],      %[sign2],   1               \n\t"
2066            "or         %[t0],      %[t0],      %[t3]           \n\t"
2067            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
2068            "slt        %[count1],  $zero,      %[qc1]          \n\t"
2069            "slt        %[t1],      $zero,      %[qc2]          \n\t"
2070            "slt        %[count2],  $zero,      %[qc3]          \n\t"
2071            "slt        %[t2],      $zero,      %[qc4]          \n\t"
2072            "addu       %[count1],  %[count1],  %[t1]           \n\t"
2073            "addu       %[count2],  %[count2],  %[t2]           \n\t"
2074
2075            ".set pop                                           \n\t"
2076
2077            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2078              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2079              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
2080              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
2081              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
2082              [t4]"=&r"(t4)
2083            : [in_int]"r"(in_int)
2084            : "memory"
2085        );
2086
2087        curidx = 13 * qc1;
2088        curidx += qc2;
2089
2090        curidx2 = 13 * qc3;
2091        curidx2 += qc4;
2092
2093        curbits += p_bits[curidx];
2094        curbits += p_bits[curidx2];
2095        curbits += upair12_sign_bits[curidx];
2096        curbits += upair12_sign_bits[curidx2];
2097        vec     = &p_codes[curidx*2];
2098        vec2    = &p_codes[curidx2*2];
2099
2100        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
2101                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
2102
2103        __asm__ volatile (
2104            ".set push                                          \n\t"
2105            ".set noreorder                                     \n\t"
2106
2107            "lwc1       %[di0],     0(%[in_pos])                \n\t"
2108            "lwc1       %[di1],     4(%[in_pos])                \n\t"
2109            "lwc1       %[di2],     8(%[in_pos])                \n\t"
2110            "lwc1       %[di3],     12(%[in_pos])               \n\t"
2111            "abs.s      %[di0],     %[di0]                      \n\t"
2112            "abs.s      %[di1],     %[di1]                      \n\t"
2113            "abs.s      %[di2],     %[di2]                      \n\t"
2114            "abs.s      %[di3],     %[di3]                      \n\t"
2115            "lwc1       $f0,        0(%[vec])                   \n\t"
2116            "lwc1       $f1,        4(%[vec])                   \n\t"
2117            "lwc1       $f2,        0(%[vec2])                  \n\t"
2118            "lwc1       $f3,        4(%[vec2])                  \n\t"
2119            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
2120            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
2121            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
2122            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
2123
2124            ".set pop                                           \n\t"
2125
2126            : [di0]"=&f"(di0), [di1]"=&f"(di1),
2127              [di2]"=&f"(di2), [di3]"=&f"(di3)
2128            : [in_pos]"r"(in_pos), [vec]"r"(vec),
2129              [vec2]"r"(vec2), [IQ]"f"(IQ)
2130            : "$f0", "$f1", "$f2", "$f3",
2131              "memory"
2132        );
2133
2134        cost += di0 * di0 + di1 * di1
2135                + di2 * di2 + di3 * di3;
2136    }
2137
2138    if (bits)
2139        *bits = curbits;
2140    if (energy)
2141        *energy = qenergy * (IQ*IQ);
2142    return cost * lambda + curbits;
2143}
2144
2145static float get_band_cost_ESC_mips(struct AACEncContext *s,
2146                                    PutBitContext *pb, const float *in,
2147                                    const float *scaled, int size, int scale_idx,
2148                                    int cb, const float lambda, const float uplim,
2149                                    int *bits, float *energy)
2150{
2151    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
2152    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
2153    const float CLIPPED_ESCAPE = 165140.0f * IQ;
2154    int i;
2155    float cost = 0;
2156    float qenergy = 0.0f;
2157    int qc1, qc2, qc3, qc4;
2158    int curbits = 0;
2159
2160    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
2161    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
2162
2163    for (i = 0; i < size; i += 4) {
2164        const float *vec, *vec2;
2165        int curidx, curidx2;
2166        float t1, t2, t3, t4, V;
2167        float di1, di2, di3, di4;
2168        int cond0, cond1, cond2, cond3;
2169        int c1, c2, c3, c4;
2170        int t6, t7;
2171
2172        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
2173        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
2174        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
2175        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
2176
2177        __asm__ volatile (
2178            ".set push                                  \n\t"
2179            ".set noreorder                             \n\t"
2180
2181            "ori        %[t6],      $zero,  15          \n\t"
2182            "ori        %[t7],      $zero,  16          \n\t"
2183            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
2184            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
2185            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
2186            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
2187            "srl        %[c1],      %[c1],  18          \n\t"
2188            "srl        %[c2],      %[c2],  18          \n\t"
2189            "srl        %[c3],      %[c3],  18          \n\t"
2190            "srl        %[c4],      %[c4],  18          \n\t"
2191            "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
2192            "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
2193            "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
2194            "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
2195            "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
2196            "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
2197            "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
2198            "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
2199
2200            ".set pop                                   \n\t"
2201
2202            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2203              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2204              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2205              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2206              [c1]"=&r"(c1), [c2]"=&r"(c2),
2207              [c3]"=&r"(c3), [c4]"=&r"(c4),
2208              [t6]"=&r"(t6), [t7]"=&r"(t7)
2209        );
2210
2211        curidx = 17 * qc1;
2212        curidx += qc2;
2213
2214        curidx2 = 17 * qc3;
2215        curidx2 += qc4;
2216
2217        curbits += p_bits[curidx];
2218        curbits += esc_sign_bits[curidx];
2219        vec     = &p_codes[curidx*2];
2220
2221        curbits += p_bits[curidx2];
2222        curbits += esc_sign_bits[curidx2];
2223        vec2     = &p_codes[curidx2*2];
2224
2225        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2226        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2227        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2228        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2229
2230        t1 = fabsf(in[i  ]);
2231        t2 = fabsf(in[i+1]);
2232        t3 = fabsf(in[i+2]);
2233        t4 = fabsf(in[i+3]);
2234
2235        if (cond0) {
2236            if (t1 >= CLIPPED_ESCAPE) {
2237                di1 = t1 - CLIPPED_ESCAPE;
2238                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2239            } else {
2240                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
2241                qenergy += V*V;
2242            }
2243        } else {
2244            di1 = t1 - (V = vec[0] * IQ);
2245            qenergy += V*V;
2246        }
2247
2248        if (cond1) {
2249            if (t2 >= CLIPPED_ESCAPE) {
2250                di2 = t2 - CLIPPED_ESCAPE;
2251                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2252            } else {
2253                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
2254                qenergy += V*V;
2255            }
2256        } else {
2257            di2 = t2 - (V = vec[1] * IQ);
2258            qenergy += V*V;
2259        }
2260
2261        if (cond2) {
2262            if (t3 >= CLIPPED_ESCAPE) {
2263                di3 = t3 - CLIPPED_ESCAPE;
2264                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2265            } else {
2266                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
2267                qenergy += V*V;
2268            }
2269        } else {
2270            di3 = t3 - (V = vec2[0] * IQ);
2271            qenergy += V*V;
2272        }
2273
2274        if (cond3) {
2275            if (t4 >= CLIPPED_ESCAPE) {
2276                di4 = t4 - CLIPPED_ESCAPE;
2277                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2278            } else {
2279                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
2280                qenergy += V*V;
2281            }
2282        } else {
2283            di4 = t4 - (V = vec2[1]*IQ);
2284            qenergy += V*V;
2285        }
2286
2287        cost += di1 * di1 + di2 * di2
2288                + di3 * di3 + di4 * di4;
2289    }
2290
2291    if (bits)
2292        *bits = curbits;
2293    return cost * lambda + curbits;
2294}
2295
2296static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2297                                          PutBitContext *pb, const float *in,
2298                                          const float *scaled, int size, int scale_idx,
2299                                          int cb, const float lambda, const float uplim,
2300                                          int *bits, float *energy) = {
2301    get_band_cost_ZERO_mips,
2302    get_band_cost_SQUAD_mips,
2303    get_band_cost_SQUAD_mips,
2304    get_band_cost_UQUAD_mips,
2305    get_band_cost_UQUAD_mips,
2306    get_band_cost_SPAIR_mips,
2307    get_band_cost_SPAIR_mips,
2308    get_band_cost_UPAIR7_mips,
2309    get_band_cost_UPAIR7_mips,
2310    get_band_cost_UPAIR12_mips,
2311    get_band_cost_UPAIR12_mips,
2312    get_band_cost_ESC_mips,
2313    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
2314    get_band_cost_ZERO_mips,
2315    get_band_cost_ZERO_mips,
2316    get_band_cost_ZERO_mips,
2317};
2318
/*
 * Dispatch to the cost estimator registered for codebook cb in
 * get_band_cost_arr. Note that cb is expanded twice (as the table index and
 * as an argument), so it must not have side effects.
 */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,                 \
                      lambda, uplim, bits, energy)                            \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,             \
                          lambda, uplim, bits, energy)
2325
2326static float quantize_band_cost(struct AACEncContext *s, const float *in,
2327                                const float *scaled, int size, int scale_idx,
2328                                int cb, const float lambda, const float uplim,
2329                                int *bits, float *energy, int rtz)
2330{
2331    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
2332}
2333
2334#include "libavcodec/aacenc_quantization_misc.h"
2335
2336#include "libavcodec/aaccoder_twoloop.h"
2337
2338static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
2339{
2340    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
2341    uint8_t nextband0[128], nextband1[128];
2342    float M[128], S[128];
2343    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2344    const float lambda = s->lambda;
2345    const float mslambda = FFMIN(1.0f, lambda / 120.f);
2346    SingleChannelElement *sce0 = &cpe->ch[0];
2347    SingleChannelElement *sce1 = &cpe->ch[1];
2348    if (!cpe->common_window)
2349        return;
2350
2351    /** Scout out next nonzero bands */
2352    ff_init_nextband_map(sce0, nextband0);
2353    ff_init_nextband_map(sce1, nextband1);
2354
2355    prev_mid = sce0->sf_idx[0];
2356    prev_side = sce1->sf_idx[0];
2357    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2358        start = 0;
2359        for (g = 0;  g < sce0->ics.num_swb; g++) {
2360            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
2361            if (!cpe->is_mask[w*16+g])
2362                cpe->ms_mask[w*16+g] = 0;
2363            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
2364                float Mmax = 0.0f, Smax = 0.0f;
2365
2366                /* Must compute mid/side SF and book for the whole window group */
2367                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2368                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
2369                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
2370                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
2371                        S[i] =  M[i]
2372                              - sce1->coeffs[start+(w+w2)*128+i];
2373                    }
2374                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
2375                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
2376                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
2377                        Mmax = FFMAX(Mmax, M34[i]);
2378                        Smax = FFMAX(Smax, S34[i]);
2379                    }
2380                }
2381
2382                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
2383                    float dist1 = 0.0f, dist2 = 0.0f;
2384                    int B0 = 0, B1 = 0;
2385                    int minidx;
2386                    int mididx, sididx;
2387                    int midcb, sidcb;
2388
2389                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
2390                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
2391                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
2392                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
2393                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
2394                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
2395                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
2396                        continue;
2397                    }
2398
2399                    midcb = find_min_book(Mmax, mididx);
2400                    sidcb = find_min_book(Smax, sididx);
2401
2402                    /* No CB can be zero */
2403                    midcb = FFMAX(1,midcb);
2404                    sidcb = FFMAX(1,sidcb);
2405
2406                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2407                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2408                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2409                        float minthr = FFMIN(band0->threshold, band1->threshold);
2410                        int b1,b2,b3,b4;
2411                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
2412                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
2413                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
2414                            S[i] =  M[i]
2415                                  - sce1->coeffs[start+(w+w2)*128+i];
2416                        }
2417
2418                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
2419                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
2420                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
2421                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
2422                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
2423                                                    L34,
2424                                                    sce0->ics.swb_sizes[g],
2425                                                    sce0->sf_idx[w*16+g],
2426                                                    sce0->band_type[w*16+g],
2427                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
2428                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
2429                                                    R34,
2430                                                    sce1->ics.swb_sizes[g],
2431                                                    sce1->sf_idx[w*16+g],
2432                                                    sce1->band_type[w*16+g],
2433                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
2434                        dist2 += quantize_band_cost(s, M,
2435                                                    M34,
2436                                                    sce0->ics.swb_sizes[g],
2437                                                    mididx,
2438                                                    midcb,
2439                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
2440                        dist2 += quantize_band_cost(s, S,
2441                                                    S34,
2442                                                    sce1->ics.swb_sizes[g],
2443                                                    sididx,
2444                                                    sidcb,
2445                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
2446                        B0 += b1+b2;
2447                        B1 += b3+b4;
2448                        dist1 -= b1+b2;
2449                        dist2 -= b3+b4;
2450                    }
2451                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
2452                    if (cpe->ms_mask[w*16+g]) {
2453                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
2454                            sce0->sf_idx[w*16+g] = mididx;
2455                            sce1->sf_idx[w*16+g] = sididx;
2456                            sce0->band_type[w*16+g] = midcb;
2457                            sce1->band_type[w*16+g] = sidcb;
2458                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
2459                            /* ms_mask unneeded, and it confuses some decoders */
2460                            cpe->ms_mask[w*16+g] = 0;
2461                        }
2462                        break;
2463                    } else if (B1 > B0) {
2464                        /* More boost won't fix this */
2465                        break;
2466                    }
2467                }
2468            }
2469            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
2470                prev_mid = sce0->sf_idx[w*16+g];
2471            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
2472                prev_side = sce1->sf_idx[w*16+g];
2473            start += sce0->ics.swb_sizes[g];
2474        }
2475    }
2476}
2477#endif /*HAVE_MIPSFPU */
2478
2479#include "libavcodec/aaccoder_trellis.h"
2480
2481#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
2482#endif /* HAVE_INLINE_ASM */
2483
2484void ff_aac_coder_init_mips(AACEncContext *c) {
2485#if HAVE_INLINE_ASM
2486#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
2487    AACCoefficientsEncoder *e = c->coder;
2488    int option = c->options.coder;
2489
2490    if (option == 2) {
2491        e->quantize_and_encode_band = quantize_and_encode_band_mips;
2492        e->encode_window_bands_info = codebook_trellis_rate;
2493#if HAVE_MIPSFPU
2494        e->search_for_quantizers    = search_for_quantizers_twoloop;
2495#endif /* HAVE_MIPSFPU */
2496    }
2497#if HAVE_MIPSFPU
2498    e->search_for_ms            = search_for_ms_mips;
2499#endif /* HAVE_MIPSFPU */
2500#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
2501#endif /* HAVE_INLINE_ASM */
2502}
2503