1/* 2 * Copyright (c) 2012 3 * MIPS Technologies, Inc., California. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * Author: Bojan Zivkovic (bojan@mips.com) 30 * 31 * MPEG Audio decoder optimized for MIPS floating-point architecture 32 * 33 * This file is part of FFmpeg. 
34 * 35 * FFmpeg is free software; you can redistribute it and/or 36 * modify it under the terms of the GNU Lesser General Public 37 * License as published by the Free Software Foundation; either 38 * version 2.1 of the License, or (at your option) any later version. 39 * 40 * FFmpeg is distributed in the hope that it will be useful, 41 * but WITHOUT ANY WARRANTY; without even the implied warranty of 42 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 43 * Lesser General Public License for more details. 44 * 45 * You should have received a copy of the GNU Lesser General Public 46 * License along with FFmpeg; if not, write to the Free Software 47 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 48 */ 49 50/** 51 * @file 52 * Reference: libavcodec/mpegaudiodsp_template.c 53 * libavcodec/dct32.c 54 */ 55 56#include <string.h> 57 58#include "config.h" 59#include "libavutil/mips/asmdefs.h" 60#include "libavcodec/mpegaudiodsp.h" 61 62#if HAVE_INLINE_ASM && HAVE_MIPSFPU 63#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 64 65static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window, 66 int *dither_state, float *samples, ptrdiff_t incr) 67{ 68 register const float *w, *w2, *p; 69 int j; 70 float *samples2; 71 float sum, sum2; 72 /* temporary variables */ 73 int incr1 = incr << 2; 74 int t_sample; 75 float in1, in2, in3, in4, in5, in6, in7, in8; 76 float *p2; 77 78 /* copy to avoid wrap */ 79 memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf)); 80 81 /** 82 * instructions are scheduled to minimize pipeline stall. 83 * use of round_sample function from the original code is 84 * changed with appropriate assembly instructions. 
85 */ 86 87 __asm__ volatile ( 88 "lwc1 %[sum], 0(%[dither_state]) \t\n" 89 "sll %[t_sample], %[incr1], 5 \t\n" 90 "sub %[t_sample], %[t_sample], %[incr1] \n\t" 91 "li %[j], 4 \t\n" 92 "lwc1 %[in1], 0(%[window]) \t\n" 93 "lwc1 %[in2], 16*4(%[synth_buf]) \t\n" 94 "sw $zero, 0(%[dither_state]) \t\n" 95 "lwc1 %[in3], 64*4(%[window]) \t\n" 96 "lwc1 %[in4], 80*4(%[synth_buf]) \t\n" 97 PTR_ADDU "%[samples2],%[samples], %[t_sample] \t\n" 98 "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" 99 "lwc1 %[in5], 128*4(%[window]) \t\n" 100 "lwc1 %[in6], 144*4(%[synth_buf]) \t\n" 101 "lwc1 %[in7], 192*4(%[window]) \t\n" 102 "madd.s %[sum], %[sum], %[in3], %[in4] \t\n" 103 "lwc1 %[in8], 208*4(%[synth_buf]) \t\n" 104 "lwc1 %[in1], 256*4(%[window]) \t\n" 105 "lwc1 %[in2], 272*4(%[synth_buf]) \t\n" 106 "madd.s %[sum], %[sum], %[in5], %[in6] \t\n" 107 "lwc1 %[in3], 320*4(%[window]) \t\n" 108 "lwc1 %[in4], 336*4(%[synth_buf]) \t\n" 109 "lwc1 %[in5], 384*4(%[window]) \t\n" 110 "madd.s %[sum], %[sum], %[in7], %[in8] \t\n" 111 "lwc1 %[in6], 400*4(%[synth_buf]) \t\n" 112 "lwc1 %[in7], 448*4(%[window]) \t\n" 113 "lwc1 %[in8], 464*4(%[synth_buf]) \t\n" 114 "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" 115 "lwc1 %[in1], 32*4(%[window]) \t\n" 116 "lwc1 %[in2], 48*4(%[synth_buf]) \t\n" 117 "madd.s %[sum], %[sum], %[in3], %[in4] \t\n" 118 "lwc1 %[in3], 96*4(%[window]) \t\n" 119 "lwc1 %[in4], 112*4(%[synth_buf]) \t\n" 120 "madd.s %[sum], %[sum], %[in5], %[in6] \t\n" 121 "lwc1 %[in5], 160*4(%[window]) \t\n" 122 "lwc1 %[in6], 176*4(%[synth_buf]) \t\n" 123 "madd.s %[sum], %[sum], %[in7], %[in8] \t\n" 124 "lwc1 %[in7], 224*4(%[window]) \t\n" 125 "lwc1 %[in8], 240*4(%[synth_buf]) \t\n" 126 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 127 "lwc1 %[in1], 288*4(%[window]) \t\n" 128 "lwc1 %[in2], 304*4(%[synth_buf]) \t\n" 129 "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" 130 "lwc1 %[in3], 352*4(%[window]) \t\n" 131 "lwc1 %[in4], 368*4(%[synth_buf]) \t\n" 132 "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" 133 
"lwc1 %[in5], 416*4(%[window]) \t\n" 134 "lwc1 %[in6], 432*4(%[synth_buf]) \t\n" 135 "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" 136 "lwc1 %[in7], 480*4(%[window]) \t\n" 137 "lwc1 %[in8], 496*4(%[synth_buf]) \t\n" 138 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 139 PTR_ADDU "%[w], %[window], 4 \t\n" 140 "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" 141 PTR_ADDU "%[w2], %[window], 124 \t\n" 142 PTR_ADDIU "%[p], %[synth_buf], 68 \t\n" 143 PTR_ADDIU "%[p2], %[synth_buf], 188 \t\n" 144 "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" 145 "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" 146 "swc1 %[sum], 0(%[samples]) \t\n" 147 PTR_ADDU "%[samples], %[samples], %[incr1] \t\n" 148 149 /* calculate two samples at the same time to avoid one memory 150 access per two sample */ 151 152 "ff_mpadsp_apply_window_loop%=: \t\n" 153 "lwc1 %[in1], 0(%[w]) \t\n" 154 "lwc1 %[in2], 0(%[p]) \t\n" 155 "lwc1 %[in3], 0(%[w2]) \t\n" 156 "lwc1 %[in4], 64*4(%[w]) \t\n" 157 "lwc1 %[in5], 64*4(%[p]) \t\n" 158 "lwc1 %[in6], 64*4(%[w2]) \t\n" 159 "mul.s %[sum], %[in1], %[in2] \t\n" 160 "mul.s %[sum2], %[in2], %[in3] \t\n" 161 "lwc1 %[in1], 128*4(%[w]) \t\n" 162 "lwc1 %[in2], 128*4(%[p]) \t\n" 163 "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" 164 "nmadd.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 165 "lwc1 %[in3], 128*4(%[w2]) \t\n" 166 "lwc1 %[in4], 192*4(%[w]) \t\n" 167 "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" 168 "lwc1 %[in5], 192*4(%[p]) \t\n" 169 "lwc1 %[in6], 192*4(%[w2]) \t\n" 170 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 171 "lwc1 %[in1], 256*4(%[w]) \t\n" 172 "lwc1 %[in2], 256*4(%[p]) \t\n" 173 "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" 174 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 175 "lwc1 %[in3], 256*4(%[w2]) \t\n" 176 "lwc1 %[in4], 320*4(%[w]) \t\n" 177 "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" 178 "lwc1 %[in5], 320*4(%[p]) \t\n" 179 "lwc1 %[in6], 320*4(%[w2]) \t\n" 180 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 181 "lwc1 %[in1], 384*4(%[w]) \t\n" 182 "lwc1 
%[in2], 384*4(%[p]) \t\n" 183 "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" 184 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 185 "lwc1 %[in3], 384*4(%[w2]) \t\n" 186 "lwc1 %[in4], 448*4(%[w]) \t\n" 187 "madd.s %[sum], %[sum], %[in1], %[in2] \t\n" 188 "lwc1 %[in5], 448*4(%[p]) \t\n" 189 "lwc1 %[in6], 448*4(%[w2]) \t\n" 190 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 191 "madd.s %[sum], %[sum], %[in4], %[in5] \t\n" 192 "lwc1 %[in1], 32*4(%[w]) \t\n" 193 "lwc1 %[in2], 0(%[p2]) \t\n" 194 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 195 "lwc1 %[in3], 32*4(%[w2]) \t\n" 196 "lwc1 %[in4], 96*4(%[w]) \t\n" 197 "lwc1 %[in5], 64*4(%[p2]) \t\n" 198 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 199 "lwc1 %[in6], 96*4(%[w2]) \t\n" 200 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 201 "lwc1 %[in1], 160*4(%[w]) \t\n" 202 "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" 203 "lwc1 %[in2], 128*4(%[p2]) \t\n" 204 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 205 "lwc1 %[in3], 160*4(%[w2]) \t\n" 206 "lwc1 %[in4], 224*4(%[w]) \t\n" 207 "lwc1 %[in5], 192*4(%[p2]) \t\n" 208 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 209 "lwc1 %[in6], 224*4(%[w2]) \t\n" 210 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 211 "lwc1 %[in1], 288*4(%[w]) \t\n" 212 "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" 213 "lwc1 %[in2], 256*4(%[p2]) \t\n" 214 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 215 "lwc1 %[in3], 288*4(%[w2]) \t\n" 216 "lwc1 %[in4], 352*4(%[w]) \t\n" 217 "lwc1 %[in5], 320*4(%[p2]) \t\n" 218 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 219 "lwc1 %[in6], 352*4(%[w2]) \t\n" 220 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 221 "lwc1 %[in1], 416*4(%[w]) \t\n" 222 "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" 223 "lwc1 %[in2], 384*4(%[p2]) \t\n" 224 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 225 "lwc1 %[in3], 416*4(%[w2]) \t\n" 226 "lwc1 %[in4], 480*4(%[w]) \t\n" 227 "lwc1 %[in5], 448*4(%[p2]) \t\n" 228 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 229 "lwc1 %[in6], 
480*4(%[w2]) \t\n" 230 "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n" 231 PTR_ADDIU "%[w], %[w], 4 \t\n" 232 "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n" 233 PTR_ADDIU "%[w2], %[w2], -4 \t\n" 234 "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n" 235 "addu %[j], %[j], 4 \t\n" 236 PTR_ADDIU "%[p], 4 \t\n" 237 "swc1 %[sum], 0(%[samples]) \t\n" 238 PTR_ADDIU "%[p2], -4 \t\n" 239 "swc1 %[sum2], 0(%[samples2]) \t\n" 240 PTR_ADDU "%[samples], %[samples], %[incr1] \t\n" 241 PTR_SUBU "%[samples2],%[samples2], %[incr1] \t\n" 242 "bne %[j], 64, ff_mpadsp_apply_window_loop%= \t\n" 243 244 "lwc1 %[in1], 48*4(%[window]) \t\n" 245 "lwc1 %[in2], 32*4(%[synth_buf]) \t\n" 246 "lwc1 %[in3], 112*4(%[window]) \t\n" 247 "lwc1 %[in4], 96*4(%[synth_buf]) \t\n" 248 "lwc1 %[in5], 176*4(%[window]) \t\n" 249 "lwc1 %[in6], 160*4(%[synth_buf]) \t\n" 250 "mul.s %[sum], %[in1], %[in2] \t\n" 251 "lwc1 %[in7], 240*4(%[window]) \t\n" 252 "lwc1 %[in8], 224*4(%[synth_buf]) \t\n" 253 "lwc1 %[in1], 304*4(%[window]) \t\n" 254 "nmadd.s %[sum], %[sum], %[in3], %[in4] \t\n" 255 "lwc1 %[in2], 288*4(%[synth_buf]) \t\n" 256 "lwc1 %[in3], 368*4(%[window]) \t\n" 257 "lwc1 %[in4], 352*4(%[synth_buf]) \t\n" 258 "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" 259 "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" 260 "lwc1 %[in5], 432*4(%[window]) \t\n" 261 "lwc1 %[in6], 416*4(%[synth_buf]) \t\n" 262 "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n" 263 "lwc1 %[in7], 496*4(%[window]) \t\n" 264 "lwc1 %[in8], 480*4(%[synth_buf]) \t\n" 265 "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n" 266 "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n" 267 "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n" 268 "swc1 %[sum], 0(%[samples]) \t\n" 269 270 : [sum] "=&f" (sum), [sum2] "=&f" (sum2), 271 [w2] "=&r" (w2), [w] "=&r" (w), 272 [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j), 273 [samples] "+r" (samples), [samples2] "=&r" (samples2), 274 [in1] "=&f" (in1), [in2] "=&f" (in2), 275 [in3] "=&f" (in3), [in4] "=&f" (in4), 276 [in5] "=&f" (in5), [in6] "=&f" 
(in6), 277 [in7] "=&f" (in7), [in8] "=&f" (in8), 278 [t_sample] "=&r" (t_sample) 279 : [synth_buf] "r" (synth_buf), [window] "r" (window), 280 [dither_state] "r" (dither_state), [incr1] "r" (incr1) 281 : "memory" 282 ); 283} 284 285static void ff_dct32_mips_float(float *out, const float *tab) 286{ 287 float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7, 288 val8 , val9 , val10, val11, val12, val13, val14, val15, 289 val16, val17, val18, val19, val20, val21, val22, val23, 290 val24, val25, val26, val27, val28, val29, val30, val31; 291 float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp8, fTmp9; 292 float f1, f2, f3, f4, f5, f6, f7; 293 294 f1 = 0.50241928618815570551; 295 f2 = 0.50060299823519630134; 296 f3 = 10.19000812354805681150; 297 f4 = 5.10114861868916385802; 298 f5 = 0.67480834145500574602; 299 f6 = 0.74453627100229844977; 300 f7 = 0.50979557910415916894; 301 /** 302 * instructions are scheduled to minimize pipeline stall. 303 */ 304 __asm__ volatile ( 305 "lwc1 %[fTmp1], 0*4(%[tab]) \n\t" 306 "lwc1 %[fTmp2], 31*4(%[tab]) \n\t" 307 "lwc1 %[fTmp3], 15*4(%[tab]) \n\t" 308 "lwc1 %[fTmp4], 16*4(%[tab]) \n\t" 309 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 310 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 311 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 312 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 313 "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" 314 "add.s %[val0], %[fTmp5], %[fTmp6] \n\t" 315 "sub.s %[val15], %[fTmp5], %[fTmp6] \n\t" 316 "lwc1 %[fTmp1], 7*4(%[tab]) \n\t" 317 "lwc1 %[fTmp2], 24*4(%[tab]) \n\t" 318 "madd.s %[val16], %[fTmp8], %[fTmp9], %[f3] \n\t" 319 "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[f3] \n\t" 320 "mul.s %[val15], %[val15], %[f1] \n\t" 321 "lwc1 %[fTmp3], 8*4(%[tab]) \n\t" 322 "lwc1 %[fTmp4], 23*4(%[tab]) \n\t" 323 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 324 "mul.s %[val31], %[val31], %[f1] \n\t" 325 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 326 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 327 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 328 "add.s 
%[val7], %[fTmp5], %[fTmp6] \n\t" 329 "sub.s %[val8], %[fTmp5], %[fTmp6] \n\t" 330 "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" 331 "sub.s %[fTmp2], %[val0], %[val7] \n\t" 332 "mul.s %[val8], %[val8], %[f4] \n\t" 333 "madd.s %[val23], %[fTmp8], %[fTmp9], %[f6] \n\t" 334 "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[f6] \n\t" 335 "add.s %[val0], %[val0], %[val7] \n\t" 336 "mul.s %[val7], %[f7], %[fTmp2] \n\t" 337 "sub.s %[fTmp2], %[val15], %[val8] \n\t" 338 "add.s %[val8], %[val15], %[val8] \n\t" 339 "mul.s %[val24], %[val24], %[f4] \n\t" 340 "sub.s %[fTmp3], %[val16], %[val23] \n\t" 341 "add.s %[val16], %[val16], %[val23] \n\t" 342 "mul.s %[val15], %[f7], %[fTmp2] \n\t" 343 "sub.s %[fTmp4], %[val31], %[val24] \n\t" 344 "mul.s %[val23], %[f7], %[fTmp3] \n\t" 345 "add.s %[val24], %[val31], %[val24] \n\t" 346 "mul.s %[val31], %[f7], %[fTmp4] \n\t" 347 348 : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), 349 [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), 350 [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), 351 [val0] "=&f" (val0), [val7] "=&f" (val7), 352 [val8] "=&f" (val8), [val15] "=&f" (val15), 353 [val16] "=&f" (val16), [val23] "=&f" (val23), 354 [val24] "=&f" (val24), [val31] "=&f" (val31) 355 : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), 356 [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) 357 : "memory" 358 ); 359 360 f1 = 0.64682178335999012954; 361 f2 = 0.53104259108978417447; 362 f3 = 1.48416461631416627724; 363 f4 = 0.78815462345125022473; 364 f5 = 0.55310389603444452782; 365 f6 = 1.16943993343288495515; 366 f7 = 2.56291544774150617881; 367 __asm__ volatile ( 368 "lwc1 %[fTmp1], 3*4(%[tab]) \n\t" 369 "lwc1 %[fTmp2], 28*4(%[tab]) \n\t" 370 "lwc1 %[fTmp3], 12*4(%[tab]) \n\t" 371 "lwc1 %[fTmp4], 19*4(%[tab]) \n\t" 372 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 373 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 374 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 375 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 376 "mul.s %[fTmp8], 
%[fTmp8], %[f2] \n\t" 377 "add.s %[val3], %[fTmp5], %[fTmp6] \n\t" 378 "sub.s %[val12], %[fTmp5], %[fTmp6] \n\t" 379 "lwc1 %[fTmp1], 4*4(%[tab]) \n\t" 380 "lwc1 %[fTmp2], 27*4(%[tab]) \n\t" 381 "madd.s %[val19], %[fTmp8], %[fTmp9], %[f3] \n\t" 382 "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[f3] \n\t" 383 "mul.s %[val12], %[val12], %[f1] \n\t" 384 "lwc1 %[fTmp3], 11*4(%[tab]) \n\t" 385 "lwc1 %[fTmp4], 20*4(%[tab]) \n\t" 386 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 387 "mul.s %[val28], %[val28], %[f1] \n\t" 388 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 389 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 390 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 391 "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" 392 "add.s %[val4], %[fTmp5], %[fTmp6] \n\t" 393 "sub.s %[val11], %[fTmp5], %[fTmp6] \n\t" 394 "madd.s %[val20], %[fTmp8], %[fTmp9], %[f6] \n\t" 395 "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[f6] \n\t" 396 "mul.s %[val11], %[val11], %[f4] \n\t" 397 "sub.s %[fTmp2], %[val3], %[val4] \n\t" 398 "add.s %[val3], %[val3], %[val4] \n\t" 399 "sub.s %[fTmp4], %[val19], %[val20] \n\t" 400 "mul.s %[val27], %[val27], %[f4] \n\t" 401 "sub.s %[fTmp3], %[val12], %[val11] \n\t" 402 "mul.s %[val4], %[f7], %[fTmp2] \n\t" 403 "add.s %[val11], %[val12], %[val11] \n\t" 404 "add.s %[val19], %[val19], %[val20] \n\t" 405 "mul.s %[val20], %[f7], %[fTmp4] \n\t" 406 "mul.s %[val12], %[f7], %[fTmp3] \n\t" 407 "sub.s %[fTmp2], %[val28], %[val27] \n\t" 408 "add.s %[val27], %[val28], %[val27] \n\t" 409 "mul.s %[val28], %[f7], %[fTmp2] \n\t" 410 411 : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), 412 [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), 413 [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), 414 [val3] "=&f" (val3), [val4] "=&f" (val4), 415 [val11] "=&f" (val11), [val12] "=&f" (val12), 416 [val19] "=&f" (val19), [val20] "=&f" (val20), 417 [val27] "=&f" (val27), [val28] "=&f" (val28) 418 : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), 419 [f4]"f"(f4), [f5]"f"(f5), 
[f6]"f"(f6), [f7]"f"(f7) 420 : "memory" 421 ); 422 423 f1 = 0.54119610014619698439; 424 __asm__ volatile ( 425 "sub.s %[fTmp2], %[val0], %[val3] \n\t" 426 "add.s %[val0], %[val0], %[val3] \n\t" 427 "sub.s %[fTmp3], %[val7], %[val4] \n\t" 428 "add.s %[val4], %[val7], %[val4] \n\t" 429 "sub.s %[fTmp4], %[val8], %[val11] \n\t" 430 "mul.s %[val3], %[f1], %[fTmp2] \n\t" 431 "add.s %[val8], %[val8], %[val11] \n\t" 432 "mul.s %[val7], %[f1], %[fTmp3] \n\t" 433 "sub.s %[fTmp2], %[val15], %[val12] \n\t" 434 "mul.s %[val11], %[f1], %[fTmp4] \n\t" 435 "add.s %[val12], %[val15], %[val12] \n\t" 436 "mul.s %[val15], %[f1], %[fTmp2] \n\t" 437 438 : [val0] "+&f" (val0), [val3] "+&f" (val3), 439 [val4] "+&f" (val4), [val7] "+&f" (val7), 440 [val8] "+&f" (val8), [val11] "+&f" (val11), 441 [val12] "+&f" (val12), [val15] "+&f" (val15), 442 [fTmp2] "=&f" (fTmp2), 443 [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4) 444 : [f1] "f" (f1) 445 ); 446 447 __asm__ volatile ( 448 "sub.s %[fTmp2], %[val16], %[val19] \n\t" 449 "add.s %[val16], %[val16], %[val19] \n\t" 450 "sub.s %[fTmp3], %[val23], %[val20] \n\t" 451 "add.s %[val20], %[val23], %[val20] \n\t" 452 "sub.s %[fTmp4], %[val24], %[val27] \n\t" 453 "mul.s %[val19], %[f1], %[fTmp2] \n\t" 454 "add.s %[val24], %[val24], %[val27] \n\t" 455 "mul.s %[val23], %[f1], %[fTmp3] \n\t" 456 "sub.s %[fTmp2], %[val31], %[val28] \n\t" 457 "mul.s %[val27], %[f1], %[fTmp4] \n\t" 458 "add.s %[val28], %[val31], %[val28] \n\t" 459 "mul.s %[val31], %[f1], %[fTmp2] \n\t" 460 461 : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 462 [val16] "+&f" (val16), [val19] "+&f" (val19), [val20] "+&f" (val20), 463 [val23] "+&f" (val23), [val24] "+&f" (val24), [val27] "+&f" (val27), 464 [val28] "+&f" (val28), [val31] "+&f" (val31) 465 : [f1] "f" (f1) 466 ); 467 468 f1 = 0.52249861493968888062; 469 f2 = 0.50547095989754365998; 470 f3 = 3.40760841846871878570; 471 f4 = 1.72244709823833392782; 472 f5 = 0.62250412303566481615; 473 f6 = 
0.83934964541552703873; 474 f7 = 0.60134488693504528054; 475 __asm__ volatile ( 476 "lwc1 %[fTmp1], 1*4(%[tab]) \n\t" 477 "lwc1 %[fTmp2], 30*4(%[tab]) \n\t" 478 "lwc1 %[fTmp3], 14*4(%[tab]) \n\t" 479 "lwc1 %[fTmp4], 17*4(%[tab]) \n\t" 480 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 481 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 482 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 483 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 484 "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" 485 "add.s %[val1], %[fTmp5], %[fTmp6] \n\t" 486 "sub.s %[val14], %[fTmp5], %[fTmp6] \n\t" 487 "lwc1 %[fTmp1], 6*4(%[tab]) \n\t" 488 "lwc1 %[fTmp2], 25*4(%[tab]) \n\t" 489 "madd.s %[val17], %[fTmp8], %[fTmp9], %[f3] \n\t" 490 "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[f3] \n\t" 491 "mul.s %[val14], %[val14], %[f1] \n\t" 492 "lwc1 %[fTmp3], 9*4(%[tab]) \n\t" 493 "lwc1 %[fTmp4], 22*4(%[tab]) \n\t" 494 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 495 "mul.s %[val30], %[val30], %[f1] \n\t" 496 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 497 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 498 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 499 "add.s %[val6], %[fTmp5], %[fTmp6] \n\t" 500 "sub.s %[val9], %[fTmp5], %[fTmp6] \n\t" 501 "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" 502 "sub.s %[fTmp2], %[val1], %[val6] \n\t" 503 "add.s %[val1], %[val1], %[val6] \n\t" 504 "mul.s %[val9], %[val9], %[f4] \n\t" 505 "madd.s %[val22], %[fTmp8], %[fTmp9], %[f6] \n\t" 506 "nmsub.s %[val25], %[fTmp8], %[fTmp9], %[f6] \n\t" 507 "mul.s %[val6], %[f7], %[fTmp2] \n\t" 508 "sub.s %[fTmp2], %[val14], %[val9] \n\t" 509 "add.s %[val9], %[val14], %[val9] \n\t" 510 "mul.s %[val25], %[val25], %[f4] \n\t" 511 "sub.s %[fTmp3], %[val17], %[val22] \n\t" 512 "add.s %[val17], %[val17], %[val22] \n\t" 513 "mul.s %[val14], %[f7], %[fTmp2] \n\t" 514 "sub.s %[fTmp2], %[val30], %[val25] \n\t" 515 "mul.s %[val22], %[f7], %[fTmp3] \n\t" 516 "add.s %[val25], %[val30], %[val25] \n\t" 517 "mul.s %[val30], %[f7], %[fTmp2] \n\t" 518 519 : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), 
[fTmp3] "=&f" (fTmp3), 520 [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), 521 [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), 522 [val1] "=&f" (val1), [val6] "=&f" (val6), 523 [val9] "=&f" (val9), [val14] "=&f" (val14), 524 [val17] "=&f" (val17), [val22] "=&f" (val22), 525 [val25] "=&f" (val25), [val30] "=&f" (val30) 526 : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), 527 [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) 528 : "memory" 529 ); 530 531 f1 = 0.56694403481635770368; 532 f2 = 0.51544730992262454697; 533 f3 = 2.05778100995341155085; 534 f4 = 1.06067768599034747134; 535 f5 = 0.58293496820613387367; 536 f6 = 0.97256823786196069369; 537 f7 = 0.89997622313641570463; 538 __asm__ volatile ( 539 "lwc1 %[fTmp1], 2*4(%[tab]) \n\t" 540 "lwc1 %[fTmp2], 29*4(%[tab]) \n\t" 541 "lwc1 %[fTmp3], 13*4(%[tab]) \n\t" 542 "lwc1 %[fTmp4], 18*4(%[tab]) \n\t" 543 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 544 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 545 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 546 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 547 "mul.s %[fTmp8], %[fTmp8], %[f2] \n\t" 548 "add.s %[val2], %[fTmp5], %[fTmp6] \n\t" 549 "sub.s %[val13], %[fTmp5], %[fTmp6] \n\t" 550 "lwc1 %[fTmp1], 5*4(%[tab]) \n\t" 551 "lwc1 %[fTmp2], 26*4(%[tab]) \n\t" 552 "madd.s %[val18], %[fTmp8], %[fTmp9], %[f3] \n\t" 553 "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[f3] \n\t" 554 "mul.s %[val13], %[val13], %[f1] \n\t" 555 "lwc1 %[fTmp3], 10*4(%[tab]) \n\t" 556 "lwc1 %[fTmp4], 21*4(%[tab]) \n\t" 557 "mul.s %[val29], %[val29], %[f1] \n\t" 558 "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t" 559 "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t" 560 "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t" 561 "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t" 562 "add.s %[val5], %[fTmp5], %[fTmp6] \n\t" 563 "sub.s %[val10], %[fTmp5], %[fTmp6] \n\t" 564 "mul.s %[fTmp8], %[fTmp8], %[f5] \n\t" 565 "sub.s %[fTmp2], %[val2], %[val5] \n\t" 566 "mul.s %[val10], %[val10], %[f4] \n\t" 567 "madd.s %[val21], %[fTmp8], 
%[fTmp9], %[f6] \n\t" 568 "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[f6] \n\t" 569 "add.s %[val2], %[val2], %[val5] \n\t" 570 "mul.s %[val5], %[f7], %[fTmp2] \n\t" 571 "sub.s %[fTmp3], %[val13], %[val10] \n\t" 572 "add.s %[val10], %[val13], %[val10] \n\t" 573 "mul.s %[val26], %[val26], %[f4] \n\t" 574 "sub.s %[fTmp4], %[val18], %[val21] \n\t" 575 "add.s %[val18], %[val18], %[val21] \n\t" 576 "mul.s %[val13], %[f7], %[fTmp3] \n\t" 577 "sub.s %[fTmp2], %[val29], %[val26] \n\t" 578 "add.s %[val26], %[val29], %[val26] \n\t" 579 "mul.s %[val21], %[f7], %[fTmp4] \n\t" 580 "mul.s %[val29], %[f7], %[fTmp2] \n\t" 581 582 : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), 583 [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6), 584 [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9), 585 [val2] "=&f" (val2), [val5] "=&f" (val5), 586 [val10] "=&f" (val10), [val13] "=&f" (val13), 587 [val18] "=&f" (val18), [val21] "=&f" (val21), 588 [val26] "=&f" (val26), [val29] "=&f" (val29) 589 : [tab] "r" (tab), [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), 590 [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7) 591 : "memory" 592 ); 593 594 f1 = 1.30656296487637652785; 595 __asm__ volatile ( 596 "sub.s %[fTmp2], %[val1], %[val2] \n\t" 597 "add.s %[val1], %[val1], %[val2] \n\t" 598 "sub.s %[fTmp3], %[val6], %[val5] \n\t" 599 "add.s %[val5], %[val6], %[val5] \n\t" 600 "sub.s %[fTmp4], %[val9], %[val10] \n\t" 601 "mul.s %[val2], %[f1], %[fTmp2] \n\t" 602 "add.s %[val9], %[val9], %[val10] \n\t" 603 "mul.s %[val6], %[f1], %[fTmp3] \n\t" 604 "sub.s %[fTmp2], %[val14], %[val13] \n\t" 605 "mul.s %[val10], %[f1], %[fTmp4] \n\t" 606 "add.s %[val13], %[val14], %[val13] \n\t" 607 "mul.s %[val14], %[f1], %[fTmp2] \n\t" 608 609 : [fTmp2] "=&f" (fTmp2), 610 [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 611 [val1] "+&f" (val1), [val2] "+&f" (val2), 612 [val5] "+&f" (val5), [val6] "+&f" (val6), 613 [val9] "+&f" (val9), [val10] "+&f" (val10), 614 [val13] "+&f" (val13), [val14] "+&f" 
(val14) 615 : [f1]"f"(f1) 616 ); 617 618 __asm__ volatile ( 619 "sub.s %[fTmp2], %[val17], %[val18] \n\t" 620 "add.s %[val17], %[val17], %[val18] \n\t" 621 "sub.s %[fTmp3], %[val22], %[val21] \n\t" 622 "add.s %[val21], %[val22], %[val21] \n\t" 623 "sub.s %[fTmp4], %[val25], %[val26] \n\t" 624 "mul.s %[val18], %[f1], %[fTmp2] \n\t" 625 "add.s %[val25], %[val25], %[val26] \n\t" 626 "mul.s %[val22], %[f1], %[fTmp3] \n\t" 627 "sub.s %[fTmp2], %[val30], %[val29] \n\t" 628 "mul.s %[val26], %[f1], %[fTmp4] \n\t" 629 "add.s %[val29], %[val30], %[val29] \n\t" 630 "mul.s %[val30], %[f1], %[fTmp2] \n\t" 631 632 : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 633 [val17] "+&f" (val17), [val18] "+&f" (val18), [val21] "+&f" (val21), 634 [val22] "+&f" (val22), [val25] "+&f" (val25), [val26] "+&f" (val26), 635 [val29] "+&f" (val29), [val30] "+&f" (val30) 636 : [f1] "f" (f1) 637 ); 638 639 f1 = 0.70710678118654752439; 640 __asm__ volatile ( 641 "sub.s %[fTmp2], %[val0], %[val1] \n\t" 642 "add.s %[val0], %[val0], %[val1] \n\t" 643 "sub.s %[fTmp3], %[val3], %[val2] \n\t" 644 "add.s %[val2], %[val3], %[val2] \n\t" 645 "sub.s %[fTmp4], %[val4], %[val5] \n\t" 646 "mul.s %[val1], %[f1], %[fTmp2] \n\t" 647 "swc1 %[val0], 0(%[out]) \n\t" 648 "mul.s %[val3], %[fTmp3], %[f1] \n\t" 649 "add.s %[val4], %[val4], %[val5] \n\t" 650 "mul.s %[val5], %[f1], %[fTmp4] \n\t" 651 "swc1 %[val1], 16*4(%[out]) \n\t" 652 "sub.s %[fTmp2], %[val7], %[val6] \n\t" 653 "add.s %[val2], %[val2], %[val3] \n\t" 654 "swc1 %[val3], 24*4(%[out]) \n\t" 655 "add.s %[val6], %[val7], %[val6] \n\t" 656 "mul.s %[val7], %[f1], %[fTmp2] \n\t" 657 "swc1 %[val2], 8*4(%[out]) \n\t" 658 "add.s %[val6], %[val6], %[val7] \n\t" 659 "swc1 %[val7], 28*4(%[out]) \n\t" 660 "add.s %[val4], %[val4], %[val6] \n\t" 661 "add.s %[val6], %[val6], %[val5] \n\t" 662 "add.s %[val5], %[val5], %[val7] \n\t" 663 "swc1 %[val4], 4*4(%[out]) \n\t" 664 "swc1 %[val5], 20*4(%[out]) \n\t" 665 "swc1 %[val6], 12*4(%[out]) \n\t" 666 667 
: [fTmp2] "=&f" (fTmp2), 668 [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 669 [val0] "+&f" (val0), [val1] "+&f" (val1), 670 [val2] "+&f" (val2), [val3] "+&f" (val3), 671 [val4] "+&f" (val4), [val5] "+&f" (val5), 672 [val6] "+&f" (val6), [val7] "+&f" (val7) 673 : [out] "r" (out), [f1]"f"(f1) 674 ); 675 676 __asm__ volatile ( 677 "sub.s %[fTmp2], %[val8], %[val9] \n\t" 678 "add.s %[val8], %[val8], %[val9] \n\t" 679 "sub.s %[fTmp3], %[val11], %[val10] \n\t" 680 "add.s %[val10], %[val11], %[val10] \n\t" 681 "sub.s %[fTmp4], %[val12], %[val13] \n\t" 682 "mul.s %[val9], %[f1], %[fTmp2] \n\t" 683 "add.s %[val12], %[val12], %[val13] \n\t" 684 "mul.s %[val11], %[f1], %[fTmp3] \n\t" 685 "sub.s %[fTmp2], %[val15], %[val14] \n\t" 686 "mul.s %[val13], %[f1], %[fTmp4] \n\t" 687 "add.s %[val14], %[val15], %[val14] \n\t" 688 "add.s %[val10], %[val10], %[val11] \n\t" 689 "mul.s %[val15], %[f1], %[fTmp2] \n\t" 690 "add.s %[val14], %[val14], %[val15] \n\t" 691 "add.s %[val12], %[val12], %[val14] \n\t" 692 "add.s %[val14], %[val14], %[val13] \n\t" 693 "add.s %[val13], %[val13], %[val15] \n\t" 694 "add.s %[val8], %[val8], %[val12] \n\t" 695 "add.s %[val12], %[val12], %[val10] \n\t" 696 "add.s %[val10], %[val10], %[val14] \n\t" 697 "add.s %[val14], %[val14], %[val9] \n\t" 698 "add.s %[val9], %[val9], %[val13] \n\t" 699 "add.s %[val13], %[val13], %[val11] \n\t" 700 "add.s %[val11], %[val11], %[val15] \n\t" 701 "swc1 %[val8], 2*4(%[out]) \n\t" 702 "swc1 %[val9], 18*4(%[out]) \n\t" 703 "swc1 %[val10], 10*4(%[out]) \n\t" 704 "swc1 %[val11], 26*4(%[out]) \n\t" 705 "swc1 %[val12], 6*4(%[out]) \n\t" 706 "swc1 %[val13], 22*4(%[out]) \n\t" 707 "swc1 %[val14], 14*4(%[out]) \n\t" 708 "swc1 %[val15], 30*4(%[out]) \n\t" 709 710 : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 711 [val8] "+&f" (val8), [val9] "+&f" (val9), [val10] "+&f" (val10), 712 [val11] "+&f" (val11), [val12] "+&f" (val12), [val13] "+&f" (val13), 713 [val14] "+&f" (val14), [val15] "+&f" (val15) 714 : [f1] 
"f" (f1), [out] "r" (out) 715 ); 716 717 __asm__ volatile ( 718 "sub.s %[fTmp2], %[val16], %[val17] \n\t" 719 "add.s %[val16], %[val16], %[val17] \n\t" 720 "sub.s %[fTmp3], %[val19], %[val18] \n\t" 721 "add.s %[val18], %[val19], %[val18] \n\t" 722 "sub.s %[fTmp4], %[val20], %[val21] \n\t" 723 "mul.s %[val17], %[f1], %[fTmp2] \n\t" 724 "add.s %[val20], %[val20], %[val21] \n\t" 725 "mul.s %[val19], %[f1], %[fTmp3] \n\t" 726 "sub.s %[fTmp2], %[val23], %[val22] \n\t" 727 "mul.s %[val21], %[f1], %[fTmp4] \n\t" 728 "add.s %[val22], %[val23], %[val22] \n\t" 729 "add.s %[val18], %[val18], %[val19] \n\t" 730 "mul.s %[val23], %[f1], %[fTmp2] \n\t" 731 "add.s %[val22], %[val22], %[val23] \n\t" 732 "add.s %[val20], %[val20], %[val22] \n\t" 733 "add.s %[val22], %[val22], %[val21] \n\t" 734 "add.s %[val21], %[val21], %[val23] \n\t" 735 736 : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4), 737 [val16] "+&f" (val16), [val17] "+&f" (val17), [val18] "+&f" (val18), 738 [val19] "+&f" (val19), [val20] "+&f" (val20), [val21] "+&f" (val21), 739 [val22] "+&f" (val22), [val23] "+&f" (val23) 740 : [f1] "f" (f1) 741 ); 742 743 __asm__ volatile ( 744 "sub.s %[fTmp2], %[val24], %[val25] \n\t" 745 "add.s %[val24], %[val24], %[val25] \n\t" 746 "sub.s %[fTmp3], %[val27], %[val26] \n\t" 747 "add.s %[val26], %[val27], %[val26] \n\t" 748 "sub.s %[fTmp4], %[val28], %[val29] \n\t" 749 "mul.s %[val25], %[f1], %[fTmp2] \n\t" 750 "add.s %[val28], %[val28], %[val29] \n\t" 751 "mul.s %[val27], %[f1], %[fTmp3] \n\t" 752 "sub.s %[fTmp2], %[val31], %[val30] \n\t" 753 "mul.s %[val29], %[f1], %[fTmp4] \n\t" 754 "add.s %[val30], %[val31], %[val30] \n\t" 755 "add.s %[val26], %[val26], %[val27] \n\t" 756 "mul.s %[val31], %[f1], %[fTmp2] \n\t" 757 "add.s %[val30], %[val30], %[val31] \n\t" 758 "add.s %[val28], %[val28], %[val30] \n\t" 759 "add.s %[val30], %[val30], %[val29] \n\t" 760 "add.s %[val29], %[val29], %[val31] \n\t" 761 "add.s %[val24], %[val24], %[val28] \n\t" 762 "add.s %[val28], 
%[val28], %[val26] \n\t"
        "add.s %[val26], %[val26], %[val30] \n\t"
        "add.s %[val30], %[val30], %[val25] \n\t"
        "add.s %[val25], %[val25], %[val29] \n\t"
        "add.s %[val29], %[val29], %[val27] \n\t"
        "add.s %[val27], %[val27], %[val31] \n\t"

        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
          [val24] "+&f" (val24), [val25] "+&f" (val25), [val26] "+&f" (val26),
          [val27] "+&f" (val27), [val28] "+&f" (val28), [val29] "+&f" (val29),
          [val30] "+&f" (val30), [val31] "+&f" (val31)
        : [f1] "f" (f1)
    );

    out[ 1] = val16 + val24;
    out[17] = val17 + val25;
    out[ 9] = val18 + val26;
    out[25] = val19 + val27;
    out[ 5] = val20 + val28;
    out[21] = val21 + val29;
    out[13] = val22 + val30;
    out[29] = val23 + val31;
    out[ 3] = val24 + val20;
    out[19] = val25 + val21;
    out[11] = val26 + val22;
    out[27] = val27 + val23;
    out[ 7] = val28 + val18;
    out[23] = val29 + val19;
    out[15] = val30 + val17;
    out[31] = val31;
}

/**
 * IMDCT-36 of a single block, followed by windowing and overlap with the
 * previous block, written as three hand-scheduled MIPS FPU asm sequences.
 * The file header names libavcodec/mpegaudiodsp_template.c as the C
 * reference for this code.
 *
 * @param out  output samples; 18 values are stored at a stride of 32 floats
 *             (out[0], out[32], ..., out[544])
 * @param buf  overlap buffer; 18 values at a stride of 4 floats
 *             (buf[0], buf[4], ..., buf[68]) are read, added to the
 *             windowed result, and then overwritten for the next call
 * @param in   18 input coefficients in[0..17]; modified in place by the
 *             first asm sequence
 * @param win  window coefficients; entries 0..17 and 20..37 are read
 */
static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
{
    float t0, t1, t2, t3, s0, s1, s2, s3;
    float tmp[18];  /* intermediate results, written by loop 3, read by loop 4 */
    /* temporary variables */
    float in1, in2, in3, in4, in5, in6;
    float out1, out2, out3, out4, out5;
    float f1, f2, f3, f4, f5, f6, f7, f8, f9;

    /**
     * All loops are fully unrolled and the instructions are scheduled to
     * minimize pipeline stalls. The instructions of the first two loops are
     * reorganized in order to eliminate unnecessary reads from and writes
     * to the array. Values defined in macros and tables in the reference
     * code are not looked up - they are loaded directly into the
     * f1..f9 variables.
     */

    /*
     * loop 1 and 2
     *
     * Net effect on in[], merged into a single pass:
     *   for (i = 17; i >= 1; i--)    in[i] += in[i - 1];
     *   for (i = 17; i >= 3; i -= 2) in[i] += in[i - 2];
     * in[0] is only read. All operands are early-clobber ("=&f") scratch
     * registers; the "memory" clobber covers the stores through %[in].
     */
    __asm__ volatile (
        "lwc1 %[in1], 17*4(%[in]) \t\n"
        "lwc1 %[in2], 16*4(%[in]) \t\n"
        "lwc1 %[in3], 15*4(%[in]) \t\n"
        "lwc1 %[in4], 14*4(%[in]) \t\n"
        "lwc1 %[in5], 13*4(%[in]) \t\n"
        "lwc1 %[in6], 12*4(%[in]) \t\n"
        "add.s %[out1], %[in1], %[in2] \t\n"
        "add.s %[out2], %[in2], %[in3] \t\n"
        "add.s %[out3], %[in3], %[in4] \t\n"
        "add.s %[out4], %[in4], %[in5] \t\n"
        "add.s %[out5], %[in5], %[in6] \t\n"
        "lwc1 %[in1], 11*4(%[in]) \t\n"
        "swc1 %[out2], 16*4(%[in]) \t\n"
        "add.s %[out1], %[out1], %[out3] \t\n"
        "swc1 %[out4], 14*4(%[in]) \t\n"
        "add.s %[out3], %[out3], %[out5] \t\n"
        "lwc1 %[in2], 10*4(%[in]) \t\n"
        "lwc1 %[in3], 9*4(%[in]) \t\n"
        "swc1 %[out1], 17*4(%[in]) \t\n"
        "lwc1 %[in4], 8*4(%[in]) \t\n"
        "swc1 %[out3], 15*4(%[in]) \t\n"
        "add.s %[out1], %[in6], %[in1] \t\n"
        "add.s %[out2], %[in1], %[in2] \t\n"
        "add.s %[out3], %[in2], %[in3] \t\n"
        "add.s %[out4], %[in3], %[in4] \t\n"
        "lwc1 %[in5], 7*4(%[in]) \t\n"
        "swc1 %[out1], 12*4(%[in]) \t\n"
        "add.s %[out5], %[out5], %[out2] \t\n"
        "swc1 %[out3], 10*4(%[in]) \t\n"
        "add.s %[out2], %[out2], %[out4] \t\n"
        "lwc1 %[in6], 6*4(%[in]) \t\n"
        "lwc1 %[in1], 5*4(%[in]) \t\n"
        "swc1 %[out5], 13*4(%[in]) \t\n"
        "lwc1 %[in2], 4*4(%[in]) \t\n"
        "swc1 %[out2], 11*4(%[in]) \t\n"
        "add.s %[out5], %[in4], %[in5] \t\n"
        "add.s %[out1], %[in5], %[in6] \t\n"
        "add.s %[out2], %[in6], %[in1] \t\n"
        "add.s %[out3], %[in1], %[in2] \t\n"
        "lwc1 %[in3], 3*4(%[in]) \t\n"
        "swc1 %[out5], 8*4(%[in]) \t\n"
        "add.s %[out4], %[out4], %[out1] \t\n"
        "swc1 %[out2], 6*4(%[in]) \t\n"
        "add.s %[out1], %[out1], %[out3] \t\n"
        "lwc1 %[in4], 2*4(%[in]) \t\n"
        "lwc1 %[in5], 1*4(%[in]) \t\n"
        "swc1 %[out4], 9*4(%[in]) \t\n"
        "lwc1 %[in6], 0(%[in]) \t\n"
        "swc1 %[out1], 7*4(%[in]) \t\n"
        "add.s %[out4], %[in2], %[in3] \t\n"
        "add.s %[out5], %[in3], %[in4] \t\n"
        "add.s %[out1], %[in4], %[in5] \t\n"
        "add.s %[out2], %[in5], %[in6] \t\n"
        "swc1 %[out4], 4*4(%[in]) \t\n"
        "add.s %[out3], %[out3], %[out5] \t\n"
        "swc1 %[out1], 2*4(%[in]) \t\n"
        "add.s %[out5], %[out5], %[out2] \t\n"
        "swc1 %[out2], 1*4(%[in]) \t\n"
        "swc1 %[out3], 5*4(%[in]) \t\n"
        "swc1 %[out5], 3*4(%[in]) \t\n"

        : [in1] "=&f" (in1), [in2] "=&f" (in2),
          [in3] "=&f" (in3), [in4] "=&f" (in4),
          [in5] "=&f" (in5), [in6] "=&f" (in6),
          [out1] "=&f" (out1), [out2] "=&f" (out2),
          [out3] "=&f" (out3), [out4] "=&f" (out4),
          [out5] "=&f" (out5)
        : [in] "r" (in)
        : "memory"
    );

    /*
     * loop 3
     *
     * Fills tmp[0..17] from in[0..17]. The sequence runs the same
     * computation twice: first on the even-indexed inputs (producing the
     * even-indexed tmp entries), then on the odd-indexed inputs (producing
     * the odd-indexed tmp entries).
     *
     * madd.s  fd, fr, fs, ft  computes  fr + fs * ft
     * nmsub.s fd, fr, fs, ft  computes  fr - fs * ft
     */
    f1 =  0.5;
    f2 =  0.93969262078590838405;   /*  cos(20 deg) */
    f3 = -0.76604444311897803520;   /* -cos(40 deg) */
    f4 = -0.17364817766693034885;   /* -cos(80 deg) */
    f5 = -0.86602540378443864676;   /* -cos(30 deg) */
    f6 =  0.98480775301220805936;   /*  cos(10 deg) */
    f7 = -0.34202014332566873304;   /* -cos(70 deg) */
    f8 =  0.86602540378443864676;   /*  cos(30 deg) */
    f9 = -0.64278760968653932632;   /* -cos(50 deg) */
    __asm__ volatile (
        "lwc1 %[in1], 8*4(%[in]) \t\n"
        "lwc1 %[in2], 16*4(%[in]) \t\n"
        "lwc1 %[in3], 4*4(%[in]) \t\n"
        "lwc1 %[in4], 0(%[in]) \t\n"
        "lwc1 %[in5], 12*4(%[in]) \t\n"
        "add.s %[t2], %[in1], %[in2] \t\n"
        "add.s %[t0], %[in1], %[in3] \t\n"
        "madd.s %[t3], %[in4], %[in5], %[f1] \t\n"
        "sub.s %[t1], %[in4], %[in5] \t\n"
        "sub.s %[t2], %[t2], %[in3] \t\n"
        "mul.s %[t0], %[t0], %[f2] \t\n"
        "nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n"
        "add.s %[out2], %[t1], %[t2] \t\n"
        "add.s %[t2], %[in2], %[in3] \t\n"
        "sub.s %[t1], %[in1], %[in2] \t\n"
        "sub.s %[out3], %[t3], %[t0] \t\n"
        "swc1 %[out1], 6*4(%[tmp]) \t\n"
        "swc1 %[out2], 16*4(%[tmp]) \t\n"
        "mul.s %[t2], %[t2], %[f3] \t\n"
        "mul.s %[t1], %[t1], %[f4] \t\n"
        "add.s %[out1], %[t3], %[t0] \t\n"
        "lwc1 %[in1], 10*4(%[in]) \t\n"
        "lwc1 %[in2], 14*4(%[in]) \t\n"
        "sub.s %[out3], %[out3], %[t2] \t\n"
        "add.s %[out2], %[t3], %[t2] \t\n"
        "add.s %[out1], %[out1], %[t1] \t\n"
        "lwc1 %[in3], 2*4(%[in]) \t\n"
        "lwc1 %[in4], 6*4(%[in]) \t\n"
        "swc1 %[out3], 10*4(%[tmp]) \t\n"
        "sub.s %[out2], %[out2], %[t1] \t\n"
        "swc1 %[out1], 2*4(%[tmp]) \t\n"
        "add.s %[out1], %[in1], %[in2] \t\n"
        "add.s %[t2], %[in1], %[in3] \t\n"
        "sub.s %[t3], %[in1], %[in2] \t\n"
        "swc1 %[out2], 14*4(%[tmp]) \t\n"
        "sub.s %[out1], %[out1], %[in3] \t\n"
        "mul.s %[t2], %[t2], %[f6] \t\n"
        "mul.s %[t3], %[t3], %[f7] \t\n"
        "mul.s %[t0], %[in4], %[f8] \t\n"
        "mul.s %[out1], %[out1], %[f5] \t\n"
        "add.s %[t1], %[in2], %[in3] \t\n"
        "add.s %[out2], %[t2], %[t3] \t\n"
        "lwc1 %[in1], 9*4(%[in]) \t\n"
        "swc1 %[out1], 4*4(%[tmp]) \t\n"
        "mul.s %[t1], %[t1], %[f9] \t\n"
        "lwc1 %[in2], 17*4(%[in]) \t\n"
        "add.s %[out2], %[out2], %[t0] \t\n"
        "lwc1 %[in3], 5*4(%[in]) \t\n"
        "lwc1 %[in4], 1*4(%[in]) \t\n"
        "add.s %[out3], %[t2], %[t1] \t\n"
        "sub.s %[out1], %[t3], %[t1] \t\n"
        "swc1 %[out2], 0(%[tmp]) \t\n"
        "lwc1 %[in5], 13*4(%[in]) \t\n"
        "add.s %[t2], %[in1], %[in2] \t\n"
        "sub.s %[out3], %[out3], %[t0] \t\n"
        "sub.s %[out1], %[out1], %[t0] \t\n"
        "add.s %[t0], %[in1], %[in3] \t\n"
        "madd.s %[t3], %[in4], %[in5], %[f1] \t\n"
        "sub.s %[t2], %[t2], %[in3] \t\n"
        "swc1 %[out3], 12*4(%[tmp]) \t\n"
        "swc1 %[out1], 8*4(%[tmp]) \t\n"
        "sub.s %[t1], %[in4], %[in5] \t\n"
        "mul.s %[t0], %[t0], %[f2] \t\n"
        "nmsub.s %[out1], %[t1], %[t2], %[f1] \t\n"
        "add.s %[out2], %[t1], %[t2] \t\n"
        "add.s %[t2], %[in2], %[in3] \t\n"
        "sub.s %[t1], %[in1], %[in2] \t\n"
        "sub.s %[out3], %[t3], %[t0] \t\n"
        "swc1 %[out1], 7*4(%[tmp]) \t\n"
        "swc1 %[out2], 17*4(%[tmp]) \t\n"
        "mul.s %[t2], %[t2], %[f3] \t\n"
        "mul.s %[t1], %[t1], %[f4] \t\n"
        "add.s %[out1], %[t3], %[t0] \t\n"
        "lwc1 %[in1], 11*4(%[in]) \t\n"
        "lwc1 %[in2], 15*4(%[in]) \t\n"
        "sub.s %[out3], %[out3], %[t2] \t\n"
        "add.s %[out2], %[t3], %[t2] \t\n"
        "add.s %[out1], %[out1], %[t1] \t\n"
        "lwc1 %[in3], 3*4(%[in]) \t\n"
        "lwc1 %[in4], 7*4(%[in]) \t\n"
        "swc1 %[out3], 11*4(%[tmp]) \t\n"
        "sub.s %[out2], %[out2], %[t1] \t\n"
        "swc1 %[out1], 3*4(%[tmp]) \t\n"
        "add.s %[out3], %[in1], %[in2] \t\n"
        "add.s %[t2], %[in1], %[in3] \t\n"
        "sub.s %[t3], %[in1], %[in2] \t\n"
        "swc1 %[out2], 15*4(%[tmp]) \t\n"
        "mul.s %[t0], %[in4], %[f8] \t\n"
        "sub.s %[out3], %[out3], %[in3] \t\n"
        "mul.s %[t2], %[t2], %[f6] \t\n"
        "mul.s %[t3], %[t3], %[f7] \t\n"
        "add.s %[t1], %[in2], %[in3] \t\n"
        "mul.s %[out3], %[out3], %[f5] \t\n"
        "add.s %[out1], %[t2], %[t3] \t\n"
        "mul.s %[t1], %[t1], %[f9] \t\n"
        "swc1 %[out3], 5*4(%[tmp]) \t\n"
        "add.s %[out1], %[out1], %[t0] \t\n"
        "add.s %[out2], %[t2], %[t1] \t\n"
        "sub.s %[out3], %[t3], %[t1] \t\n"
        "swc1 %[out1], 1*4(%[tmp]) \t\n"
        "sub.s %[out2], %[out2], %[t0] \t\n"
        "sub.s %[out3], %[out3], %[t0] \t\n"
        "swc1 %[out2], 13*4(%[tmp]) \t\n"
        "swc1 %[out3], 9*4(%[tmp]) \t\n"

        : [t0] "=&f" (t0), [t1] "=&f" (t1),
          [t2] "=&f" (t2), [t3] "=&f" (t3),
          [in1] "=&f" (in1), [in2] "=&f" (in2),
          [in3] "=&f" (in3), [in4] "=&f" (in4),
          [in5] "=&f" (in5), [out1] "=&f" (out1),
          [out2] "=&f" (out2), [out3] "=&f" (out3)
        : [in] "r" (in), [tmp] "r" (tmp), [f1]"f"(f1), [f2]"f"(f2),
          [f3]"f"(f3), [f4]"f"(f4), [f5]"f"(f5), [f6]"f"(f6),
          [f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9)
        : "memory"
    );

    /*
     * loop 4
     *
     * Combines tmp[] with the window and the overlap buffer. For every
     * output index k in 0..17 the sequence performs
     *   out[32 * k] = buf[4 * k] + win[k] * t1;
     *   buf[4 * k]  = win[20 + k] * t0;
     * where t0/t1 are the sum/difference of two scaled tmp[] butterflies.
     * Indices are handled in mirrored pairs (9+j, 8-j) and (17-j, j) for
     * j = 0..3, followed by the final pair (13, 4) built from tmp[16] and
     * tmp[17]. Loads for the next pair are interleaved with the stores of
     * the current one.
     */
    f1 = 0.50190991877167369479;    /* 0.5 / cos( 5 deg) */
    f2 = 5.73685662283492756461;    /* 0.5 / cos(85 deg) */
    f3 = 0.51763809020504152469;    /* 0.5 / cos(15 deg) */
    f4 = 1.93185165257813657349;    /* 0.5 / cos(75 deg) */
    f5 = 0.55168895948124587824;    /* 0.5 / cos(25 deg) */
    f6 = 1.18310079157624925896;    /* 0.5 / cos(65 deg) */
    f7 = 0.61038729438072803416;    /* 0.5 / cos(35 deg) */
    f8 = 0.87172339781054900991;    /* 0.5 / cos(55 deg) */
    f9 = 0.70710678118654752439;    /* 0.5 / cos(45 deg) */
    __asm__ volatile (
        "lwc1 %[in1], 2*4(%[tmp]) \t\n"
        "lwc1 %[in2], 0(%[tmp]) \t\n"
        "lwc1 %[in3], 3*4(%[tmp]) \t\n"
        "lwc1 %[in4], 1*4(%[tmp]) \t\n"
        "add.s %[s0], %[in1], %[in2] \t\n"
        "sub.s %[s2], %[in1], %[in2] \t\n"
        "add.s %[s1], %[in3], %[in4] \t\n"
        "sub.s %[s3], %[in3], %[in4] \t\n"
        "lwc1 %[in1], 9*4(%[win]) \t\n"
        "lwc1 %[in2], 4*9*4(%[buf]) \t\n"
        "lwc1 %[in3], 8*4(%[win]) \t\n"
        "mul.s %[s1], %[s1], %[f1] \t\n"
        "mul.s %[s3], %[s3], %[f2] \t\n"
        "lwc1 %[in4], 4*8*4(%[buf]) \t\n"
        "lwc1 %[in5], 29*4(%[win]) \t\n"
        "lwc1 %[in6], 28*4(%[win]) \t\n"
        "add.s %[t0], %[s0], %[s1] \t\n"
        "sub.s %[t1], %[s0], %[s1] \t\n"
        "mul.s %[out3], %[in5], %[t0] \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out4], %[in6], %[t0] \t\n"
        "add.s %[t0], %[s2], %[s3] \t\n"
        "swc1 %[out3], 4*9*4(%[buf]) \t\n"
        "swc1 %[out1], 288*4(%[out]) \t\n"
        "swc1 %[out2], 256*4(%[out]) \t\n"
        "swc1 %[out4], 4*8*4(%[buf]) \t\n"
        "sub.s %[t1], %[s2], %[s3] \t\n"
        "lwc1 %[in1], 17*4(%[win]) \t\n"
        "lwc1 %[in2], 4*17*4(%[buf]) \t\n"
        "lwc1 %[in3], 0(%[win]) \t\n"
        "lwc1 %[in4], 0(%[buf]) \t\n"
        "lwc1 %[in5], 37*4(%[win]) \t\n"
        "lwc1 %[in6], 20*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "lwc1 %[in1], 6*4(%[tmp]) \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "swc1 %[out1], 544*4(%[out]) \t\n"
        "lwc1 %[in2], 4*4(%[tmp]) \t\n"
        "swc1 %[out2], 0(%[out]) \t\n"
        "swc1 %[out3], 4*17*4(%[buf]) \t\n"
        "swc1 %[out4], 0(%[buf]) \t\n"
        "lwc1 %[in3], 7*4(%[tmp]) \t\n"
        "add.s %[s0], %[in1], %[in2] \t\n"
        "sub.s %[s2], %[in1], %[in2] \t\n"
        "lwc1 %[in4], 5*4(%[tmp]) \t\n"
        "add.s %[s1], %[in3], %[in4] \t\n"
        "sub.s %[s3], %[in3], %[in4] \t\n"
        "lwc1 %[in1], 10*4(%[win]) \t\n"
        "lwc1 %[in2], 4*10*4(%[buf]) \t\n"
        "lwc1 %[in3], 7*4(%[win]) \t\n"
        "mul.s %[s1], %[s1], %[f3] \t\n"
        "mul.s %[s3], %[s3], %[f4] \t\n"
        "add.s %[t0], %[s0], %[s1] \t\n"
        "sub.s %[t1], %[s0], %[s1] \t\n"
        "lwc1 %[in4], 4*7*4(%[buf]) \t\n"
        "lwc1 %[in5], 30*4(%[win]) \t\n"
        "lwc1 %[in6], 27*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "add.s %[t0], %[s2], %[s3] \t\n"
        "swc1 %[out1], 320*4(%[out]) \t\n"
        "swc1 %[out2], 224*4(%[out]) \t\n"
        "swc1 %[out3], 4*10*4(%[buf]) \t\n"
        "swc1 %[out4], 4*7*4(%[buf]) \t\n"
        "sub.s %[t1], %[s2], %[s3] \t\n"
        "lwc1 %[in1], 16*4(%[win]) \t\n"
        "lwc1 %[in2], 4*16*4(%[buf]) \t\n"
        "lwc1 %[in3], 1*4(%[win]) \t\n"
        "lwc1 %[in4], 4*1*4(%[buf]) \t\n"
        "lwc1 %[in5], 36*4(%[win]) \t\n"
        "lwc1 %[in6], 21*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "lwc1 %[in1], 10*4(%[tmp]) \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[in5], %[t0] \t\n"
        "mul.s %[out4], %[in6], %[t0] \t\n"
        "swc1 %[out1], 512*4(%[out]) \t\n"
        "lwc1 %[in2], 8*4(%[tmp]) \t\n"
        "swc1 %[out2], 32*4(%[out]) \t\n"
        "swc1 %[out3], 4*16*4(%[buf]) \t\n"
        "swc1 %[out4], 4*1*4(%[buf]) \t\n"
        "add.s %[s0], %[in1], %[in2] \t\n"
        "sub.s %[s2], %[in1], %[in2] \t\n"
        "lwc1 %[in3], 11*4(%[tmp]) \t\n"
        "lwc1 %[in4], 9*4(%[tmp]) \t\n"
        "add.s %[s1], %[in3], %[in4] \t\n"
        "sub.s %[s3], %[in3], %[in4] \t\n"
        "lwc1 %[in1], 11*4(%[win]) \t\n"
        "lwc1 %[in2], 4*11*4(%[buf]) \t\n"
        "lwc1 %[in3], 6*4(%[win]) \t\n"
        "mul.s %[s1], %[s1], %[f5] \t\n"
        "mul.s %[s3], %[s3], %[f6] \t\n"
        "lwc1 %[in4], 4*6*4(%[buf]) \t\n"
        "lwc1 %[in5], 31*4(%[win]) \t\n"
        "lwc1 %[in6], 26*4(%[win]) \t\n"
        "add.s %[t0], %[s0], %[s1] \t\n"
        "sub.s %[t1], %[s0], %[s1] \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "add.s %[t0], %[s2], %[s3] \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "swc1 %[out3], 4*11*4(%[buf]) \t\n"
        "swc1 %[out4], 4*6*4(%[buf]) \t\n"
        "sub.s %[t1], %[s2], %[s3] \t\n"
        "swc1 %[out1], 352*4(%[out]) \t\n"
        "swc1 %[out2], 192*4(%[out]) \t\n"
        "lwc1 %[in1], 15*4(%[win]) \t\n"
        "lwc1 %[in2], 4*15*4(%[buf]) \t\n"
        "lwc1 %[in3], 2*4(%[win]) \t\n"
        "lwc1 %[in4], 4*2*4(%[buf]) \t\n"
        "lwc1 %[in5], 35*4(%[win]) \t\n"
        "lwc1 %[in6], 22*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "lwc1 %[in1], 14*4(%[tmp]) \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "swc1 %[out1], 480*4(%[out]) \t\n"
        "lwc1 %[in2], 12*4(%[tmp]) \t\n"
        "swc1 %[out2], 64*4(%[out]) \t\n"
        "swc1 %[out3], 4*15*4(%[buf]) \t\n"
        "swc1 %[out4], 4*2*4(%[buf]) \t\n"
        "lwc1 %[in3], 15*4(%[tmp]) \t\n"
        "add.s %[s0], %[in1], %[in2] \t\n"
        "sub.s %[s2], %[in1], %[in2] \t\n"
        "lwc1 %[in4], 13*4(%[tmp]) \t\n"
        "add.s %[s1], %[in3], %[in4] \t\n"
        "sub.s %[s3], %[in3], %[in4] \t\n"
        "lwc1 %[in1], 12*4(%[win]) \t\n"
        "lwc1 %[in2], 4*12*4(%[buf]) \t\n"
        "lwc1 %[in3], 5*4(%[win]) \t\n"
        "mul.s %[s1], %[s1], %[f7] \t\n"
        "mul.s %[s3], %[s3], %[f8] \t\n"
        "lwc1 %[in4], 4*5*4(%[buf]) \t\n"
        "lwc1 %[in5], 32*4(%[win]) \t\n"
        "lwc1 %[in6], 25*4(%[win]) \t\n"
        "add.s %[t0], %[s0], %[s1] \t\n"
        "sub.s %[t1], %[s0], %[s1] \t\n"
        "lwc1 %[s0], 16*4(%[tmp]) \t\n"
        "lwc1 %[s1], 17*4(%[tmp]) \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "add.s %[t0], %[s2], %[s3] \t\n"
        "swc1 %[out3], 4*12*4(%[buf]) \t\n"
        "swc1 %[out1], 384*4(%[out]) \t\n"
        "swc1 %[out2], 160*4(%[out]) \t\n"
        "swc1 %[out4], 4*5*4(%[buf]) \t\n"
        "sub.s %[t1], %[s2], %[s3] \t\n"
        "lwc1 %[in1], 14*4(%[win]) \t\n"
        "lwc1 %[in2], 4*14*4(%[buf]) \t\n"
        "lwc1 %[in3], 3*4(%[win]) \t\n"
        "lwc1 %[in4], 4*3*4(%[buf]) \t\n"
        "lwc1 %[in5], 34*4(%[win]) \t\n"
        "lwc1 %[in6], 23*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "mul.s %[s1], %[s1], %[f9] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[in5], %[t0] \t\n"
        "mul.s %[out4], %[in6], %[t0] \t\n"
        "swc1 %[out1], 448*4(%[out]) \t\n"
        "add.s %[t0], %[s0], %[s1] \t\n"
        "swc1 %[out2], 96*4(%[out]) \t\n"
        "swc1 %[out3], 4*14*4(%[buf]) \t\n"
        "swc1 %[out4], 4*3*4(%[buf]) \t\n"
        "sub.s %[t1], %[s0], %[s1] \t\n"
        "lwc1 %[in1], 13*4(%[win]) \t\n"
        "lwc1 %[in2], 4*13*4(%[buf]) \t\n"
        "lwc1 %[in3], 4*4(%[win]) \t\n"
        "lwc1 %[in4], 4*4*4(%[buf]) \t\n"
        "lwc1 %[in5], 33*4(%[win]) \t\n"
        "lwc1 %[in6], 24*4(%[win]) \t\n"
        "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
        "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
        "mul.s %[out3], %[t0], %[in5] \t\n"
        "mul.s %[out4], %[t0], %[in6] \t\n"
        "swc1 %[out1], 416*4(%[out]) \t\n"
        "swc1 %[out2], 128*4(%[out]) \t\n"
        "swc1 %[out3], 4*13*4(%[buf]) \t\n"
        "swc1 %[out4], 4*4*4(%[buf]) \t\n"

        : [in1] "=&f" (in1), [in2] "=&f" (in2),
          [in3] "=&f" (in3), [in4] "=&f" (in4),
          [in5] "=&f" (in5), [in6] "=&f" (in6),
          [out1] "=&f" (out1), [out2] "=&f" (out2),
          [out3] "=&f" (out3), [out4] "=&f" (out4),
          [t0] "=&f" (t0), [t1] "=&f" (t1),
          [s0] "=&f" (s0), [s1] "=&f" (s1),
          [s2] "=&f" (s2), [s3] "=&f" (s3)
        : [tmp] "r" (tmp), [win] "r" (win),
          [buf] "r" (buf), [out] "r" (out),
          [f1]"f"(f1), [f2]"f"(f2), [f3]"f"(f3), [f4]"f"(f4),
          [f5]"f"(f5), [f6]"f"(f6), [f7]"f"(f7), [f8]"f"(f8), [f9]"f"(f9)
        : "memory"
    );
}

static void
ff_imdct36_blocks_mips_float(float *out, float *buf, float *in, 1228 int count, int switch_point, int block_type) 1229{ 1230 int j; 1231 for (j=0 ; j < count; j++) { 1232 /* apply window & overlap with previous buffer */ 1233 1234 /* select window */ 1235 int win_idx = (switch_point && j < 2) ? 0 : block_type; 1236 float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; 1237 1238 imdct36_mips_float(out, buf, in, win); 1239 1240 in += 18; 1241 buf += ((j&3) != 3 ? 1 : (72-3)); 1242 out++; 1243 } 1244} 1245 1246#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ 1247#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */ 1248 1249void ff_mpadsp_init_mipsfpu(MPADSPContext *s) 1250{ 1251#if HAVE_INLINE_ASM && HAVE_MIPSFPU 1252#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 1253 s->apply_window_float = ff_mpadsp_apply_window_mips_float; 1254 s->imdct36_blocks_float = ff_imdct36_blocks_mips_float; 1255 s->dct32_float = ff_dct32_mips_float; 1256#endif 1257#endif 1258} 1259