1/* 2 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized 3 * 4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23#include "libavutil/attributes.h" 24#include "libavutil/avassert.h" 25#include "libavutil/mem_internal.h" 26 27#include "libavcodec/vc1dsp.h" 28#include "constants.h" 29#include "vc1dsp_mips.h" 30#include "hpeldsp_mips.h" 31#include "libavutil/mips/mmiutils.h" 32 33#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \ 34 "li %[tmp0], "#r1" \n\t" \ 35 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 36 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 37 "li %[tmp0], "#r2" \n\t" \ 38 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 39 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 40 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 41 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 42 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 43 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 44 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 45 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 46 \ 47 "li %[tmp0], "#r3" \n\t" \ 48 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 49 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 50 "li %[tmp0], "#r4" \n\t" \ 51 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 52 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 53 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 54 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 55 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 56 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 57 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 58 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 59 \ 60 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 61 "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \ 62 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 63 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 64 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 65 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 66 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 67 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 68 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 69 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 70 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 71 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 72 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 73 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 74 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 75 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t" 76 77#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \ 78 "li %[tmp0], "#r1" \n\t" \ 79 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 80 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 81 "li %[tmp0], "#r2" \n\t" \ 82 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 83 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 84 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 85 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 86 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 87 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 88 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 89 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 90 \ 91 "li %[tmp0], "#r3" \n\t" \ 92 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 93 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 94 "li %[tmp0], "#r4" \n\t" \ 95 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 96 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 97 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 98 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 99 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 100 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 101 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 102 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 103 \ 104 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 105 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 106 "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \ 107 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 108 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 109 "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \ 110 "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \ 111 "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \ 112 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 113 "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \ 114 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 115 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 116 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 117 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 118 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 119 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 120 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 121 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 122 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 123 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t" 124 125/* Do inverse transform on 8x8 block */ 126void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 127{ 128 int dc = block[0]; 129 double ftmp[9]; 130 mips_reg addr[1]; 131 int count; 132 union mmi_intfloat64 dc_u; 133 134 dc = (3 * dc + 1) >> 1; 135 dc = (3 * dc + 16) >> 5; 136 dc_u.i = dc; 137 138 __asm__ volatile( 139 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 140 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 141 "li %[count], 0x02 \n\t" 142 143 "1: \n\t" 144 MMI_LDC1(%[ftmp1], %[dest], 0x00) 145 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 146 MMI_LDC1(%[ftmp2], %[addr0], 0x00) 147 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 148 MMI_LDC1(%[ftmp3], %[addr0], 0x00) 149 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 150 MMI_LDC1(%[ftmp4], %[addr0], 0x00) 151 152 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 153 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 154 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 155 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 156 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 157 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 158 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 159 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 160 161 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 162 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 163 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 164 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 165 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 166 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 167 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 168 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 169 170 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 171 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 172 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 173 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 174 175 MMI_SDC1(%[ftmp1], %[dest], 0x00) 176 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 177 MMI_SDC1(%[ftmp2], %[addr0], 0x00) 178 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 179 MMI_SDC1(%[ftmp3], %[addr0], 0x00) 180 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 181 MMI_SDC1(%[ftmp4], %[addr0], 0x00) 182 183 "addiu %[count], %[count], -0x01 \n\t" 184 PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t" 185 "bnez %[count], 1b \n\t" 186 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 187 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 188 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 189 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 190 [ftmp8]"=&f"(ftmp[8]), 191 [addr0]"=&r"(addr[0]), 192 [count]"=&r"(count), [dest]"+&r"(dest) 193 : [linesize]"r"((mips_reg)linesize), 194 [dc]"f"(dc_u.f) 195 : "memory" 196 ); 197} 198 199#if _MIPS_SIM != _ABIO32 200void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]) 201{ 202 DECLARE_ALIGNED(16, int16_t, temp[64]); 203 double ftmp[23]; 204 uint64_t tmp[1]; 205 206 __asm__ volatile ( 207 /* 1st loop: start */ 208 "li %[tmp0], 0x03 \n\t" 209 "mtc1 %[tmp0], %[ftmp0] \n\t" 210 211 // 1st part 212 MMI_LDC1(%[ftmp1], %[block], 0x00) 213 MMI_LDC1(%[ftmp11], %[block], 0x10) 214 MMI_LDC1(%[ftmp2], %[block], 0x20) 215 MMI_LDC1(%[ftmp12], %[block], 0x30) 216 MMI_LDC1(%[ftmp3], %[block], 0x40) 217 MMI_LDC1(%[ftmp13], %[block], 0x50) 218 MMI_LDC1(%[ftmp4], %[block], 0x60) 219 MMI_LDC1(%[ftmp14], %[block], 0x70) 220 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 221 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 222 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 223 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 224 225 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 226 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 227 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 228 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 229 230 /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */ 231 VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c, 232 0x000f0010, 0x00040009, %[ff_pw_4]) 233 234 /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */ 235 VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4, 236 0xfffc000f, 0xfff7fff0, %[ff_pw_4]) 237 238 /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */ 239 VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4, 240 0xfff00009, 0x000f0004, %[ff_pw_4]) 241 242 /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */ 243 VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c, 244 0xfff70004, 0xfff0000f, %[ff_pw_4]) 245 246 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], 247 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 248 249 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], 250 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 251 252 MMI_SDC1(%[ftmp15], %[temp], 0x00) 253 MMI_SDC1(%[ftmp19], %[temp], 0x08) 254 MMI_SDC1(%[ftmp16], %[temp], 0x10) 255 MMI_SDC1(%[ftmp20], %[temp], 0x18) 256 MMI_SDC1(%[ftmp17], %[temp], 0x20) 257 MMI_SDC1(%[ftmp21], %[temp], 0x28) 258 MMI_SDC1(%[ftmp18], %[temp], 0x30) 259 MMI_SDC1(%[ftmp22], %[temp], 0x38) 260 261 // 2nd part 262 MMI_LDC1(%[ftmp1], %[block], 0x08) 263 MMI_LDC1(%[ftmp11], %[block], 0x18) 264 MMI_LDC1(%[ftmp2], %[block], 0x28) 265 MMI_LDC1(%[ftmp12], %[block], 0x38) 266 MMI_LDC1(%[ftmp3], %[block], 0x48) 267 MMI_LDC1(%[ftmp13], %[block], 0x58) 268 MMI_LDC1(%[ftmp4], %[block], 0x68) 269 MMI_LDC1(%[ftmp14], %[block], 0x78) 270 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 271 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 272 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 273 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 274 275 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 276 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 277 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 278 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 279 280 /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */ 281 VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c, 282 0x000f0010, 0x00040009, %[ff_pw_4]) 283 284 /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */ 285 VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4, 286 0xfffc000f, 0xfff7fff0, %[ff_pw_4]) 287 288 /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */ 289 VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4, 290 0xfff00009, 0x000f0004, %[ff_pw_4]) 291 292 /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */ 293 VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c, 294 0xfff70004, 0xfff0000f, %[ff_pw_4]) 295 296 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18], 297 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 298 299 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22], 300 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4]) 301 302 MMI_SDC1(%[ftmp19], %[temp], 0x48) 303 MMI_SDC1(%[ftmp20], %[temp], 0x58) 304 MMI_SDC1(%[ftmp21], %[temp], 0x68) 305 MMI_SDC1(%[ftmp22], %[temp], 0x78) 306 /* 1st loop: end */ 307 308 /* 2nd loop: start */ 309 "li %[tmp0], 0x07 \n\t" 310 "mtc1 %[tmp0], %[ftmp0] \n\t" 311 312 // 1st part 313 MMI_LDC1(%[ftmp1], %[temp], 0x00) 314 MMI_LDC1(%[ftmp11], %[temp], 0x10) 315 MMI_LDC1(%[ftmp2], %[temp], 0x20) 316 MMI_LDC1(%[ftmp12], %[temp], 0x30) 317 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 318 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 319 "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t" 320 "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t" 321 322 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 323 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 324 "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t" 325 "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t" 326 327 /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */ 328 VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c, 329 0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1]) 330 331 /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */ 332 VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4, 333 0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1]) 334 335 /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */ 336 VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4, 337 0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1]) 338 339 /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */ 340 VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c, 341 0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1]) 342 343 MMI_SDC1(%[ftmp15], %[block], 0x00) 344 MMI_SDC1(%[ftmp16], %[block], 0x10) 345 MMI_SDC1(%[ftmp17], %[block], 0x20) 346 MMI_SDC1(%[ftmp18], %[block], 0x30) 347 MMI_SDC1(%[ftmp19], %[block], 0x40) 348 MMI_SDC1(%[ftmp20], %[block], 0x50) 349 MMI_SDC1(%[ftmp21], %[block], 0x60) 350 MMI_SDC1(%[ftmp22], %[block], 0x70) 351 352 // 2nd part 353 MMI_LDC1(%[ftmp1], %[temp], 0x08) 354 MMI_LDC1(%[ftmp11], %[temp], 0x18) 355 MMI_LDC1(%[ftmp2], %[temp], 0x28) 356 MMI_LDC1(%[ftmp12], %[temp], 0x38) 357 MMI_LDC1(%[ftmp3], %[temp], 0x48) 358 MMI_LDC1(%[ftmp13], %[temp], 0x58) 359 MMI_LDC1(%[ftmp4], %[temp], 0x68) 360 MMI_LDC1(%[ftmp14], %[temp], 0x78) 361 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 362 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 363 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 364 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 365 366 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 367 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 368 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 369 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 370 371 /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */ 372 VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c, 373 0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1]) 374 375 /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */ 376 VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4, 377 0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1]) 378 379 /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */ 380 VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4, 381 0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1]) 382 383 /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */ 384 VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c, 385 0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1]) 386 387 MMI_SDC1(%[ftmp15], %[block], 0x08) 388 MMI_SDC1(%[ftmp16], %[block], 0x18) 389 MMI_SDC1(%[ftmp17], %[block], 0x28) 390 MMI_SDC1(%[ftmp18], %[block], 0x38) 391 MMI_SDC1(%[ftmp19], %[block], 0x48) 392 MMI_SDC1(%[ftmp20], %[block], 0x58) 393 MMI_SDC1(%[ftmp21], %[block], 0x68) 394 MMI_SDC1(%[ftmp22], %[block], 0x78) 395 /* 2nd loop: end */ 396 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 397 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 398 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 399 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 400 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 401 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 402 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), 403 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), 404 [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]), 405 [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]), 406 [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]), 407 [ftmp22]"=&f"(ftmp[22]), 408 [tmp0]"=&r"(tmp[0]) 409 : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f), 410 [ff_pw_4]"f"(ff_pw_32_4.f), [block]"r"(block), 411 [temp]"r"(temp) 412 : "memory" 413 ); 414} 415#endif 416 417/* Do inverse transform on 8x4 part of block */ 418void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 419{ 420 int dc = block[0]; 421 double ftmp[9]; 422 union mmi_intfloat64 dc_u; 423 424 dc = ( 3 * dc + 1) >> 1; 425 dc = (17 * dc + 64) >> 7; 426 dc_u.i = dc; 427 428 __asm__ volatile( 429 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 430 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 431 432 MMI_LDC1(%[ftmp1], %[dest0], 0x00) 433 MMI_LDC1(%[ftmp2], %[dest1], 0x00) 434 MMI_LDC1(%[ftmp3], %[dest2], 0x00) 435 MMI_LDC1(%[ftmp4], %[dest3], 0x00) 436 437 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 438 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 439 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 440 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 441 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 442 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 443 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 444 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 445 446 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 447 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 448 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 449 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 450 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 451 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 452 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 453 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 454 455 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 456 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 457 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 458 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 459 460 MMI_SDC1(%[ftmp1], %[dest0], 0x00) 461 MMI_SDC1(%[ftmp2], %[dest1], 0x00) 462 MMI_SDC1(%[ftmp3], %[dest2], 0x00) 463 MMI_SDC1(%[ftmp4], %[dest3], 0x00) 464 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 465 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 466 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 467 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 468 [ftmp8]"=&f"(ftmp[8]) 469 : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize), 470 [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize), 471 [dc]"f"(dc_u.f) 472 : "memory" 473 ); 474} 475 476#if _MIPS_SIM != _ABIO32 477void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 478{ 479 int16_t *src = block; 480 int16_t *dst = block; 481 double ftmp[16]; 482 uint32_t tmp[1]; 483 int16_t count = 4; 484 int16_t coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4, 485 12, 15, 6, -4, -12, -16, -16, -9, 486 12, 9, -6, -16, -12, 4, 16, 15, 487 12, 4, -16, -9, 12, 15, -6, -16, 488 12, -4, -16, 9, 12, -15, -6, 16, 489 12, -9, -6, 16, -12, -4, 16, -15, 490 12, -15, 6, 4, -12, 16, -16, 9, 491 12, -16, 16, -15, 12, -9, 6, -4}; 492 493 // 1st loop 494 __asm__ volatile ( 495 "li %[tmp0], 0x03 \n\t" 496 "mtc1 %[tmp0], %[ftmp0] \n\t" 497 498 "1: \n\t" 499 MMI_LDC1(%[ftmp1], %[src], 0x00) 500 MMI_LDC1(%[ftmp2], %[src], 0x08) 501 502 /* ftmp11: dst1,dst0 */ 503 MMI_LDC1(%[ftmp3], %[coeff], 0x00) 504 MMI_LDC1(%[ftmp4], %[coeff], 0x08) 505 MMI_LDC1(%[ftmp5], %[coeff], 0x10) 506 MMI_LDC1(%[ftmp6], %[coeff], 0x18) 507 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 508 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 509 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 510 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 511 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 512 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 513 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 514 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 515 "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t" 516 "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t" 517 518 /* ftmp12: dst3,dst2 */ 519 MMI_LDC1(%[ftmp3], %[coeff], 0x20) 520 MMI_LDC1(%[ftmp4], %[coeff], 0x28) 521 MMI_LDC1(%[ftmp5], %[coeff], 0x30) 522 MMI_LDC1(%[ftmp6], %[coeff], 0x38) 523 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 524 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 525 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 526 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 527 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 528 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 529 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 530 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 531 "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t" 532 "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t" 533 534 /* ftmp13: dst5,dst4 */ 535 MMI_LDC1(%[ftmp3], %[coeff], 0x40) 536 MMI_LDC1(%[ftmp4], %[coeff], 0x48) 537 MMI_LDC1(%[ftmp5], %[coeff], 0x50) 538 MMI_LDC1(%[ftmp6], %[coeff], 0x58) 539 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 540 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 541 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 542 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 543 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 544 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 545 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 546 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 547 "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t" 548 "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t" 549 550 /* ftmp14: dst7,dst6 */ 551 MMI_LDC1(%[ftmp3], %[coeff], 0x60) 552 MMI_LDC1(%[ftmp4], %[coeff], 0x68) 553 MMI_LDC1(%[ftmp5], %[coeff], 0x70) 554 MMI_LDC1(%[ftmp6], %[coeff], 0x78) 555 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 556 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 557 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 558 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 559 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 560 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 561 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 562 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 563 "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t" 564 "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t" 565 566 /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */ 567 "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t" 568 "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t" 569 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" 570 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" 571 "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t" 572 "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t" 573 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 574 "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t" 575 "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t" 576 "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 577 MMI_SDC1(%[ftmp9], %[dst], 0x00) 578 MMI_SDC1(%[ftmp10], %[dst], 0x08) 579 580 PTR_ADDIU "%[src], %[src], 0x10 \n\t" 581 PTR_ADDIU "%[dst], %[dst], 0x10 \n\t" 582 "addiu %[count], %[count], -0x01 \n\t" 583 "bnez %[count], 1b \n\t" 584 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 585 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 586 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 587 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 588 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 589 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 590 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), 591 [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]), 592 [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count) 593 : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff) 594 : "memory" 595 ); 596 597 src = block; 598 599 // 2nd loop 600 __asm__ volatile ( 601 "li %[tmp0], 0x44 \n\t" 602 "mtc1 %[tmp0], %[ftmp15] \n\t" 603 604 // 1st part 605 "li %[tmp0], 0x07 \n\t" 606 "mtc1 %[tmp0], %[ftmp0] \n\t" 607 MMI_LDC1(%[ftmp1], %[src], 0x00) 608 MMI_LDC1(%[ftmp2], %[src], 0x10) 609 MMI_LDC1(%[ftmp3], %[src], 0x20) 610 MMI_LDC1(%[ftmp4], %[src], 0x30) 611 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 612 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 613 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 614 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 615 616 /* ftmp11: dst03,dst02,dst01,dst00 */ 617 "li %[tmp0], 0x00160011 \n\t" 618 "mtc1 %[tmp0], %[ftmp3] \n\t" 619 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 620 "li %[tmp0], 0x000a0011 \n\t" 621 "mtc1 %[tmp0], %[ftmp4] \n\t" 622 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 623 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 624 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 625 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 626 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 627 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 628 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 629 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 630 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 631 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 632 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 633 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 634 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 635 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 636 637 /* ftmp12: dst13,dst12,dst11,dst10 */ 638 "li %[tmp0], 0x000a0011 \n\t" 639 "mtc1 %[tmp0], %[ftmp3] \n\t" 640 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 641 "li %[tmp0], 0xffeaffef \n\t" 642 "mtc1 %[tmp0], %[ftmp4] \n\t" 643 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 644 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 645 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 646 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 647 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 648 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 649 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 650 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 651 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 652 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 653 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 654 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 655 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 656 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 657 658 /* ftmp13: dst23,dst22,dst21,dst20 */ 659 "li %[tmp0], 0xfff60011 \n\t" 660 "mtc1 %[tmp0], %[ftmp3] \n\t" 661 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 662 "li %[tmp0], 0x0016ffef \n\t" 663 "mtc1 %[tmp0], %[ftmp4] \n\t" 664 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 665 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 666 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 667 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 668 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 669 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 670 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 671 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 672 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 673 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 674 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 675 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 676 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 677 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 678 679 /* ftmp14: dst33,dst32,dst31,dst30 */ 680 "li %[tmp0], 0xffea0011 \n\t" 681 "mtc1 %[tmp0], %[ftmp3] \n\t" 682 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 683 "li %[tmp0], 0xfff60011 \n\t" 684 "mtc1 %[tmp0], %[ftmp4] \n\t" 685 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 686 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 687 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 688 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 689 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 690 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 691 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 692 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 693 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 694 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 695 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 696 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 697 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 698 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 699 700 MMI_LWC1(%[ftmp1], %[dest], 0x00) 701 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 702 MMI_LWC1(%[ftmp2], %[tmp0], 0x00) 703 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 704 MMI_LWC1(%[ftmp3], %[tmp0], 0x00) 705 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 706 MMI_LWC1(%[ftmp4], %[tmp0], 0x00) 707 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 708 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 709 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 710 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 711 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 712 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 713 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 714 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 715 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 716 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 717 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 718 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 719 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 720 MMI_SWC1(%[ftmp1], %[dest], 0x00) 721 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 722 MMI_SWC1(%[ftmp2], %[tmp0], 0x00) 723 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 724 MMI_SWC1(%[ftmp3], %[tmp0], 0x00) 725 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 726 MMI_SWC1(%[ftmp4], %[tmp0], 0x00) 727 728 // 2nd part 729 "li %[tmp0], 0x07 \n\t" 730 "mtc1 %[tmp0], %[ftmp0] \n\t" 731 MMI_LDC1(%[ftmp1], %[src], 0x08) 732 MMI_LDC1(%[ftmp2], %[src], 0x18) 733 MMI_LDC1(%[ftmp3], %[src], 0x28) 734 MMI_LDC1(%[ftmp4], %[src], 0x38) 735 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 736 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 737 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 738 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 739 740 /* ftmp11: dst03,dst02,dst01,dst00 */ 741 "li %[tmp0], 0x00160011 \n\t" 742 "mtc1 %[tmp0], %[ftmp3] \n\t" 743 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 744 "li %[tmp0], 0x000a0011 \n\t" 745 "mtc1 %[tmp0], %[ftmp4] \n\t" 746 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 747 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 748 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 749 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 750 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 751 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 752 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 753 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 754 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 755 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 756 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 757 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 758 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 759 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 760 761 /* ftmp12: dst13,dst12,dst11,dst10 */ 762 "li %[tmp0], 0x000a0011 \n\t" 763 "mtc1 %[tmp0], %[ftmp3] \n\t" 764 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 765 "li %[tmp0], 0xffeaffef \n\t" 766 "mtc1 %[tmp0], %[ftmp4] \n\t" 767 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 768 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 769 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 770 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 771 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 772 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 773 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 774 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 775 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 776 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 777 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 778 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 779 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 780 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 781 782 /* ftmp13: dst23,dst22,dst21,dst20 */ 783 "li %[tmp0], 0xfff60011 \n\t" 784 "mtc1 %[tmp0], %[ftmp3] \n\t" 785 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 786 "li %[tmp0], 0x0016ffef \n\t" 787 "mtc1 %[tmp0], %[ftmp4] \n\t" 788 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 789 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 790 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 791 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 792 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 793 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 794 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 795 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 796 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 797 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 798 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 799 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 800 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 801 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 802 803 /* ftmp14: dst33,dst32,dst31,dst30 */ 804 "li %[tmp0], 0xffea0011 \n\t" 805 "mtc1 %[tmp0], %[ftmp3] \n\t" 806 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 807 "li %[tmp0], 0xfff60011 \n\t" 808 "mtc1 %[tmp0], %[ftmp4] \n\t" 809 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 810 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 811 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 812 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 813 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 814 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 815 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 816 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 817 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 818 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 819 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 820 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 821 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 822 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 823 824 MMI_LWC1(%[ftmp1], %[dest], 0x04) 825 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 826 MMI_LWC1(%[ftmp2], %[tmp0], 0x04) 827 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 828 MMI_LWC1(%[ftmp3], %[tmp0], 0x04) 829 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 830 MMI_LWC1(%[ftmp4], %[tmp0], 0x04) 831 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 832 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 833 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 834 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 835 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 836 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 837 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 838 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 839 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 840 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 841 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 842 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 843 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 844 MMI_SWC1(%[ftmp1], %[dest], 0x04) 845 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 846 MMI_SWC1(%[ftmp2], %[tmp0], 0x04) 847 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 848 MMI_SWC1(%[ftmp3], %[tmp0], 0x04) 849 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 850 MMI_SWC1(%[ftmp4], %[tmp0], 0x04) 851 852 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 853 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 854 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 855 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 856 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 857 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 858 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), 859 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), 860 [tmp0]"=&r"(tmp[0]) 861 : [ff_pw_64]"f"(ff_pw_32_64.f), 862 [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize) 863 :"memory" 864 ); 865} 866#endif 867 868/* Do inverse transform on 4x8 parts of block */ 869void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 870{ 871 int dc = block[0]; 872 double ftmp[9]; 873 union mmi_intfloat64 dc_u; 874 DECLARE_VAR_LOW32; 875 876 dc = (17 * dc + 4) >> 3; 877 dc = (12 * dc + 64) >> 7; 878 dc_u.i = dc; 879 880 __asm__ volatile( 881 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 882 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 883 884 MMI_LWC1(%[ftmp1], %[dest0], 0x00) 885 MMI_LWC1(%[ftmp2], %[dest1], 0x00) 886 MMI_LWC1(%[ftmp3], %[dest2], 0x00) 887 MMI_LWC1(%[ftmp4], %[dest3], 0x00) 888 MMI_LWC1(%[ftmp5], %[dest4], 0x00) 889 MMI_LWC1(%[ftmp6], %[dest5], 0x00) 890 MMI_LWC1(%[ftmp7], %[dest6], 0x00) 891 MMI_LWC1(%[ftmp8], %[dest7], 0x00) 892 893 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 894 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 895 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 896 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 897 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 898 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 899 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 900 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 901 902 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 903 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 904 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 905 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 906 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 907 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 908 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 909 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 910 911 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 912 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 913 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 914 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 915 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 916 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 917 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 918 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 919 920 MMI_SWC1(%[ftmp1], %[dest0], 0x00) 921 MMI_SWC1(%[ftmp2], %[dest1], 0x00) 922 MMI_SWC1(%[ftmp3], %[dest2], 0x00) 923 MMI_SWC1(%[ftmp4], %[dest3], 0x00) 924 MMI_SWC1(%[ftmp5], %[dest4], 0x00) 925 MMI_SWC1(%[ftmp6], %[dest5], 0x00) 926 MMI_SWC1(%[ftmp7], %[dest6], 0x00) 927 MMI_SWC1(%[ftmp8], %[dest7], 0x00) 928 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 929 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 930 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 931 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 932 RESTRICT_ASM_LOW32 933 [ftmp8]"=&f"(ftmp[8]) 934 : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize), 935 [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize), 936 [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize), 937 [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize), 938 [dc]"f"(dc_u.f) 939 : "memory" 940 ); 941} 942 943#if _MIPS_SIM != _ABIO32 944void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 945{ 946 int16_t *src = block; 947 int16_t *dst = block; 948 double ftmp[23]; 949 uint64_t count = 8, tmp[1]; 950 int16_t coeff[16] = {17, 22, 17, 10, 951 17, 10,-17,-22, 952 17,-10,-17, 22, 953 17,-22, 17,-10}; 954 955 // 1st loop 956 __asm__ volatile ( 957 958 "li %[tmp0], 0x03 \n\t" 959 "mtc1 %[tmp0], %[ftmp0] \n\t" 960 961 MMI_LDC1(%[ftmp2], %[coeff], 0x00) 962 MMI_LDC1(%[ftmp3], %[coeff], 0x08) 963 MMI_LDC1(%[ftmp4], %[coeff], 0x10) 964 MMI_LDC1(%[ftmp5], %[coeff], 0x18) 965 "1: \n\t" 966 /* ftmp8: dst3,dst2,dst1,dst0 */ 967 MMI_LDC1(%[ftmp1], %[src], 0x00) 968 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 969 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 970 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 971 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 972 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 973 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 974 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 975 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 976 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 977 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 978 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 979 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 980 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 981 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 982 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 983 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 984 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 985 MMI_SDC1(%[ftmp8], %[dst], 0x00) 986 987 PTR_ADDIU "%[src], %[src], 0x10 \n\t" 988 PTR_ADDIU "%[dst], %[dst], 0x10 \n\t" 989 "addiu %[count], %[count], -0x01 \n\t" 990 "bnez %[count], 1b \n\t" 991 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 992 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 993 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 994 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 995 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 996 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 997 [tmp0]"=&r"(tmp[0]), [count]"+&r"(count), 998 [src]"+&r"(src), [dst]"+&r"(dst) 999 : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff) 1000 : "memory" 1001 ); 1002 1003 src = block; 1004 1005 // 2nd loop 1006 __asm__ volatile ( 1007 "li %[tmp0], 0x07 \n\t" 1008 "mtc1 %[tmp0], %[ftmp0] \n\t" 1009 1010 MMI_LDC1(%[ftmp1], %[src], 0x00) 1011 MMI_LDC1(%[ftmp2], %[src], 0x20) 1012 MMI_LDC1(%[ftmp3], %[src], 0x40) 1013 MMI_LDC1(%[ftmp4], %[src], 0x60) 1014 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1015 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1016 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1017 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1018 1019 MMI_LDC1(%[ftmp1], %[src], 0x10) 1020 MMI_LDC1(%[ftmp2], %[src], 0x30) 1021 MMI_LDC1(%[ftmp3], %[src], 0x50) 1022 MMI_LDC1(%[ftmp4], %[src], 0x70) 1023 "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1024 "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1025 "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t" 1026 "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t" 1027 1028 /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */ 1029 VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c, 1030 0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1]) 1031 1032 /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */ 1033 VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4, 1034 0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1]) 1035 1036 /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */ 1037 VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4, 1038 0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1]) 1039 1040 /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */ 1041 VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c, 1042 0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1]) 1043 1044 MMI_LWC1(%[ftmp1], %[dest], 0x00) 1045 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1046 MMI_LWC1(%[ftmp2], %[tmp0], 0x00) 1047 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1048 MMI_LWC1(%[ftmp3], %[tmp0], 0x00) 1049 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1050 MMI_LWC1(%[ftmp4], %[tmp0], 0x00) 1051 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1052 MMI_LWC1(%[ftmp5], %[tmp0], 0x00) 1053 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1054 MMI_LWC1(%[ftmp6], %[tmp0], 0x00) 1055 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1056 MMI_LWC1(%[ftmp7], %[tmp0], 0x00) 1057 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1058 MMI_LWC1(%[ftmp8], %[tmp0], 0x00) 1059 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1060 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1061 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1062 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1063 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1064 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1065 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1066 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1067 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1068 1069 "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t" 1070 "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t" 1071 "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t" 1072 "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t" 1073 "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t" 1074 "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t" 1075 "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t" 1076 "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t" 1077 1078 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1079 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1080 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1081 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1082 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1083 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1084 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1085 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1086 1087 MMI_SWC1(%[ftmp1], %[dest], 0x00) 1088 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1089 MMI_SWC1(%[ftmp2], %[tmp0], 0x00) 1090 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1091 MMI_SWC1(%[ftmp3], %[tmp0], 0x00) 1092 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1093 MMI_SWC1(%[ftmp4], %[tmp0], 0x00) 1094 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1095 MMI_SWC1(%[ftmp5], %[tmp0], 0x00) 1096 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1097 MMI_SWC1(%[ftmp6], %[tmp0], 0x00) 1098 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1099 MMI_SWC1(%[ftmp7], %[tmp0], 0x00) 1100 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1101 MMI_SWC1(%[ftmp8], %[tmp0], 0x00) 1102 1103 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1104 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1105 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1106 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1107 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 1108 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 1109 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), 1110 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), 1111 [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]), 1112 [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]), 1113 [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]), 1114 [ftmp22]"=&f"(ftmp[22]), 1115 [tmp0]"=&r"(tmp[0]) 1116 : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f), 1117 [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize) 1118 : "memory" 1119 ); 1120} 1121#endif 1122 1123/* Do inverse transform on 4x4 part of block */ 1124void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 1125{ 1126 int dc = block[0]; 1127 double ftmp[5]; 1128 union mmi_intfloat64 dc_u; 1129 DECLARE_VAR_LOW32; 1130 1131 dc = (17 * dc + 4) >> 3; 1132 dc = (17 * dc + 64) >> 7; 1133 dc_u.i = dc; 1134 1135 __asm__ volatile( 1136 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1137 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 1138 1139 MMI_LWC1(%[ftmp1], %[dest0], 0x00) 1140 MMI_LWC1(%[ftmp2], %[dest1], 0x00) 1141 MMI_LWC1(%[ftmp3], %[dest2], 0x00) 1142 MMI_LWC1(%[ftmp4], %[dest3], 0x00) 1143 1144 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1145 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1146 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1147 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1148 1149 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 1150 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 1151 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 1152 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 1153 1154 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1155 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1156 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1157 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1158 1159 MMI_SWC1(%[ftmp1], %[dest0], 0x00) 1160 MMI_SWC1(%[ftmp2], %[dest1], 0x00) 1161 MMI_SWC1(%[ftmp3], %[dest2], 0x00) 1162 MMI_SWC1(%[ftmp4], %[dest3], 0x00) 1163 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1164 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1165 RESTRICT_ASM_LOW32 1166 [ftmp4]"=&f"(ftmp[4]) 1167 : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize), 1168 [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize), 1169 [dc]"f"(dc_u.f) 1170 : "memory" 1171 ); 1172} 1173 1174void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block) 1175{ 1176 int16_t *src = block; 1177 int16_t *dst = block; 1178 double ftmp[16]; 1179 uint32_t count = 4, tmp[1]; 1180 int16_t coeff[16] = {17, 22, 17, 10, 1181 17, 10,-17,-22, 1182 17,-10,-17, 22, 1183 17,-22, 17,-10}; 1184 // 1st loop 1185 __asm__ volatile ( 1186 1187 "li %[tmp0], 0x03 \n\t" 1188 "mtc1 %[tmp0], %[ftmp0] \n\t" 1189 MMI_LDC1(%[ftmp2], %[coeff], 0x00) 1190 MMI_LDC1(%[ftmp3], %[coeff], 0x08) 1191 MMI_LDC1(%[ftmp4], %[coeff], 0x10) 1192 MMI_LDC1(%[ftmp5], %[coeff], 0x18) 1193 "1: \n\t" 1194 /* ftmp8: dst3,dst2,dst1,dst0 */ 1195 MMI_LDC1(%[ftmp1], %[src], 0x00) 1196 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 1197 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 1198 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 1199 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 1200 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 1201 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 1202 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1203 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1204 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 1205 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 1206 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 1207 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 1208 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1209 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1210 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1211 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1212 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 1213 MMI_SDC1(%[ftmp8], %[dst], 0x00) 1214 1215 PTR_ADDIU "%[src], %[src], 0x10 \n\t" 1216 PTR_ADDIU "%[dst], %[dst], 0x10 \n\t" 1217 "addiu %[count], %[count], -0x01 \n\t" 1218 "bnez %[count], 1b \n\t" 1219 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1220 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1221 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1222 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1223 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 1224 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 1225 [tmp0]"=&r"(tmp[0]), [count]"+&r"(count), 1226 [src]"+&r"(src), [dst]"+&r"(dst) 1227 : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff) 1228 : "memory" 1229 ); 1230 1231 src = block; 1232 1233 // 2nd loop 1234 __asm__ volatile ( 1235 "li %[tmp0], 0x07 \n\t" 1236 "mtc1 %[tmp0], %[ftmp0] \n\t" 1237 "li %[tmp0], 0x44 \n\t" 1238 "mtc1 %[tmp0], %[ftmp15] \n\t" 1239 1240 MMI_LDC1(%[ftmp1], %[src], 0x00) 1241 MMI_LDC1(%[ftmp2], %[src], 0x10) 1242 MMI_LDC1(%[ftmp3], %[src], 0x20) 1243 MMI_LDC1(%[ftmp4], %[src], 0x30) 1244 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1245 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1246 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1247 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1248 1249 /* ftmp11: dst03,dst02,dst01,dst00 */ 1250 "li %[tmp0], 0x00160011 \n\t" 1251 "mtc1 %[tmp0], %[ftmp3] \n\t" 1252 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1253 "li %[tmp0], 0x000a0011 \n\t" 1254 "mtc1 %[tmp0], %[ftmp4] \n\t" 1255 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1256 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1257 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1258 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1259 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1260 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1261 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1262 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1263 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1264 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1265 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1266 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1267 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1268 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 1269 1270 /* ftmp12: dst13,dst12,dst11,dst10 */ 1271 "li %[tmp0], 0x000a0011 \n\t" 1272 "mtc1 %[tmp0], %[ftmp3] \n\t" 1273 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1274 "li %[tmp0], 0xffeaffef \n\t" 1275 "mtc1 %[tmp0], %[ftmp4] \n\t" 1276 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1277 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1278 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1279 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1280 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1281 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1282 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1283 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1284 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1285 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1286 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1287 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1288 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1289 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 1290 1291 /* ftmp13: dst23,dst22,dst21,dst20 */ 1292 "li %[tmp0], 0xfff60011 \n\t" 1293 "mtc1 %[tmp0], %[ftmp3] \n\t" 1294 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1295 "li %[tmp0], 0x0016ffef \n\t" 1296 "mtc1 %[tmp0], %[ftmp4] \n\t" 1297 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1298 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1299 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1300 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1301 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1302 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1303 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1304 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1305 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1306 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1307 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1308 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1309 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1310 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 1311 1312 /* ftmp14: dst33,dst32,dst31,dst30 */ 1313 "li %[tmp0], 0xffea0011 \n\t" 1314 "mtc1 %[tmp0], %[ftmp3] \n\t" 1315 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1316 "li %[tmp0], 0xfff60011 \n\t" 1317 "mtc1 %[tmp0], %[ftmp4] \n\t" 1318 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1319 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1320 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1321 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1322 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1323 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1324 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1325 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1326 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1327 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1328 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1329 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1330 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1331 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 1332 1333 MMI_LWC1(%[ftmp1], %[dest], 0x00) 1334 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1335 MMI_LWC1(%[ftmp2], %[tmp0], 0x00) 1336 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1337 MMI_LWC1(%[ftmp3], %[tmp0], 0x00) 1338 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1339 MMI_LWC1(%[ftmp4], %[tmp0], 0x00) 1340 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1341 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1342 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1343 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1344 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1345 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 1346 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 1347 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 1348 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 1349 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1350 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1351 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1352 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1353 1354 MMI_SWC1(%[ftmp1], %[dest], 0x00) 1355 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1356 MMI_SWC1(%[ftmp2], %[tmp0], 0x00) 1357 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1358 MMI_SWC1(%[ftmp3], %[tmp0], 0x00) 1359 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1360 MMI_SWC1(%[ftmp4], %[tmp0], 0x00) 1361 1362 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 1363 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 1364 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 1365 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 1366 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 1367 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 1368 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), 1369 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]), 1370 [tmp0]"=&r"(tmp[0]) 1371 : [ff_pw_64]"f"(ff_pw_32_64.f), 1372 [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize) 1373 :"memory" 1374 ); 1375} 1376 1377/* Apply overlap transform to horizontal edge */ 1378void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride) 1379{ 1380 int i; 1381 int a, b, c, d; 1382 int d1, d2; 1383 int rnd = 1; 1384 for (i = 0; i < 8; i++) { 1385 a = src[-2]; 1386 b = src[-1]; 1387 c = src[0]; 1388 d = src[1]; 1389 d1 = (a - d + 3 + rnd) >> 3; 1390 d2 = (a - d + b - c + 4 - rnd) >> 3; 1391 1392 src[-2] = a - d1; 1393 src[-1] = av_clip_uint8(b - d2); 1394 src[0] = av_clip_uint8(c + d2); 1395 src[1] = d + d1; 1396 src += stride; 1397 rnd = !rnd; 1398 } 1399} 1400 1401void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags) 1402{ 1403 int i; 1404 int a, b, c, d; 1405 int d1, d2; 1406 int rnd1 = flags & 2 ? 3 : 4; 1407 int rnd2 = 7 - rnd1; 1408 for (i = 0; i < 8; i++) { 1409 a = left[6]; 1410 b = left[7]; 1411 c = right[0]; 1412 d = right[1]; 1413 d1 = a - d; 1414 d2 = a - d + b - c; 1415 1416 left[6] = ((a << 3) - d1 + rnd1) >> 3; 1417 left[7] = ((b << 3) - d2 + rnd2) >> 3; 1418 right[0] = ((c << 3) + d2 + rnd1) >> 3; 1419 right[1] = ((d << 3) + d1 + rnd2) >> 3; 1420 1421 right += right_stride; 1422 left += left_stride; 1423 if (flags & 1) { 1424 rnd2 = 7 - rnd2; 1425 rnd1 = 7 - rnd1; 1426 } 1427 } 1428} 1429 1430/* Apply overlap transform to vertical edge */ 1431void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride) 1432{ 1433 int i; 1434 int a, b, c, d; 1435 int d1, d2; 1436 int rnd = 1; 1437 for (i = 0; i < 8; i++) { 1438 a = src[-2 * stride]; 1439 b = src[-stride]; 1440 c = src[0]; 1441 d = src[stride]; 1442 d1 = (a - d + 3 + rnd) >> 3; 1443 d2 = (a - d + b - c + 4 - rnd) >> 3; 1444 1445 src[-2 * stride] = a - d1; 1446 src[-stride] = av_clip_uint8(b - d2); 1447 src[0] = av_clip_uint8(c + d2); 1448 src[stride] = d + d1; 1449 src++; 1450 rnd = !rnd; 1451 } 1452} 1453 1454void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom) 1455{ 1456 int i; 1457 int a, b, c, d; 1458 int d1, d2; 1459 int rnd1 = 4, rnd2 = 3; 1460 for (i = 0; i < 8; i++) { 1461 a = top[48]; 1462 b = top[56]; 1463 c = bottom[0]; 1464 d = bottom[8]; 1465 d1 = a - d; 1466 d2 = a - d + b - c; 1467 1468 top[48] = ((a << 3) - d1 + rnd1) >> 3; 1469 top[56] = ((b << 3) - d2 + rnd2) >> 3; 1470 bottom[0] = ((c << 3) + d2 + rnd1) >> 3; 1471 bottom[8] = ((d << 3) + d1 + rnd2) >> 3; 1472 1473 bottom++; 1474 top++; 1475 rnd2 = 7 - rnd2; 1476 rnd1 = 7 - rnd1; 1477 } 1478} 1479 1480/** 1481 * VC-1 in-loop deblocking filter for one line 1482 * @param src source block type 1483 * @param stride block stride 1484 * @param pq block quantizer 1485 * @return whether other 3 pairs should be filtered or not 1486 * @see 8.6 1487 */ 1488static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq) 1489{ 1490 int a0 = (2 * (src[-2 * stride] - src[1 * stride]) - 1491 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3; 1492 int a0_sign = a0 >> 31; /* Store sign */ 1493 1494 a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */ 1495 if (a0 < pq) { 1496 int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) - 1497 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3); 1498 int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) - 1499 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3); 1500 if (a1 < a0 || a2 < a0) { 1501 int clip = src[-1 * stride] - src[0 * stride]; 1502 int clip_sign = clip >> 31; 1503 1504 clip = ((clip ^ clip_sign) - clip_sign) >> 1; 1505 if (clip) { 1506 int a3 = FFMIN(a1, a2); 1507 int d = 5 * (a3 - a0); 1508 int d_sign = (d >> 31); 1509 1510 d = ((d ^ d_sign) - d_sign) >> 3; 1511 d_sign ^= a0_sign; 1512 1513 if (d_sign ^ clip_sign) 1514 d = 0; 1515 else { 1516 d = FFMIN(d, clip); 1517 d = (d ^ d_sign) - d_sign; /* Restore sign */ 1518 src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d); 1519 src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d); 1520 } 1521 return 1; 1522 } 1523 } 1524 } 1525 return 0; 1526} 1527 1528/** 1529 * VC-1 in-loop deblocking filter 1530 * @param src source block type 1531 * @param step distance between horizontally adjacent elements 1532 * @param stride distance between vertically adjacent elements 1533 * @param len edge length to filter (4 or 8 pixels) 1534 * @param pq block quantizer 1535 * @see 8.6 1536 */ 1537static inline void vc1_loop_filter(uint8_t *src, int step, int stride, 1538 int len, int pq) 1539{ 1540 int i; 1541 int filt3; 1542 1543 for (i = 0; i < len; i += 4) { 1544 filt3 = vc1_filter_line(src + 2 * step, stride, pq); 1545 if (filt3) { 1546 vc1_filter_line(src + 0 * step, stride, pq); 1547 vc1_filter_line(src + 1 * step, stride, pq); 1548 vc1_filter_line(src + 3 * step, stride, pq); 1549 } 1550 src += step * 4; 1551 } 1552} 1553 1554void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1555{ 1556 vc1_loop_filter(src, 1, stride, 4, pq); 1557} 1558 1559void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1560{ 1561 vc1_loop_filter(src, stride, 1, 4, pq); 1562} 1563 1564void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1565{ 1566 vc1_loop_filter(src, 1, stride, 8, pq); 1567} 1568 1569void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1570{ 1571 vc1_loop_filter(src, stride, 1, 8, pq); 1572} 1573 1574void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1575{ 1576 vc1_loop_filter(src, 1, stride, 16, pq); 1577} 1578 1579void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq) 1580{ 1581 vc1_loop_filter(src, stride, 1, 16, pq); 1582} 1583 1584void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, 1585 ptrdiff_t stride, int rnd) 1586{ 1587 ff_put_pixels8_8_mmi(dst, src, stride, 8); 1588} 1589void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, 1590 ptrdiff_t stride, int rnd) 1591{ 1592 ff_put_pixels16_8_mmi(dst, src, stride, 16); 1593} 1594void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, 1595 ptrdiff_t stride, int rnd) 1596{ 1597 ff_avg_pixels8_8_mmi(dst, src, stride, 8); 1598} 1599void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, 1600 ptrdiff_t stride, int rnd) 1601{ 1602 ff_avg_pixels16_8_mmi(dst, src, stride, 16); 1603} 1604 1605#define OP_PUT(S, D) 1606#define OP_AVG(S, D) \ 1607 "ldc1 $f16, "#S" \n\t" \ 1608 "pavgb "#D", "#D", $f16 \n\t" 1609 1610/** Add rounder from $f14 to $f6 and pack result at destination */ 1611#define NORMALIZE_MMI(SHIFT) \ 1612 "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \ 1613 "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \ 1614 "psrah $f6, $f6, "SHIFT" \n\t" \ 1615 "psrah $f8, $f8, "SHIFT" \n\t" 1616 1617#define TRANSFER_DO_PACK(OP) \ 1618 "packushb $f6, $f6, $f8 \n\t" \ 1619 OP((%[dst]), $f6) \ 1620 "sdc1 $f6, 0x00(%[dst]) \n\t" 1621 1622#define TRANSFER_DONT_PACK(OP) \ 1623 OP(0(%[dst]), $f6) \ 1624 OP(8(%[dst]), $f8) \ 1625 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1626 "sdc1 $f8, 0x08(%[dst]) \n\t" 1627 1628/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ 1629#define DO_UNPACK(reg) \ 1630 "punpcklbh "reg", "reg", $f0 \n\t" 1631#define DONT_UNPACK(reg) 1632 1633/** Compute the rounder 32-r or 8-r and unpacks it to $f14 */ 1634#define LOAD_ROUNDER_MMI(ROUND) \ 1635 "lwc1 $f14, "ROUND" \n\t" \ 1636 "punpcklhw $f14, $f14, $f14 \n\t" \ 1637 "punpcklwd $f14, $f14, $f14 \n\t" 1638 1639 1640#define SHIFT2_LINE(OFF, R0, R1, R2, R3) \ 1641 "paddh "#R1", "#R1", "#R2" \n\t" \ 1642 PTR_ADDU "$9, %[src], %[stride1] \n\t" \ 1643 MMI_ULWC1(R0, $9, 0x00) \ 1644 "pmullh "#R1", "#R1", $f6 \n\t" \ 1645 "punpcklbh "#R0", "#R0", $f0 \n\t" \ 1646 PTR_ADDU "$9, %[src], %[stride] \n\t" \ 1647 MMI_ULWC1(R3, $9, 0x00) \ 1648 "psubh "#R1", "#R1", "#R0" \n\t" \ 1649 "punpcklbh "#R3", "#R3", $f0 \n\t" \ 1650 "paddh "#R1", "#R1", $f14 \n\t" \ 1651 "psubh "#R1", "#R1", "#R3" \n\t" \ 1652 "psrah "#R1", "#R1", %[shift] \n\t" \ 1653 MMI_SDC1(R1, %[dst], OFF) \ 1654 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1655 1656/** Sacrificing $f12 makes it possible to pipeline loads from src */ 1657static void vc1_put_ver_16b_shift2_mmi(int16_t *dst, 1658 const uint8_t *src, mips_reg stride, 1659 int rnd, int64_t shift) 1660{ 1661 union mmi_intfloat64 shift_u; 1662 DECLARE_VAR_LOW32; 1663 DECLARE_VAR_ADDRT; 1664 shift_u.i = shift; 1665 1666 __asm__ volatile( 1667 "pxor $f0, $f0, $f0 \n\t" 1668 "li $8, 0x03 \n\t" 1669 LOAD_ROUNDER_MMI("%[rnd]") 1670 "1: \n\t" 1671 MMI_ULWC1($f4, %[src], 0x00) 1672 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1673 MMI_ULWC1($f6, %[src], 0x00) 1674 "punpcklbh $f4, $f4, $f0 \n\t" 1675 "punpcklbh $f6, $f6, $f0 \n\t" 1676 SHIFT2_LINE( 0, $f2, $f4, $f6, $f8) 1677 SHIFT2_LINE( 24, $f4, $f6, $f8, $f2) 1678 SHIFT2_LINE( 48, $f6, $f8, $f2, $f4) 1679 SHIFT2_LINE( 72, $f8, $f2, $f4, $f6) 1680 SHIFT2_LINE( 96, $f2, $f4, $f6, $f8) 1681 SHIFT2_LINE(120, $f4, $f6, $f8, $f2) 1682 SHIFT2_LINE(144, $f6, $f8, $f2, $f4) 1683 SHIFT2_LINE(168, $f8, $f2, $f4, $f6) 1684 PTR_SUBU "%[src], %[src], %[stride2] \n\t" 1685 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" 1686 "addiu $8, $8, -0x01 \n\t" 1687 "bnez $8, 1b \n\t" 1688 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT 1689 [src]"+r"(src), [dst]"+r"(dst) 1690 : [stride]"r"(stride), [stride1]"r"(-2*stride), 1691 [shift]"f"(shift_u.f), [rnd]"m"(rnd), 1692 [stride2]"r"(9*stride-4) 1693 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", 1694 "$f14", "$f16", "memory" 1695 ); 1696} 1697 1698/** 1699 * Data is already unpacked, so some operations can directly be made from 1700 * memory. 1701 */ 1702#define VC1_HOR_16B_SHIFT2(OP, OPNAME) \ 1703static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \ 1704 const int16_t *src, int rnd) \ 1705{ \ 1706 int h = 8; \ 1707 DECLARE_VAR_ALL64; \ 1708 DECLARE_VAR_ADDRT; \ 1709 \ 1710 src -= 1; \ 1711 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \ 1712 \ 1713 __asm__ volatile( \ 1714 LOAD_ROUNDER_MMI("%[rnd]") \ 1715 "1: \n\t" \ 1716 MMI_ULDC1($f2, %[src], 0x00) \ 1717 MMI_ULDC1($f4, %[src], 0x08) \ 1718 MMI_ULDC1($f6, %[src], 0x02) \ 1719 MMI_ULDC1($f8, %[src], 0x0a) \ 1720 MMI_ULDC1($f0, %[src], 0x06) \ 1721 "paddh $f2, $f2, $f0 \n\t" \ 1722 MMI_ULDC1($f0, %[src], 0x0e) \ 1723 "paddh $f4, $f4, $f0 \n\t" \ 1724 MMI_ULDC1($f0, %[src], 0x04) \ 1725 "paddh $f6, $f6, $f0 \n\t" \ 1726 MMI_ULDC1($f0, %[src], 0x0b) \ 1727 "paddh $f8, $f8, $f0 \n\t" \ 1728 "pmullh $f6, $f6, %[ff_pw_9] \n\t" \ 1729 "pmullh $f8, $f8, %[ff_pw_9] \n\t" \ 1730 "psubh $f6, $f6, $f2 \n\t" \ 1731 "psubh $f8, $f8, $f4 \n\t" \ 1732 "li $8, 0x07 \n\t" \ 1733 "mtc1 $8, $f16 \n\t" \ 1734 NORMALIZE_MMI("$f16") \ 1735 /* Remove bias */ \ 1736 "paddh $f6, $f6, %[ff_pw_128] \n\t" \ 1737 "paddh $f8, $f8, %[ff_pw_128] \n\t" \ 1738 TRANSFER_DO_PACK(OP) \ 1739 "addiu %[h], %[h], -0x01 \n\t" \ 1740 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \ 1741 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1742 "bnez %[h], 1b \n\t" \ 1743 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1744 [h]"+r"(h), \ 1745 [src]"+r"(src), [dst]"+r"(dst) \ 1746 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1747 [ff_pw_9]"f"(ff_pw_9.f), [ff_pw_128]"f"(ff_pw_128.f) \ 1748 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14", \ 1749 "$f16", "memory" \ 1750 ); \ 1751} 1752 1753VC1_HOR_16B_SHIFT2(OP_PUT, put_) 1754VC1_HOR_16B_SHIFT2(OP_AVG, avg_) 1755 1756/** 1757 * Purely vertical or horizontal 1/2 shift interpolation. 1758 * Sacrify $f12 for *9 factor. 1759 */ 1760#define VC1_SHIFT2(OP, OPNAME)\ 1761static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \ 1762 mips_reg stride, int rnd, \ 1763 mips_reg offset) \ 1764{ \ 1765 DECLARE_VAR_LOW32; \ 1766 DECLARE_VAR_ADDRT; \ 1767 \ 1768 rnd = 8 - rnd; \ 1769 \ 1770 __asm__ volatile( \ 1771 "pxor $f0, $f0, $f0 \n\t" \ 1772 "li $10, 0x08 \n\t" \ 1773 LOAD_ROUNDER_MMI("%[rnd]") \ 1774 "1: \n\t" \ 1775 MMI_ULWC1($f6, %[src], 0x00) \ 1776 MMI_ULWC1($f8, %[src], 0x04) \ 1777 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1778 MMI_ULWC1($f2, $9, 0x00) \ 1779 MMI_ULWC1($f4, $9, 0x04) \ 1780 PTR_ADDU "%[src], %[src], %[offset] \n\t" \ 1781 "punpcklbh $f6, $f6, $f0 \n\t" \ 1782 "punpcklbh $f8, $f8, $f0 \n\t" \ 1783 "punpcklbh $f2, $f2, $f0 \n\t" \ 1784 "punpcklbh $f4, $f4, $f0 \n\t" \ 1785 "paddh $f6, $f6, $f2 \n\t" \ 1786 "paddh $f8, $f8, $f4 \n\t" \ 1787 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \ 1788 MMI_ULWC1($f2, $9, 0x00) \ 1789 MMI_ULWC1($f4, $9, 0x04) \ 1790 "pmullh $f6, $f6, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \ 1791 "pmullh $f8, $f8, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \ 1792 "punpcklbh $f2, $f2, $f0 \n\t" \ 1793 "punpcklbh $f4, $f4, $f0 \n\t" \ 1794 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \ 1795 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \ 1796 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1797 MMI_ULWC1($f2, $9, 0x00) \ 1798 MMI_ULWC1($f4, $9, 0x04) \ 1799 "punpcklbh $f2, $f2, $f0 \n\t" \ 1800 "punpcklbh $f4, $f4, $f0 \n\t" \ 1801 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \ 1802 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \ 1803 "li $8, 0x04 \n\t" \ 1804 "mtc1 $8, $f16 \n\t" \ 1805 NORMALIZE_MMI("$f16") \ 1806 "packushb $f6, $f6, $f8 \n\t" \ 1807 OP((%[dst]), $f6) \ 1808 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1809 "addiu $10, $10, -0x01 \n\t" \ 1810 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \ 1811 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1812 "bnez $10, 1b \n\t" \ 1813 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1814 [src]"+r"(src), [dst]"+r"(dst) \ 1815 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \ 1816 [stride]"r"(stride), [rnd]"m"(rnd), \ 1817 [stride1]"r"(stride-offset), \ 1818 [ff_pw_9]"f"(ff_pw_9.f) \ 1819 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \ 1820 "$f14", "$f16", "memory" \ 1821 ); \ 1822} 1823 1824VC1_SHIFT2(OP_PUT, put_) 1825VC1_SHIFT2(OP_AVG, avg_) 1826 1827/** 1828 * Core of the 1/4 and 3/4 shift bicubic interpolation. 1829 * 1830 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). 1831 * @param LOAD "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked. 1832 * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1. 1833 * @param A1 Stride address of 1st tap (beware of unpacked/packed). 1834 * @param A2 Stride address of 2nd tap 1835 * @param A3 Stride address of 3rd tap 1836 * @param A4 Stride address of 4th tap 1837 */ 1838#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \ 1839 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1840 LOAD($f2, $9, M*0) \ 1841 LOAD($f4, $9, M*4) \ 1842 UNPACK("$f2") \ 1843 UNPACK("$f4") \ 1844 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \ 1845 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \ 1846 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1847 LOAD($f6, $9, M*0) \ 1848 LOAD($f8, $9, M*4) \ 1849 UNPACK("$f6") \ 1850 UNPACK("$f8") \ 1851 "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \ 1852 "pmullh $f8, $f8, %[ff_pw_18] \n\t" /* *18 */ \ 1853 "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \ 1854 "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \ 1855 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1856 LOAD($f2, $9, M*0) \ 1857 LOAD($f4, $9, M*4) \ 1858 UNPACK("$f2") \ 1859 UNPACK("$f4") \ 1860 "li $8, 0x02 \n\t" \ 1861 "mtc1 $8, $f16 \n\t" \ 1862 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \ 1863 "psllh $f4, $f4, $f16 \n\t" /* 4* */ \ 1864 "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \ 1865 "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \ 1866 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1867 LOAD($f2, $9, M*0) \ 1868 LOAD($f4, $9, M*4) \ 1869 UNPACK("$f2") \ 1870 UNPACK("$f4") \ 1871 "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \ 1872 "pmullh $f4, $f4, %[ff_pw_53] \n\t" /* *53 */ \ 1873 "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \ 1874 "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */ 1875 1876/** 1877 * Macro to build the vertical 16bits version of vc1_put_shift[13]. 1878 * Here, offset=src_stride. Parameters passed A1 to A4 must use 1879 * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride). 1880 * 1881 * @param NAME Either 1 or 3 1882 * @see MSPEL_FILTER13_CORE for information on A1->A4 1883 */ 1884#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ 1885static void \ 1886vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \ 1887 mips_reg src_stride, \ 1888 int rnd, int64_t shift) \ 1889{ \ 1890 int h = 8; \ 1891 union mmi_intfloat64 shift_u; \ 1892 DECLARE_VAR_LOW32; \ 1893 DECLARE_VAR_ADDRT; \ 1894 shift_u.i = shift; \ 1895 \ 1896 src -= src_stride; \ 1897 \ 1898 __asm__ volatile( \ 1899 "pxor $f0, $f0, $f0 \n\t" \ 1900 LOAD_ROUNDER_MMI("%[rnd]") \ 1901 ".p2align 3 \n\t" \ 1902 "1: \n\t" \ 1903 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 1904 NORMALIZE_MMI("%[shift]") \ 1905 TRANSFER_DONT_PACK(OP_PUT) \ 1906 /* Last 3 (in fact 4) bytes on the line */ \ 1907 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1908 MMI_ULWC1($f2, $9, 0x08) \ 1909 DO_UNPACK("$f2") \ 1910 "mov.d $f6, $f2 \n\t" \ 1911 "paddh $f2, $f2, $f2 \n\t" \ 1912 "paddh $f2, $f2, $f6 \n\t" /* 3* */ \ 1913 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1914 MMI_ULWC1($f6, $9, 0x08) \ 1915 DO_UNPACK("$f6") \ 1916 "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \ 1917 "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \ 1918 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1919 MMI_ULWC1($f2, $9, 0x08) \ 1920 DO_UNPACK("$f2") \ 1921 "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \ 1922 "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \ 1923 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1924 MMI_ULWC1($f2, $9, 0x08) \ 1925 DO_UNPACK("$f2") \ 1926 "li $8, 0x02 \n\t" \ 1927 "mtc1 $8, $f16 \n\t" \ 1928 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \ 1929 "psubh $f6, $f6, $f2 \n\t" \ 1930 "paddh $f6, $f6, $f14 \n\t" \ 1931 "li $8, 0x06 \n\t" \ 1932 "mtc1 $8, $f16 \n\t" \ 1933 "psrah $f6, $f6, $f16 \n\t" \ 1934 "sdc1 $f6, 0x10(%[dst]) \n\t" \ 1935 "addiu %[h], %[h], -0x01 \n\t" \ 1936 PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \ 1937 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \ 1938 "bnez %[h], 1b \n\t" \ 1939 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1940 [h]"+r"(h), \ 1941 [src]"+r"(src), [dst]"+r"(dst) \ 1942 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \ 1943 [stride_x3]"r"(3*src_stride), \ 1944 [rnd]"m"(rnd), [shift]"f"(shift_u.f), \ 1945 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \ 1946 [ff_pw_3]"f"(ff_pw_3.f) \ 1947 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \ 1948 "$f14", "$f16", "memory" \ 1949 ); \ 1950} 1951 1952/** 1953 * Macro to build the horizontal 16bits version of vc1_put_shift[13]. 1954 * Here, offset=16bits, so parameters passed A1 to A4 should be simple. 1955 * 1956 * @param NAME Either 1 or 3 1957 * @see MSPEL_FILTER13_CORE for information on A1->A4 1958 */ 1959#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 1960static void \ 1961OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \ 1962 const int16_t *src, int rnd) \ 1963{ \ 1964 int h = 8; \ 1965 DECLARE_VAR_ALL64; \ 1966 DECLARE_VAR_ADDRT; \ 1967 \ 1968 src -= 1; \ 1969 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ 1970 \ 1971 __asm__ volatile( \ 1972 "pxor $f0, $f0, $f0 \n\t" \ 1973 LOAD_ROUNDER_MMI("%[rnd]") \ 1974 ".p2align 3 \n\t" \ 1975 "1: \n\t" \ 1976 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \ 1977 "li $8, 0x07 \n\t" \ 1978 "mtc1 $8, $f16 \n\t" \ 1979 NORMALIZE_MMI("$f16") \ 1980 /* Remove bias */ \ 1981 "paddh $f6, $f6, %[ff_pw_128] \n\t" \ 1982 "paddh $f8, $f8, %[ff_pw_128] \n\t" \ 1983 TRANSFER_DO_PACK(OP) \ 1984 "addiu %[h], %[h], -0x01 \n\t" \ 1985 PTR_ADDU "%[src], %[src], 0x18 \n\t" \ 1986 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1987 "bnez %[h], 1b \n\t" \ 1988 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1989 [h]"+r"(h), \ 1990 [src]"+r"(src), [dst]"+r"(dst) \ 1991 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1992 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \ 1993 [ff_pw_3]"f"(ff_pw_3.f), [ff_pw_128]"f"(ff_pw_128.f) \ 1994 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \ 1995 "$f14", "$f16", "memory" \ 1996 ); \ 1997} 1998 1999/** 2000 * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. 2001 * Here, offset=src_stride. Parameters passed A1 to A4 must use 2002 * %3 (offset), %4 (2*offset) and %5 (3*offset). 2003 * 2004 * @param NAME Either 1 or 3 2005 * @see MSPEL_FILTER13_CORE for information on A1->A4 2006 */ 2007#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 2008static void \ 2009OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \ 2010 mips_reg stride, int rnd, mips_reg offset) \ 2011{ \ 2012 int h = 8; \ 2013 DECLARE_VAR_LOW32; \ 2014 DECLARE_VAR_ADDRT; \ 2015 \ 2016 src -= offset; \ 2017 rnd = 32-rnd; \ 2018 \ 2019 __asm__ volatile ( \ 2020 "pxor $f0, $f0, $f0 \n\t" \ 2021 LOAD_ROUNDER_MMI("%[rnd]") \ 2022 ".p2align 3 \n\t" \ 2023 "1: \n\t" \ 2024 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 2025 "li $8, 0x06 \n\t" \ 2026 "mtc1 $8, $f16 \n\t" \ 2027 NORMALIZE_MMI("$f16") \ 2028 TRANSFER_DO_PACK(OP) \ 2029 "addiu %[h], %[h], -0x01 \n\t" \ 2030 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 2031 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 2032 "bnez %[h], 1b \n\t" \ 2033 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 2034 [h]"+r"(h), \ 2035 [src]"+r"(src), [dst]"+r"(dst) \ 2036 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \ 2037 [offset_x3]"r"(3*offset), [stride]"r"(stride), \ 2038 [rnd]"m"(rnd), \ 2039 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \ 2040 [ff_pw_3]"f"(ff_pw_3.f) \ 2041 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \ 2042 "$f14", "$f16", "memory" \ 2043 ); \ 2044} 2045 2046 2047/** 1/4 shift bicubic interpolation */ 2048MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_) 2049MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_) 2050MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0) 2051MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_) 2052MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_) 2053 2054/** 3/4 shift bicubic interpolation */ 2055MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_) 2056MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_) 2057MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3]) 2058MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_) 2059MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_) 2060 2061typedef void (*vc1_mspel_mc_filter_ver_16bits) 2062 (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd, 2063 int64_t shift); 2064typedef void (*vc1_mspel_mc_filter_hor_16bits) 2065 (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd); 2066typedef void (*vc1_mspel_mc_filter_8bits) 2067 (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd, 2068 mips_reg offset); 2069 2070/** 2071 * Interpolate fractional pel values by applying proper vertical then 2072 * horizontal filter. 2073 * 2074 * @param dst Destination buffer for interpolated pels. 2075 * @param src Source buffer. 2076 * @param stride Stride for both src and dst buffers. 2077 * @param hmode Horizontal filter (expressed in quarter pixels shift). 2078 * @param hmode Vertical filter. 2079 * @param rnd Rounding bias. 2080 */ 2081#define VC1_MSPEL_MC(OP) \ 2082static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ 2083 int hmode, int vmode, int rnd) \ 2084{ \ 2085 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ 2086 { NULL, vc1_put_ver_16b_shift1_mmi, \ 2087 vc1_put_ver_16b_shift2_mmi, \ 2088 vc1_put_ver_16b_shift3_mmi }; \ 2089 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ 2090 { NULL, OP ## vc1_hor_16b_shift1_mmi, \ 2091 OP ## vc1_hor_16b_shift2_mmi, \ 2092 OP ## vc1_hor_16b_shift3_mmi }; \ 2093 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \ 2094 { NULL, OP ## vc1_shift1_mmi, \ 2095 OP ## vc1_shift2_mmi, \ 2096 OP ## vc1_shift3_mmi }; \ 2097 \ 2098 if (vmode) { /* Vertical filter to apply */ \ 2099 if (hmode) { /* Horizontal filter to apply, output to tmp */ \ 2100 static const int shift_value[] = { 0, 5, 1, 5 }; \ 2101 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \ 2102 int r; \ 2103 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \ 2104 \ 2105 r = (1<<(shift-1)) + rnd-1; \ 2106 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \ 2107 \ 2108 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \ 2109 return; \ 2110 } \ 2111 else { /* No horizontal filter, output 8 lines to dst */ \ 2112 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \ 2113 return; \ 2114 } \ 2115 } \ 2116 \ 2117 /* Horizontal mode with no vertical mode */ \ 2118 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \ 2119} \ 2120static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ 2121 int stride, int hmode, int vmode, int rnd)\ 2122{ \ 2123 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2124 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2125 dst += 8*stride; src += 8*stride; \ 2126 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2127 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2128} 2129 2130VC1_MSPEL_MC(put_) 2131VC1_MSPEL_MC(avg_) 2132 2133/** Macro to ease bicubic filter interpolation functions declarations */ 2134#define DECLARE_FUNCTION(a, b) \ 2135void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2136 const uint8_t *src, \ 2137 ptrdiff_t stride, \ 2138 int rnd) \ 2139{ \ 2140 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2141} \ 2142void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2143 const uint8_t *src, \ 2144 ptrdiff_t stride, \ 2145 int rnd) \ 2146{ \ 2147 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2148} \ 2149void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \ 2150 const uint8_t *src, \ 2151 ptrdiff_t stride, \ 2152 int rnd) \ 2153{ \ 2154 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2155} \ 2156void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \ 2157 const uint8_t *src, \ 2158 ptrdiff_t stride, \ 2159 int rnd) \ 2160{ \ 2161 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2162} 2163 2164DECLARE_FUNCTION(0, 1) 2165DECLARE_FUNCTION(0, 2) 2166DECLARE_FUNCTION(0, 3) 2167 2168DECLARE_FUNCTION(1, 0) 2169DECLARE_FUNCTION(1, 1) 2170DECLARE_FUNCTION(1, 2) 2171DECLARE_FUNCTION(1, 3) 2172 2173DECLARE_FUNCTION(2, 0) 2174DECLARE_FUNCTION(2, 1) 2175DECLARE_FUNCTION(2, 2) 2176DECLARE_FUNCTION(2, 3) 2177 2178DECLARE_FUNCTION(3, 0) 2179DECLARE_FUNCTION(3, 1) 2180DECLARE_FUNCTION(3, 2) 2181DECLARE_FUNCTION(3, 3) 2182 2183#define CHROMA_MC_8_MMI \ 2184 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 2185 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2186 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ 2187 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2188 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 2189 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2190 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 2191 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2192 \ 2193 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2194 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \ 2195 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2196 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \ 2197 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \ 2198 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \ 2199 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2200 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \ 2201 \ 2202 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2203 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2204 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2205 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2206 \ 2207 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 2208 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \ 2209 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 2210 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \ 2211 \ 2212 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ 2213 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 2214 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 2215 2216 2217#define CHROMA_MC_4_MMI \ 2218 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2219 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2220 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2221 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2222 \ 2223 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2224 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2225 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \ 2226 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2227 \ 2228 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2229 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2230 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2231 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2232 \ 2233 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \ 2234 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 2235 2236 2237void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, 2238 uint8_t *src /* align 1 */, 2239 ptrdiff_t stride, int h, int x, int y) 2240{ 2241 union mmi_intfloat64 A, B, C, D; 2242 double ftmp[10]; 2243 uint32_t tmp[1]; 2244 DECLARE_VAR_ALL64; 2245 DECLARE_VAR_ADDRT; 2246 A.i = (8 - x) * (8 - y); 2247 B.i = (x) * (8 - y); 2248 C.i = (8 - x) * (y); 2249 D.i = (x) * (y); 2250 2251 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 2252 2253 __asm__ volatile( 2254 "li %[tmp0], 0x06 \n\t" 2255 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2256 "mtc1 %[tmp0], %[ftmp9] \n\t" 2257 "pshufh %[A], %[A], %[ftmp0] \n\t" 2258 "pshufh %[B], %[B], %[ftmp0] \n\t" 2259 "pshufh %[C], %[C], %[ftmp0] \n\t" 2260 "pshufh %[D], %[D], %[ftmp0] \n\t" 2261 2262 "1: \n\t" 2263 MMI_ULDC1(%[ftmp1], %[src], 0x00) 2264 MMI_ULDC1(%[ftmp2], %[src], 0x01) 2265 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2266 MMI_ULDC1(%[ftmp3], %[src], 0x00) 2267 MMI_ULDC1(%[ftmp4], %[src], 0x01) 2268 2269 CHROMA_MC_8_MMI 2270 2271 MMI_SDC1(%[ftmp1], %[dst], 0x00) 2272 "addiu %[h], %[h], -0x01 \n\t" 2273 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2274 "bnez %[h], 1b \n\t" 2275 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2276 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2277 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2278 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2279 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 2280 RESTRICT_ASM_ALL64 2281 RESTRICT_ASM_ADDRT 2282 [tmp0]"=&r"(tmp[0]), 2283 [src]"+&r"(src), [dst]"+&r"(dst), 2284 [h]"+&r"(h) 2285 : [stride]"r"((mips_reg)stride), 2286 [A]"f"(A.f), [B]"f"(B.f), 2287 [C]"f"(C.f), [D]"f"(D.f), 2288 [ff_pw_28]"f"(ff_pw_28.f) 2289 : "memory" 2290 ); 2291} 2292 2293void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, 2294 uint8_t *src /* align 1 */, 2295 ptrdiff_t stride, int h, int x, int y) 2296{ 2297 union mmi_intfloat64 A, B, C, D; 2298 double ftmp[6]; 2299 uint32_t tmp[1]; 2300 DECLARE_VAR_LOW32; 2301 DECLARE_VAR_ADDRT; 2302 A.i = (8 - x) * (8 - y); 2303 B.i = (x) * (8 - y); 2304 C.i = (8 - x) * (y); 2305 D.i = (x) * (y); 2306 2307 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 2308 2309 __asm__ volatile( 2310 "li %[tmp0], 0x06 \n\t" 2311 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2312 "mtc1 %[tmp0], %[ftmp5] \n\t" 2313 "pshufh %[A], %[A], %[ftmp0] \n\t" 2314 "pshufh %[B], %[B], %[ftmp0] \n\t" 2315 "pshufh %[C], %[C], %[ftmp0] \n\t" 2316 "pshufh %[D], %[D], %[ftmp0] \n\t" 2317 2318 "1: \n\t" 2319 MMI_ULWC1(%[ftmp1], %[src], 0x00) 2320 MMI_ULWC1(%[ftmp2], %[src], 0x01) 2321 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2322 MMI_ULWC1(%[ftmp3], %[src], 0x00) 2323 MMI_ULWC1(%[ftmp4], %[src], 0x01) 2324 2325 CHROMA_MC_4_MMI 2326 2327 MMI_SWC1(%[ftmp1], %[dst], 0x00) 2328 "addiu %[h], %[h], -0x01 \n\t" 2329 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2330 "bnez %[h], 1b \n\t" 2331 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2332 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2333 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2334 [tmp0]"=&r"(tmp[0]), 2335 RESTRICT_ASM_LOW32 2336 RESTRICT_ASM_ADDRT 2337 [src]"+&r"(src), [dst]"+&r"(dst), 2338 [h]"+&r"(h) 2339 : [stride]"r"((mips_reg)stride), 2340 [A]"f"(A.f), [B]"f"(B.f), 2341 [C]"f"(C.f), [D]"f"(D.f), 2342 [ff_pw_28]"f"(ff_pw_28.f) 2343 : "memory" 2344 ); 2345} 2346 2347void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, 2348 uint8_t *src /* align 1 */, 2349 ptrdiff_t stride, int h, int x, int y) 2350{ 2351 union mmi_intfloat64 A, B, C, D; 2352 double ftmp[10]; 2353 uint32_t tmp[1]; 2354 DECLARE_VAR_ALL64; 2355 DECLARE_VAR_ADDRT; 2356 A.i = (8 - x) * (8 - y); 2357 B.i = (x) * (8 - y); 2358 C.i = (8 - x) * (y); 2359 D.i = (x) * (y); 2360 2361 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 2362 2363 __asm__ volatile( 2364 "li %[tmp0], 0x06 \n\t" 2365 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2366 "mtc1 %[tmp0], %[ftmp9] \n\t" 2367 "pshufh %[A], %[A], %[ftmp0] \n\t" 2368 "pshufh %[B], %[B], %[ftmp0] \n\t" 2369 "pshufh %[C], %[C], %[ftmp0] \n\t" 2370 "pshufh %[D], %[D], %[ftmp0] \n\t" 2371 2372 "1: \n\t" 2373 MMI_ULDC1(%[ftmp1], %[src], 0x00) 2374 MMI_ULDC1(%[ftmp2], %[src], 0x01) 2375 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2376 MMI_ULDC1(%[ftmp3], %[src], 0x00) 2377 MMI_ULDC1(%[ftmp4], %[src], 0x01) 2378 2379 CHROMA_MC_8_MMI 2380 2381 MMI_LDC1(%[ftmp2], %[dst], 0x00) 2382 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2383 2384 MMI_SDC1(%[ftmp1], %[dst], 0x00) 2385 "addiu %[h], %[h], -0x01 \n\t" 2386 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2387 "bnez %[h], 1b \n\t" 2388 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2389 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2390 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2391 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 2392 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 2393 [tmp0]"=&r"(tmp[0]), 2394 RESTRICT_ASM_ALL64 2395 RESTRICT_ASM_ADDRT 2396 [src]"+&r"(src), [dst]"+&r"(dst), 2397 [h]"+&r"(h) 2398 : [stride]"r"((mips_reg)stride), 2399 [A]"f"(A.f), [B]"f"(B.f), 2400 [C]"f"(C.f), [D]"f"(D.f), 2401 [ff_pw_28]"f"(ff_pw_28.f) 2402 : "memory" 2403 ); 2404} 2405 2406void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, 2407 uint8_t *src /* align 1 */, 2408 ptrdiff_t stride, int h, int x, int y) 2409{ 2410 union mmi_intfloat64 A, B, C, D; 2411 double ftmp[6]; 2412 uint32_t tmp[1]; 2413 DECLARE_VAR_LOW32; 2414 DECLARE_VAR_ADDRT; 2415 A.i = (8 - x) * (8 - y); 2416 B.i = (x) * (8 - y); 2417 C.i = (8 - x) * (y); 2418 D.i = (x) * (y); 2419 2420 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0); 2421 2422 __asm__ volatile( 2423 "li %[tmp0], 0x06 \n\t" 2424 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2425 "mtc1 %[tmp0], %[ftmp5] \n\t" 2426 "pshufh %[A], %[A], %[ftmp0] \n\t" 2427 "pshufh %[B], %[B], %[ftmp0] \n\t" 2428 "pshufh %[C], %[C], %[ftmp0] \n\t" 2429 "pshufh %[D], %[D], %[ftmp0] \n\t" 2430 2431 "1: \n\t" 2432 MMI_ULWC1(%[ftmp1], %[src], 0x00) 2433 MMI_ULWC1(%[ftmp2], %[src], 0x01) 2434 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2435 MMI_ULWC1(%[ftmp3], %[src], 0x00) 2436 MMI_ULWC1(%[ftmp4], %[src], 0x01) 2437 2438 CHROMA_MC_4_MMI 2439 2440 MMI_LWC1(%[ftmp2], %[dst], 0x00) 2441 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2442 2443 MMI_SWC1(%[ftmp1], %[dst], 0x00) 2444 "addiu %[h], %[h], -0x01 \n\t" 2445 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2446 "bnez %[h], 1b \n\t" 2447 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 2448 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 2449 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 2450 [tmp0]"=&r"(tmp[0]), 2451 RESTRICT_ASM_LOW32 2452 RESTRICT_ASM_ADDRT 2453 [src]"+&r"(src), [dst]"+&r"(dst), 2454 [h]"+&r"(h) 2455 : [stride]"r"((mips_reg)stride), 2456 [A]"f"(A.f), [B]"f"(B.f), 2457 [C]"f"(C.f), [D]"f"(D.f), 2458 [ff_pw_28]"f"(ff_pw_28.f) 2459 : "memory" 2460 ); 2461} 2462