1/* 2 * WMV2 - DSP functions Loongson MMI-optimized 3 * 4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23#include "libavutil/avassert.h" 24#include "constants.h" 25#include "wmv2dsp_mips.h" 26#include "libavutil/mips/mmiutils.h" 27 28#define W0 2048 29#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ 30#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ 31#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ 32#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ 33#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ 34#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ 35#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ 36 37static void wmv2_idct_row_mmi(short * b) 38{ 39 int s1, s2; 40 int a0, a1, a2, a3, a4, a5, a6, a7; 41 42 /* step 1 */ 43 a0 = W0 * b[0] + W0 * b[4]; 44 a1 = W1 * b[1] + W7 * b[7]; 45 a2 = W2 * b[2] + W6 * b[6]; 46 a3 = W3 * b[5] - W5 * b[3]; 47 a4 = W0 * b[0] - W0 * b[4]; 48 a5 = W5 * b[5] + W3 * b[3]; 49 a6 = W6 * b[2] - W2 * b[6]; 50 a7 = W7 * b[1] - W1 * b[7]; 51 52 /* step 2 */ 53 s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8; // 1, 3, 5, 7 54 s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8; 55 56 /* step 3 */ 57 b[0] = (a0 + a2 + a1 + a5 + 128) >> 8; 58 b[1] = (a4 + a6 + s1 + 128) >> 8; 59 b[2] = (a4 - a6 + s2 + 128) >> 8; 60 b[3] = (a0 - a2 + a7 + a3 + 128) >> 8; 61 b[4] = (a0 - a2 - a7 - a3 + 128) >> 8; 62 b[5] = (a4 - a6 - s2 + 128) >> 8; 63 b[6] = (a4 + a6 - s1 + 128) >> 8; 64 b[7] = (a0 + a2 - a1 - a5 + 128) >> 8; 65} 66 67static void wmv2_idct_col_mmi(short * b) 68{ 69 int s1, s2; 70 int a0, a1, a2, a3, a4, a5, a6, a7; 71 72 /* step 1, with extended precision */ 73 a0 = (W0 * b[ 0] + W0 * b[32] ) >> 3; 74 a1 = (W1 * b[ 8] + W7 * b[56] + 4) >> 3; 75 a2 = (W2 * b[16] + W6 * b[48] + 4) >> 3; 76 a3 = (W3 * b[40] - W5 * b[24] + 4) >> 3; 77 a4 = (W0 * b[ 0] - W0 * b[32] ) >> 3; 78 a5 = (W5 * b[40] + W3 * b[24] + 4) >> 3; 79 a6 = (W6 * b[16] - W2 * b[48] + 4) >> 3; 80 a7 = (W7 * b[ 8] - W1 * b[56] + 4) >> 3; 81 82 /* step 2 */ 83 s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8; 84 s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8; 85 86 /* step 3 */ 87 b[ 0] = (a0 + a2 + a1 + a5 + 8192) >> 14; 88 b[ 8] = (a4 + a6 + s1 + 8192) >> 14; 89 b[16] = (a4 - a6 + s2 + 8192) >> 14; 90 b[24] = (a0 - a2 + a7 + a3 + 8192) >> 14; 91 92 b[32] = (a0 - a2 - a7 - a3 + 8192) >> 14; 93 b[40] = (a4 - a6 - s2 + 8192) >> 14; 94 b[48] = (a4 + a6 - s1 + 8192) >> 14; 95 b[56] = (a0 + a2 - a1 - a5 + 8192) >> 14; 96} 97 98void ff_wmv2_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 99{ 100 int i; 101 double ftmp[11]; 102 103 for (i = 0; i < 64; i += 8) 104 wmv2_idct_row_mmi(block + i); 105 for (i = 0; i < 8; i++) 106 wmv2_idct_col_mmi(block + i); 107 108 __asm__ volatile ( 109 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 110 111 // low 4 loop 112 MMI_LDC1(%[ftmp1], %[block], 0x00) 113 MMI_LDC1(%[ftmp2], %[block], 0x08) 114 MMI_LDC1(%[ftmp3], %[block], 0x10) 115 MMI_LDC1(%[ftmp4], %[block], 0x18) 116 MMI_LDC1(%[ftmp5], %[block], 0x20) 117 MMI_LDC1(%[ftmp6], %[block], 0x28) 118 MMI_LDC1(%[ftmp7], %[block], 0x30) 119 MMI_LDC1(%[ftmp8], %[block], 0x38) 120 121 MMI_LDC1(%[ftmp9], %[dest], 0x00) 122 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 123 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 124 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 125 "paddh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" 126 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 127 MMI_SDC1(%[ftmp1], %[dest], 0x00) 128 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 129 130 MMI_LDC1(%[ftmp9], %[dest], 0x00) 131 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 132 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 133 "paddh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" 134 "paddh %[ftmp4], %[ftmp4], %[ftmp10] \n\t" 135 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 136 MMI_SDC1(%[ftmp3], %[dest], 0x00) 137 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 138 139 MMI_LDC1(%[ftmp9], %[dest], 0x00) 140 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 141 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 142 "paddh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" 143 "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" 144 "packushb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" 145 MMI_SDC1(%[ftmp5], %[dest], 0x00) 146 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 147 148 MMI_LDC1(%[ftmp9], %[dest], 0x00) 149 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 150 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 151 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" 152 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" 153 "packushb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" 154 MMI_SDC1(%[ftmp7], %[dest], 0x00) 155 156 PTR_ADDIU "%[block], %[block], 0x40 \n\t" 157 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 158 159 // high 4 loop 160 MMI_LDC1(%[ftmp1], %[block], 0x00) 161 MMI_LDC1(%[ftmp2], %[block], 0x08) 162 MMI_LDC1(%[ftmp3], %[block], 0x10) 163 MMI_LDC1(%[ftmp4], %[block], 0x18) 164 MMI_LDC1(%[ftmp5], %[block], 0x20) 165 MMI_LDC1(%[ftmp6], %[block], 0x28) 166 MMI_LDC1(%[ftmp7], %[block], 0x30) 167 MMI_LDC1(%[ftmp8], %[block], 0x38) 168 169 MMI_LDC1(%[ftmp9], %[dest], 0x00) 170 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 171 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 172 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 173 "paddh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" 174 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 175 MMI_SDC1(%[ftmp1], %[dest], 0x00) 176 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 177 178 MMI_LDC1(%[ftmp9], %[dest], 0x00) 179 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 180 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 181 "paddh %[ftmp3], %[ftmp3], %[ftmp9] \n\t" 182 "paddh %[ftmp4], %[ftmp4], %[ftmp10] \n\t" 183 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 184 MMI_SDC1(%[ftmp3], %[dest], 0x00) 185 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 186 187 MMI_LDC1(%[ftmp9], %[dest], 0x00) 188 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 189 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 190 "paddh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" 191 "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t" 192 "packushb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" 193 MMI_SDC1(%[ftmp5], %[dest], 0x00) 194 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 195 196 MMI_LDC1(%[ftmp9], %[dest], 0x00) 197 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t" 198 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 199 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t" 200 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" 201 "packushb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" 202 MMI_SDC1(%[ftmp7], %[dest], 0x00) 203 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 204 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 205 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 206 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 207 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 208 [ftmp10]"=&f"(ftmp[10]), 209 [block]"+&r"(block), [dest]"+&r"(dest) 210 : [line_size]"r"((mips_reg)line_size) 211 : "memory" 212 ); 213} 214 215void ff_wmv2_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block) 216{ 217 int i; 218 double ftmp[8]; 219 220 for (i = 0; i < 64; i += 8) 221 wmv2_idct_row_mmi(block + i); 222 for (i = 0; i < 8; i++) 223 wmv2_idct_col_mmi(block + i); 224 225 __asm__ volatile ( 226 // low 4 loop 227 MMI_LDC1(%[ftmp0], %[block], 0x00) 228 MMI_LDC1(%[ftmp1], %[block], 0x08) 229 MMI_LDC1(%[ftmp2], %[block], 0x10) 230 MMI_LDC1(%[ftmp3], %[block], 0x18) 231 MMI_LDC1(%[ftmp4], %[block], 0x20) 232 MMI_LDC1(%[ftmp5], %[block], 0x28) 233 MMI_LDC1(%[ftmp6], %[block], 0x30) 234 MMI_LDC1(%[ftmp7], %[block], 0x38) 235 "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" 236 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" 237 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" 238 "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" 239 MMI_SDC1(%[ftmp0], %[dest], 0x00) 240 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 241 MMI_SDC1(%[ftmp2], %[dest], 0x00) 242 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 243 MMI_SDC1(%[ftmp4], %[dest], 0x00) 244 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 245 MMI_SDC1(%[ftmp6], %[dest], 0x00) 246 247 PTR_ADDIU "%[block], %[block], 0x40 \n\t" 248 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 249 250 // high 4 loop 251 MMI_LDC1(%[ftmp0], %[block], 0x00) 252 MMI_LDC1(%[ftmp1], %[block], 0x08) 253 MMI_LDC1(%[ftmp2], %[block], 0x10) 254 MMI_LDC1(%[ftmp3], %[block], 0x18) 255 MMI_LDC1(%[ftmp4], %[block], 0x20) 256 MMI_LDC1(%[ftmp5], %[block], 0x28) 257 MMI_LDC1(%[ftmp6], %[block], 0x30) 258 MMI_LDC1(%[ftmp7], %[block], 0x38) 259 "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" 260 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" 261 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" 262 "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" 263 MMI_SDC1(%[ftmp0], %[dest], 0x00) 264 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 265 MMI_SDC1(%[ftmp2], %[dest], 0x00) 266 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 267 MMI_SDC1(%[ftmp4], %[dest], 0x00) 268 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t" 269 MMI_SDC1(%[ftmp6], %[dest], 0x00) 270 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 271 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 272 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 273 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 274 [block]"+&r"(block), [dest]"+&r"(dest) 275 : [line_size]"r"((mips_reg)line_size) 276 : "memory" 277 ); 278} 279