1 /* 2 * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #include "libavcodec/hevcdec.h" 22 #include "libavcodec/bit_depth_template.c" 23 #include "libavcodec/mips/hevcdsp_mips.h" 24 #include "libavutil/mips/mmiutils.h" 25 26 #define PUT_HEVC_QPEL_H(w, x_step, src_step, dst_step) \ 27 void ff_hevc_put_hevc_qpel_h##w##_8_mmi(int16_t *dst, uint8_t *_src, \ 28 ptrdiff_t _srcstride, \ 29 int height, intptr_t mx, \ 30 intptr_t my, int width) \ 31 { \ 32 int x, y; \ 33 pixel *src = (pixel*)_src - 3; \ 34 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 35 double ftmp[15]; \ 36 uint64_t rtmp[1]; \ 37 const int8_t *filter = ff_hevc_qpel_filters[mx - 1]; \ 38 DECLARE_VAR_ALL64; \ 39 \ 40 x = x_step; \ 41 y = height; \ 42 __asm__ volatile( \ 43 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 44 "li %[rtmp0], 0x08 \n\t" \ 45 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 46 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 47 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 48 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 49 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 50 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 51 \ 52 "1: \n\t" \ 53 "2: \n\t" \ 54 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 55 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 56 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 57 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 58 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 59 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 60 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 61 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 62 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \ 63 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 64 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 65 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 66 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 67 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \ 68 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 69 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 70 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 71 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 72 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \ 73 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 74 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 75 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 76 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 77 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \ 78 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 79 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 80 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 81 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 82 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 83 MMI_ULDC1(%[ftmp3], %[dst], 0x00) \ 84 \ 85 "daddi %[x], %[x], -0x01 \n\t" \ 86 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 87 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \ 88 "bnez %[x], 2b \n\t" \ 89 \ 90 "daddi %[y], %[y], -0x01 \n\t" \ 91 "li %[x], " #x_step " \n\t" \ 92 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 93 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 94 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 95 PTR_ADDIU "%[dst], %[dst], 0x80 \n\t" \ 96 "bnez %[y], 1b \n\t" \ 97 : RESTRICT_ASM_ALL64 \ 98 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 99 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 100 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 101 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 102 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 103 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \ 104 [src]"+&r"(src), [dst]"+&r"(dst), [y]"+&r"(y), \ 105 [x]"+&r"(x) \ 106 : [filter]"r"(filter), [stride]"r"(srcstride) \ 107 : "memory" \ 108 ); \ 109 } 110 111 PUT_HEVC_QPEL_H(4, 1, -4, -8); 112 PUT_HEVC_QPEL_H(8, 2, -8, -16); 113 PUT_HEVC_QPEL_H(12, 3, -12, -24); 114 PUT_HEVC_QPEL_H(16, 4, -16, -32); 115 PUT_HEVC_QPEL_H(24, 6, -24, -48); 116 PUT_HEVC_QPEL_H(32, 8, -32, -64); 117 PUT_HEVC_QPEL_H(48, 12, -48, -96); 118 PUT_HEVC_QPEL_H(64, 16, -64, -128); 119 120 #define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step) \ 121 void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src, \ 122 ptrdiff_t _srcstride, \ 123 int height, intptr_t mx, \ 124 intptr_t my, int width) \ 125 { \ 126 int x, y; \ 127 const int8_t *filter; \ 128 pixel *src = (pixel*)_src; \ 129 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 130 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \ 131 int16_t *tmp = tmp_array; \ 132 double ftmp[15]; \ 133 uint64_t rtmp[1]; \ 134 DECLARE_VAR_ALL64; \ 135 \ 136 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \ 137 filter = ff_hevc_qpel_filters[mx - 1]; \ 138 x = x_step; \ 139 y = height + QPEL_EXTRA; \ 140 __asm__ volatile( \ 141 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 142 "li %[rtmp0], 0x08 \n\t" \ 143 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 144 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 145 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 146 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 147 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 148 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 149 \ 150 "1: \n\t" \ 151 "2: \n\t" \ 152 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 153 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 154 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 155 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 156 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 157 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 158 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 159 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 160 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \ 161 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 162 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 163 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 164 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 165 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \ 166 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 167 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 168 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 169 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 170 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \ 171 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 172 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 173 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 174 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 175 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \ 176 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 177 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 178 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 179 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 180 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 181 MMI_ULDC1(%[ftmp3], %[tmp], 0x00) \ 182 \ 183 "daddi %[x], %[x], -0x01 \n\t" \ 184 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 185 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 186 "bnez %[x], 2b \n\t" \ 187 \ 188 "daddi %[y], %[y], -0x01 \n\t" \ 189 "li %[x], " #x_step " \n\t" \ 190 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 191 PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \ 192 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 193 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 194 "bnez %[y], 1b \n\t" \ 195 : RESTRICT_ASM_ALL64 \ 196 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 197 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 198 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 199 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 200 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 201 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \ 202 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \ 203 [x]"+&r"(x) \ 204 : [filter]"r"(filter), [stride]"r"(srcstride) \ 205 : "memory" \ 206 ); \ 207 \ 208 tmp = tmp_array + QPEL_EXTRA_BEFORE * 4 -12; \ 209 filter = ff_hevc_qpel_filters[my - 1]; \ 210 x = x_step; \ 211 y = height; \ 212 __asm__ volatile( \ 213 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 214 "li %[rtmp0], 0x08 \n\t" \ 215 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 216 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 217 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 218 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 219 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 220 "li %[rtmp0], 0x06 \n\t" \ 221 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 222 \ 223 "1: \n\t" \ 224 "2: \n\t" \ 225 MMI_ULDC1(%[ftmp3], %[tmp], 0x00) \ 226 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 227 MMI_ULDC1(%[ftmp4], %[tmp], 0x00) \ 228 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 229 MMI_ULDC1(%[ftmp5], %[tmp], 0x00) \ 230 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 231 MMI_ULDC1(%[ftmp6], %[tmp], 0x00) \ 232 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 233 MMI_ULDC1(%[ftmp7], %[tmp], 0x00) \ 234 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 235 MMI_ULDC1(%[ftmp8], %[tmp], 0x00) \ 236 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 237 MMI_ULDC1(%[ftmp9], %[tmp], 0x00) \ 238 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 239 MMI_ULDC1(%[ftmp10], %[tmp], 0x00) \ 240 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \ 241 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 242 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 243 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \ 244 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 245 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \ 246 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \ 247 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \ 248 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \ 249 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 250 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 251 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \ 252 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 253 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 254 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \ 255 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \ 256 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \ 257 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \ 258 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 259 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 260 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \ 261 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 262 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 263 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 264 MMI_USDC1(%[ftmp3], %[dst], 0x00) \ 265 \ 266 "daddi %[x], %[x], -0x01 \n\t" \ 267 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \ 268 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 269 "bnez %[x], 2b \n\t" \ 270 \ 271 "daddi %[y], %[y], -0x01 \n\t" \ 272 "li %[x], " #x_step " \n\t" \ 273 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 274 PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \ 275 PTR_ADDIU "%[dst], %[dst], 0x80 \n\t" \ 276 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 277 "bnez %[y], 1b \n\t" \ 278 : RESTRICT_ASM_ALL64 \ 279 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 280 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 281 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 282 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 283 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 284 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \ 285 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \ 286 [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]), \ 287 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), \ 288 [x]"+&r"(x) \ 289 : [filter]"r"(filter), [stride]"r"(srcstride) \ 290 : "memory" \ 291 ); \ 292 } 293 294 PUT_HEVC_QPEL_HV(4, 1, -4, -8); 295 PUT_HEVC_QPEL_HV(8, 2, -8, -16); 296 PUT_HEVC_QPEL_HV(12, 3, -12, -24); 297 PUT_HEVC_QPEL_HV(16, 4, -16, -32); 298 PUT_HEVC_QPEL_HV(24, 6, -24, -48); 299 PUT_HEVC_QPEL_HV(32, 8, -32, -64); 300 PUT_HEVC_QPEL_HV(48, 12, -48, -96); 301 PUT_HEVC_QPEL_HV(64, 16, -64, -128); 302 303 #define PUT_HEVC_QPEL_BI_H(w, x_step, src_step, src2_step, dst_step) \ 304 void ff_hevc_put_hevc_qpel_bi_h##w##_8_mmi(uint8_t *_dst, \ 305 ptrdiff_t _dststride, \ 306 uint8_t *_src, \ 307 ptrdiff_t _srcstride, \ 308 int16_t *src2, int height, \ 309 intptr_t mx, intptr_t my, \ 310 int width) \ 311 { \ 312 int x, y; \ 313 pixel *src = (pixel*)_src - 3; \ 314 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 315 pixel *dst = (pixel *)_dst; \ 316 ptrdiff_t dststride = _dststride / sizeof(pixel); \ 317 const int8_t *filter = ff_hevc_qpel_filters[mx - 1]; \ 318 double ftmp[20]; \ 319 uint64_t rtmp[1]; \ 320 union av_intfloat64 shift; \ 321 union av_intfloat64 offset; \ 322 DECLARE_VAR_ALL64; \ 323 DECLARE_VAR_LOW32; \ 324 shift.i = 7; \ 325 offset.i = 64; \ 326 \ 327 x = width >> 2; \ 328 y = height; \ 329 __asm__ volatile( \ 330 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 331 "li %[rtmp0], 0x08 \n\t" \ 332 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 333 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 334 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 335 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 336 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 337 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 338 "punpcklhw %[offset], %[offset], %[offset] \n\t" \ 339 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 340 \ 341 "1: \n\t" \ 342 "li %[x], " #x_step " \n\t" \ 343 "2: \n\t" \ 344 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 345 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 346 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 347 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 348 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 349 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 350 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 351 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 352 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \ 353 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 354 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 355 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 356 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 357 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \ 358 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 359 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 360 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 361 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 362 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \ 363 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 364 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 365 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 366 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 367 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \ 368 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 369 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 370 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 371 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 372 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 373 "paddh %[ftmp3], %[ftmp3], %[offset] \n\t" \ 374 MMI_ULDC1(%[ftmp4], %[src2], 0x00) \ 375 "li %[rtmp0], 0x10 \n\t" \ 376 "dmtc1 %[rtmp0], %[ftmp8] \n\t" \ 377 "punpcklhw %[ftmp5], %[ftmp0], %[ftmp3] \n\t" \ 378 "punpckhhw %[ftmp6], %[ftmp0], %[ftmp3] \n\t" \ 379 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp4] \n\t" \ 380 "punpcklhw %[ftmp4], %[ftmp0], %[ftmp4] \n\t" \ 381 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \ 382 "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 383 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \ 384 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \ 385 "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 386 "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 387 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \ 388 "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \ 389 "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 390 "pcmpgth %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 391 "pand %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \ 392 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \ 393 MMI_USWC1(%[ftmp3], %[dst], 0x00) \ 394 \ 395 "daddi %[x], %[x], -0x01 \n\t" \ 396 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 397 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \ 398 PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \ 399 "bnez %[x], 2b \n\t" \ 400 \ 401 "daddi %[y], %[y], -0x01 \n\t" \ 402 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 403 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 404 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \ 405 PTR_ADDU "%[src], %[src], %[src_stride] \n\t" \ 406 PTR_ADDU "%[dst], %[dst], %[dst_stride] \n\t" \ 407 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \ 408 "bnez %[y], 1b \n\t" \ 409 : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32 \ 410 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 411 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 412 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 413 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 414 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 415 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \ 416 [ftmp12]"=&f"(ftmp[12]), [src2]"+&r"(src2), \ 417 [dst]"+&r"(dst), [src]"+&r"(src), [y]"+&r"(y), [x]"=&r"(x), \ 418 [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0]) \ 419 : [src_stride]"r"(srcstride), [dst_stride]"r"(dststride), \ 420 [filter]"r"(filter), [shift]"f"(shift.f) \ 421 : "memory" \ 422 ); \ 423 } 424 425 PUT_HEVC_QPEL_BI_H(4, 1, -4, -8, -4); 426 PUT_HEVC_QPEL_BI_H(8, 2, -8, -16, -8); 427 PUT_HEVC_QPEL_BI_H(12, 3, -12, -24, -12); 428 PUT_HEVC_QPEL_BI_H(16, 4, -16, -32, -16); 429 PUT_HEVC_QPEL_BI_H(24, 6, -24, -48, -24); 430 PUT_HEVC_QPEL_BI_H(32, 8, -32, -64, -32); 431 PUT_HEVC_QPEL_BI_H(48, 12, -48, -96, -48); 432 PUT_HEVC_QPEL_BI_H(64, 16, -64, -128, -64); 433 434 #define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \ 435 void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst, \ 436 ptrdiff_t _dststride, \ 437 uint8_t *_src, \ 438 ptrdiff_t _srcstride, \ 439 int16_t *src2, int height, \ 440 intptr_t mx, intptr_t my, \ 441 int width) \ 442 { \ 443 int x, y; \ 444 const int8_t *filter; \ 445 pixel *src = (pixel*)_src; \ 446 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 447 pixel *dst = (pixel *)_dst; \ 448 ptrdiff_t dststride = _dststride / sizeof(pixel); \ 449 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \ 450 int16_t *tmp = tmp_array; \ 451 double ftmp[20]; \ 452 uint64_t rtmp[1]; \ 453 union av_intfloat64 shift; \ 454 union av_intfloat64 offset; \ 455 DECLARE_VAR_ALL64; \ 456 DECLARE_VAR_LOW32; \ 457 shift.i = 7; \ 458 offset.i = 64; \ 459 \ 460 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \ 461 filter = ff_hevc_qpel_filters[mx - 1]; \ 462 x = width >> 2; \ 463 y = height + QPEL_EXTRA; \ 464 __asm__ volatile( \ 465 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 466 "li %[rtmp0], 0x08 \n\t" \ 467 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 468 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 469 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 470 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 471 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 472 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 473 \ 474 "1: \n\t" \ 475 "2: \n\t" \ 476 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 477 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 478 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 479 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 480 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 481 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 482 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 483 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 484 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \ 485 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 486 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 487 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 488 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 489 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \ 490 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 491 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 492 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 493 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 494 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \ 495 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 496 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 497 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 498 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 499 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \ 500 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 501 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 502 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 503 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 504 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 505 MMI_USDC1(%[ftmp3], %[tmp], 0x00) \ 506 \ 507 "daddi %[x], %[x], -0x01 \n\t" \ 508 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 509 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 510 "bnez %[x], 2b \n\t" \ 511 \ 512 "daddi %[y], %[y], -0x01 \n\t" \ 513 "li %[x], " #x_step " \n\t" \ 514 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 515 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \ 516 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 517 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 518 "bnez %[y], 1b \n\t" \ 519 : RESTRICT_ASM_ALL64 \ 520 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 521 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 522 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 523 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 524 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 525 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \ 526 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \ 527 [x]"+&r"(x) \ 528 : [filter]"r"(filter), [stride]"r"(srcstride) \ 529 : "memory" \ 530 ); \ 531 \ 532 tmp = tmp_array; \ 533 filter = ff_hevc_qpel_filters[my - 1]; \ 534 x = width >> 2; \ 535 y = height; \ 536 __asm__ volatile( \ 537 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 538 "li %[rtmp0], 0x08 \n\t" \ 539 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 540 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 541 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 542 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 543 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 544 "li %[rtmp0], 0x06 \n\t" \ 545 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 546 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 547 \ 548 "1: \n\t" \ 549 "li %[x], " #x_step " \n\t" \ 550 "2: \n\t" \ 551 MMI_ULDC1(%[ftmp3], %[tmp], 0x00) \ 552 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 553 MMI_ULDC1(%[ftmp4], %[tmp], 0x00) \ 554 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 555 MMI_ULDC1(%[ftmp5], %[tmp], 0x00) \ 556 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 557 MMI_ULDC1(%[ftmp6], %[tmp], 0x00) \ 558 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 559 MMI_ULDC1(%[ftmp7], %[tmp], 0x00) \ 560 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 561 MMI_ULDC1(%[ftmp8], %[tmp], 0x00) \ 562 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 563 MMI_ULDC1(%[ftmp9], %[tmp], 0x00) \ 564 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 565 MMI_ULDC1(%[ftmp10], %[tmp], 0x00) \ 566 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \ 567 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 568 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 569 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \ 570 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 571 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \ 572 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \ 573 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \ 574 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \ 575 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 576 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 577 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \ 578 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 579 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 580 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \ 581 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \ 582 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \ 583 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \ 584 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 585 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 586 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \ 587 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 588 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 589 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 590 MMI_ULDC1(%[ftmp4], %[src2], 0x00) \ 591 "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ 592 "li %[rtmp0], 0x10 \n\t" \ 593 "dmtc1 %[rtmp0], %[ftmp8] \n\t" \ 594 "punpcklhw %[ftmp5], %[ftmp7], %[ftmp3] \n\t" \ 595 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp3] \n\t" \ 596 "punpckhhw %[ftmp3], %[ftmp7], %[ftmp4] \n\t" \ 597 "punpcklhw %[ftmp4], %[ftmp7], %[ftmp4] \n\t" \ 598 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \ 599 "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 600 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \ 601 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \ 602 "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 603 "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 604 "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \ 605 "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \ 606 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \ 607 "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \ 608 "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 609 "pcmpgth %[ftmp7], %[ftmp5], %[ftmp7] \n\t" \ 610 "pand %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \ 611 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \ 612 MMI_USWC1(%[ftmp3], %[dst], 0x00) \ 613 \ 614 "daddi %[x], %[x], -0x01 \n\t" \ 615 PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \ 616 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 617 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \ 618 "bnez %[x], 2b \n\t" \ 619 \ 620 "daddi %[y], %[y], -0x01 \n\t" \ 621 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \ 622 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \ 623 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 624 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \ 625 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 626 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 627 "bnez %[y], 1b \n\t" \ 628 : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32 \ 629 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 630 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 631 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 632 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 633 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 634 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \ 635 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \ 636 [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2), \ 637 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \ 638 [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0]) \ 639 : [filter]"r"(filter), [stride]"r"(dststride), \ 640 [shift]"f"(shift.f) \ 641 : "memory" \ 642 ); \ 643 } 644 645 PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4); 646 PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8); 647 PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12); 648 PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16); 649 PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24); 650 PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32); 651 PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48); 652 PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64); 653 654 #define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \ 655 void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst, \ 656 ptrdiff_t _dststride, \ 657 uint8_t *_src, \ 658 ptrdiff_t _srcstride, \ 659 int16_t *src2, int height, \ 660 intptr_t mx, intptr_t my, \ 661 int width) \ 662 { \ 663 int x, y; \ 664 pixel *src = (pixel *)_src; \ 665 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 666 pixel *dst = (pixel *)_dst; \ 667 ptrdiff_t dststride = _dststride / sizeof(pixel); \ 668 const int8_t *filter = ff_hevc_epel_filters[mx - 1]; \ 669 int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; \ 670 int16_t *tmp = tmp_array; \ 671 double ftmp[12]; \ 672 uint64_t rtmp[1]; \ 673 union av_intfloat64 shift; \ 674 union av_intfloat64 offset; \ 675 DECLARE_VAR_ALL64; \ 676 DECLARE_VAR_LOW32; \ 677 shift.i = 7; \ 678 offset.i = 64; \ 679 \ 680 src -= (EPEL_EXTRA_BEFORE * srcstride + 1); \ 681 x = width >> 2; \ 682 y = height + EPEL_EXTRA; \ 683 __asm__ volatile( \ 684 MMI_LWC1(%[ftmp1], %[filter], 0x00) \ 685 "li %[rtmp0], 0x08 \n\t" \ 686 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 687 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 688 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 689 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 690 \ 691 "1: \n\t" \ 692 "2: \n\t" \ 693 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 694 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 695 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 696 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 697 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 698 "pmullh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ 699 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 700 "pmullh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ 701 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 702 "pmullh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ 703 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 704 "pmullh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ 705 TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], \ 706 %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9]) \ 707 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 708 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 709 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ 710 MMI_ULDC1(%[ftmp2], %[tmp], 0x00) \ 711 \ 712 "daddi %[x], %[x], -0x01 \n\t" \ 713 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 714 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 715 "bnez %[x], 2b \n\t" \ 716 \ 717 "daddi %[y], %[y], -0x01 \n\t" \ 718 "li %[x], " #x_step " \n\t" \ 719 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 720 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \ 721 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 722 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 723 "bnez %[y], 1b \n\t" \ 724 : RESTRICT_ASM_ALL64 \ 725 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 726 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 727 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 728 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 729 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 730 [rtmp0]"=&r"(rtmp[0]), \ 731 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \ 732 [x]"+&r"(x) \ 733 : [filter]"r"(filter), [stride]"r"(srcstride) \ 734 : "memory" \ 735 ); \ 736 \ 737 tmp = tmp_array; \ 738 filter = ff_hevc_epel_filters[my - 1]; \ 739 x = width >> 2; \ 740 y = height; \ 741 __asm__ volatile( \ 742 MMI_LWC1(%[ftmp1], %[filter], 0x00) \ 743 "li %[rtmp0], 0x08 \n\t" \ 744 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 745 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 746 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 747 "li %[rtmp0], 0x06 \n\t" \ 748 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 749 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 750 "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" \ 751 \ 752 "1: \n\t" \ 753 "li %[x], " #x_step " \n\t" \ 754 "2: \n\t" \ 755 MMI_ULDC1(%[ftmp3], %[tmp], 0x00) \ 756 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 757 MMI_ULDC1(%[ftmp4], %[tmp], 0x00) \ 758 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 759 MMI_ULDC1(%[ftmp5], %[tmp], 0x00) \ 760 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 761 MMI_ULDC1(%[ftmp6], %[tmp], 0x00) \ 762 PTR_ADDIU "%[tmp], %[tmp], -0x180 \n\t" \ 763 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 764 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 765 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" \ 766 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" \ 767 TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4]) \ 768 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 769 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 770 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp1] \n\t" \ 771 "pmaddhw %[ftmp8], %[ftmp6], %[ftmp1] \n\t" \ 772 TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6]) \ 773 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 774 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 775 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 776 MMI_ULDC1(%[ftmp4], %[tmp], 0x02) \ 777 "li %[rtmp0], 0x10 \n\t" \ 778 "dmtc1 %[rtmp0], %[ftmp8] \n\t" \ 779 "punpcklhw %[ftmp5], %[ftmp2], %[ftmp3] \n\t" \ 780 "punpckhhw %[ftmp6], %[ftmp2], %[ftmp3] \n\t" \ 781 "punpckhhw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 782 "punpcklhw %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ 783 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \ 784 "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \ 785 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \ 786 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \ 787 "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \ 788 "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \ 789 "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \ 790 "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \ 791 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \ 792 "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \ 793 "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 794 "pcmpgth %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \ 795 "pand %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \ 796 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \ 797 MMI_USWC1(%[ftmp3], %[dst], 0x0) \ 798 \ 799 "daddi %[x], %[x], -0x01 \n\t" \ 800 PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \ 801 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 802 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \ 803 "bnez %[x], 2b \n\t" \ 804 \ 805 "daddi %[y], %[y], -0x01 \n\t" \ 806 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \ 807 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \ 808 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 809 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \ 810 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 811 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 812 "bnez %[y], 1b \n\t" \ 813 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ALL64 \ 814 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 815 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 816 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 817 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 818 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 819 [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2), \ 820 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \ 821 [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0]) \ 822 : [filter]"r"(filter), [stride]"r"(dststride), \ 823 [shift]"f"(shift.f) \ 824 : "memory" \ 825 ); \ 826 } 827 828 PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4); 829 PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8); 830 PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12); 831 PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16); 832 PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24); 833 PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32); 834 835 #define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step) \ 836 void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst, \ 837 ptrdiff_t _dststride, \ 838 uint8_t *_src, \ 839 ptrdiff_t _srcstride, \ 840 int16_t *src2, int height, \ 841 intptr_t mx, intptr_t my, \ 842 int width) \ 843 { \ 844 int x, y; \ 845 pixel *src = (pixel *)_src; \ 846 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 847 pixel *dst = (pixel *)_dst; \ 848 ptrdiff_t dststride = _dststride / sizeof(pixel); \ 849 double ftmp[12]; \ 850 uint64_t rtmp[1]; \ 851 union av_intfloat64 shift; \ 852 DECLARE_VAR_ALL64; \ 853 shift.i = 7; \ 854 \ 855 y = height; \ 856 x = width >> 3; \ 857 __asm__ volatile( \ 858 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 859 "li %[rtmp0], 0x06 \n\t" \ 860 "dmtc1 %[rtmp0], %[ftmp1] \n\t" \ 861 "li %[rtmp0], 0x10 \n\t" \ 862 "dmtc1 %[rtmp0], %[ftmp10] \n\t" \ 863 "li %[rtmp0], 0x40 \n\t" \ 864 "dmtc1 %[rtmp0], %[offset] \n\t" \ 865 "punpcklhw %[offset], %[offset], %[offset] \n\t" \ 866 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 867 \ 868 "1: \n\t" \ 869 "2: \n\t" \ 870 MMI_ULDC1(%[ftmp5], %[src], 0x00) \ 871 MMI_ULDC1(%[ftmp2], %[src2], 0x00) \ 872 MMI_ULDC1(%[ftmp3], %[src2], 0x08) \ 873 "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" \ 874 "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 875 "psllh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ 876 "psllh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ 877 "paddh %[ftmp4], %[ftmp4], %[offset] \n\t" \ 878 "paddh %[ftmp5], %[ftmp5], %[offset] \n\t" \ 879 "punpcklhw %[ftmp6], %[ftmp4], %[ftmp0] \n\t" \ 880 "punpckhhw %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 881 "punpcklhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 882 "punpckhhw %[ftmp9], %[ftmp5], %[ftmp0] \n\t" \ 883 "punpcklhw %[ftmp4], %[ftmp0], %[ftmp3] \n\t" \ 884 "punpckhhw %[ftmp5], %[ftmp0], %[ftmp3] \n\t" \ 885 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" \ 886 "punpcklhw %[ftmp2], %[ftmp0], %[ftmp2] \n\t" \ 887 "psraw %[ftmp2], %[ftmp2], %[ftmp10] \n\t" \ 888 "psraw %[ftmp3], %[ftmp3], %[ftmp10] \n\t" \ 889 "psraw %[ftmp4], %[ftmp4], %[ftmp10] \n\t" \ 890 "psraw %[ftmp5], %[ftmp5], %[ftmp10] \n\t" \ 891 "paddw %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ 892 "paddw %[ftmp3], %[ftmp3], %[ftmp7] \n\t" \ 893 "paddw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \ 894 "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 895 "psraw %[ftmp2], %[ftmp2], %[shift] \n\t" \ 896 "psraw %[ftmp3], %[ftmp3], %[shift] \n\t" \ 897 "psraw %[ftmp4], %[ftmp4], %[shift] \n\t" \ 898 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \ 899 "packsswh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 900 "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 901 "pcmpgth %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \ 902 "pcmpgth %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ 903 "pand %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 904 "pand %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 905 "packushb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ 906 MMI_USDC1(%[ftmp2], %[dst], 0x0) \ 907 \ 908 "daddi %[x], %[x], -0x01 \n\t" \ 909 PTR_ADDIU "%[src], %[src], 0x08 \n\t" \ 910 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \ 911 PTR_ADDIU "%[src2], %[src2], 0x10 \n\t" \ 912 "bnez %[x], 2b \n\t" \ 913 \ 914 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 915 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 916 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \ 917 "li %[x], " #x_step " \n\t" \ 918 "daddi %[y], %[y], -0x01 \n\t" \ 919 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" \ 920 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" \ 921 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \ 922 "bnez %[y], 1b \n\t" \ 923 : RESTRICT_ASM_ALL64 \ 924 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 925 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 926 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 927 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 928 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 929 [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]), \ 930 [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src), \ 931 [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0]) \ 932 : [dststride]"r"(dststride), [shift]"f"(shift.f), \ 933 [srcstride]"r"(srcstride) \ 934 : "memory" \ 935 ); \ 936 } \ 937 938 PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16); 939 PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32); 940 PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48); 941 PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64); 942 PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96); 943 PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128); 944 945 #define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step) \ 946 void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst, \ 947 ptrdiff_t _dststride, \ 948 uint8_t *_src, \ 949 ptrdiff_t _srcstride, \ 950 int height, \ 951 intptr_t mx, intptr_t my, \ 952 int width) \ 953 { \ 954 int x, y; \ 955 const int8_t *filter; \ 956 pixel *src = (pixel*)_src; \ 957 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \ 958 pixel *dst = (pixel *)_dst; \ 959 ptrdiff_t dststride = _dststride / sizeof(pixel); \ 960 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \ 961 int16_t *tmp = tmp_array; \ 962 double ftmp[20]; \ 963 uint64_t rtmp[1]; \ 964 union av_intfloat64 shift; \ 965 union av_intfloat64 offset; \ 966 DECLARE_VAR_ALL64; \ 967 DECLARE_VAR_LOW32; \ 968 shift.i = 6; \ 969 offset.i = 32; \ 970 \ 971 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \ 972 filter = ff_hevc_qpel_filters[mx - 1]; \ 973 x = width >> 2; \ 974 y = height + QPEL_EXTRA; \ 975 __asm__ volatile( \ 976 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 977 "li %[rtmp0], 0x08 \n\t" \ 978 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 979 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 980 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 981 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 982 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 983 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 984 \ 985 "1: \n\t" \ 986 "2: \n\t" \ 987 MMI_ULDC1(%[ftmp3], %[src], 0x00) \ 988 MMI_ULDC1(%[ftmp4], %[src], 0x01) \ 989 MMI_ULDC1(%[ftmp5], %[src], 0x02) \ 990 MMI_ULDC1(%[ftmp6], %[src], 0x03) \ 991 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 992 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 993 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 994 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 995 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \ 996 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 997 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 998 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 999 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 1000 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \ 1001 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 1002 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 1003 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 1004 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 1005 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \ 1006 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 1007 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 1008 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \ 1009 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \ 1010 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \ 1011 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 1012 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \ 1013 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 1014 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 1015 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 1016 MMI_USDC1(%[ftmp3], %[tmp], 0x0) \ 1017 \ 1018 "daddi %[x], %[x], -0x01 \n\t" \ 1019 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \ 1020 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 1021 "bnez %[x], 2b \n\t" \ 1022 \ 1023 "daddi %[y], %[y], -0x01 \n\t" \ 1024 "li %[x], " #x_step " \n\t" \ 1025 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 1026 PTR_ADDIU "%[tmp], %[tmp], " #tmp_step " \n\t" \ 1027 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 1028 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1029 "bnez %[y], 1b \n\t" \ 1030 : RESTRICT_ASM_ALL64 \ 1031 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 1032 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 1033 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 1034 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 1035 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 1036 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \ 1037 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \ 1038 [x]"+&r"(x) \ 1039 : [filter]"r"(filter), [stride]"r"(srcstride) \ 1040 : "memory" \ 1041 ); \ 1042 \ 1043 tmp = tmp_array; \ 1044 filter = ff_hevc_qpel_filters[my - 1]; \ 1045 x = width >> 2; \ 1046 y = height; \ 1047 __asm__ volatile( \ 1048 MMI_LDC1(%[ftmp1], %[filter], 0x00) \ 1049 "li %[rtmp0], 0x08 \n\t" \ 1050 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 1051 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \ 1052 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \ 1053 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 1054 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 1055 "li %[rtmp0], 0x06 \n\t" \ 1056 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \ 1057 "punpcklhw %[offset], %[offset], %[offset] \n\t" \ 1058 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 1059 \ 1060 "1: \n\t" \ 1061 "li %[x], " #x_step " \n\t" \ 1062 "2: \n\t" \ 1063 MMI_ULDC1(%[ftmp3], %[tmp], 0x00) \ 1064 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1065 MMI_ULDC1(%[ftmp4], %[tmp], 0x00) \ 1066 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1067 MMI_ULDC1(%[ftmp5], %[tmp], 0x00) \ 1068 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1069 MMI_ULDC1(%[ftmp6], %[tmp], 0x00) \ 1070 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1071 MMI_ULDC1(%[ftmp7], %[tmp], 0x00) \ 1072 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1073 MMI_ULDC1(%[ftmp8], %[tmp], 0x00) \ 1074 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1075 MMI_ULDC1(%[ftmp9], %[tmp], 0x00) \ 1076 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1077 MMI_ULDC1(%[ftmp10], %[tmp], 0x00) \ 1078 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \ 1079 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \ 1080 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 1081 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \ 1082 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \ 1083 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \ 1084 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \ 1085 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \ 1086 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \ 1087 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 1088 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 1089 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \ 1090 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 1091 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 1092 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \ 1093 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \ 1094 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \ 1095 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \ 1096 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \ 1097 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \ 1098 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \ 1099 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 1100 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 1101 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 1102 "paddh %[ftmp3], %[ftmp3], %[offset] \n\t" \ 1103 "psrah %[ftmp3], %[ftmp3], %[shift] \n\t" \ 1104 "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ 1105 "pcmpgth %[ftmp7], %[ftmp3], %[ftmp7] \n\t" \ 1106 "pand %[ftmp3], %[ftmp3], %[ftmp7] \n\t" \ 1107 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \ 1108 MMI_USWC1(%[ftmp3], %[dst], 0x00) \ 1109 \ 1110 "daddi %[x], %[x], -0x01 \n\t" \ 1111 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \ 1112 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \ 1113 "bnez %[x], 2b \n\t" \ 1114 \ 1115 "daddi %[y], %[y], -0x01 \n\t" \ 1116 PTR_ADDIU "%[tmp], %[tmp], " #tmp_step " \n\t" \ 1117 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 1118 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1119 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \ 1120 "bnez %[y], 1b \n\t" \ 1121 : RESTRICT_ASM_ALL64 RESTRICT_ASM_LOW32 \ 1122 [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 1123 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 1124 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 1125 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 1126 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 1127 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \ 1128 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \ 1129 [ftmp14]"=&f"(ftmp[14]), \ 1130 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \ 1131 [offset]"+&f"(offset.f), [rtmp0]"=&r"(rtmp[0]) \ 1132 : [filter]"r"(filter), [stride]"r"(dststride), \ 1133 [shift]"f"(shift.f) \ 1134 : "memory" \ 1135 ); \ 1136 } 1137 1138 PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8); 1139 PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16); 1140 PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24); 1141 PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32); 1142 PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48); 1143 PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64); 1144 PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96); 1145 PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128); 1146