/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "me_cmp_mips.h"

/* SAD for 8-pixel-wide blocks: two 8-byte rows are packed per 16-byte
 * vector, four rows per loop iteration. */
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
                               uint8_t *ref, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD for 16-pixel-wide blocks, four rows per loop iteration. */
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *ref, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);

        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the horizontal half-pel reference (average of ref and ref + 1)
 * for 8-pixel-wide blocks. */
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
                   ref0, ref1, ref2, ref3);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the horizontal half-pel reference for 16-pixel-wide blocks. */
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the vertical half-pel reference (average of ref rows n and
 * n + 1) for 8-pixel-wide blocks. */
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *ref,
                                                    int32_t ref_stride,
                                                    int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the vertical half-pel reference for 16-pixel-wide blocks. */
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (5 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        ref4 = ref3;

        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (3 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the rounded average of the four reference pixels surrounding
 * each half-pel position (horizontal + vertical), 8-pixel-wide blocks. */
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *ref,
                                                  int32_t ref_stride,
                                                  int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);

        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp0 += comp1;
        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
        comp2 = __msa_hadd_u_h(temp0, temp0);
        comp1 += comp2;
        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
        sad += __msa_hadd_u_h(diff, diff);

        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
        comp3 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp3;
        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp3 += comp0;
        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

/* SAD against the rounded four-pixel average for 16-pixel-wide blocks. */
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *ref,
                                                   int32_t ref_stride,
                                                   int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp, diff;
    v16u8 temp0, temp1, temp2, temp3;
    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
        ref += (5 * ref_stride);

        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
        ref += (3 * ref_stride);

        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

/* Accumulate the squared differences of one pair of 16-byte vectors into var. */
#define CALC_MSE_B(src, ref, var)                                    \
{                                                                    \
    v16u8 src_l0_m, src_l1_m;                                        \
    v8i16 res_l0_m, res_l1_m;                                        \
                                                                     \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
}

/* Sum of squared errors for 4-pixel-wide blocks, four rows per iteration. */
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LW4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        INSERT_W4_UB(src0, src1, src2, src3, src);
        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

/* Sum of squared errors for 8-pixel-wide blocks, four rows per iteration. */
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        CALC_MSE_B(src0, ref0, var);
        CALC_MSE_B(src1, ref1, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

/* Sum of squared errors for 16-pixel-wide blocks, four rows per iteration. */
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

/* Sum of absolute values of the 8x8 Hadamard transform of (src - ref). */
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *ref, int32_t ref_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v8i16 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, zero);
    sum += __msa_add_a_h((v8i16) diff1, zero);
    sum += __msa_add_a_h((v8i16) diff2, zero);
    sum += __msa_add_a_h((v8i16) diff3, zero);

    return (HADD_UH_U32(sum));
}

/* Sum of absolute values of the 8x8 Hadamard transform of the source block,
 * with the DC term removed. */
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
{
    int32_t sum_res = 0;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v16i8 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                       src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
               zero, src4, zero, src5, zero, src6, zero, src7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
    sum_res = (HADD_UH_U32(sum));
    sum_res -= abs(temp0[0] + temp4[0]);

    return sum_res;
}

int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    return sad_16width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    return sad_8width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    return sse_16width_msa(src, stride, ref, stride, height);
}

int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_8width_msa(src, stride, ref, stride, height);
}

int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_4width_msa(src, stride, ref, stride, height);
}

int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    return hadamard_diff_8x8_msa(src, stride, dst, stride);
}

int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    return hadamard_intra_8x8_msa(src, stride, dst, stride);
}

/* Hadamard Transform functions */
#define WRAPPER8_16_SQ(name8, name16)                        \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,    \
           ptrdiff_t stride, int h)                          \
{                                                            \
    int score = 0;                                           \
    score += name8(s, dst, src, stride, 8);                  \
    score += name8(s, dst + 8, src + 8, stride, 8);          \
    if (h == 16) {                                           \
        dst += 8 * stride;                                   \
        src += 8 * stride;                                   \
        score += name8(s, dst, src, stride, 8);              \
        score += name8(s, dst + 8, src + 8, stride, 8);      \
    }                                                        \
    return score;                                            \
}

WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);