/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
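
/*
 * Commentary: the entries above are byte-shuffle control vectors. The first
 * 16 bytes gather the overlapping pairs {s[0],s[1]}, {s[1],s[2]}, ... from a
 * single source register (8-column rows); in the two "4 width" vectors,
 * indices 16..31 select bytes from the second shuffle operand, so one
 * shuffle gathers pixel pairs from two different rows and two 4-column rows
 * are filtered per vector register.
 *
 * The HORIZ_*TAP macros below feed those shuffled pairs into DOTP/DPADD
 * (signed-byte dot products accumulating into signed halfwords). As a rough
 * scalar sketch, each 8-tap output computed here is
 *
 *     sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += src[x - 3 + k] * filter[k];
 *
 * with the rounding shift by 6, saturation and byte packing applied later
 * by the callers.
 */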

#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, filt0, filt1,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}
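
/*
 * Commentary: the copy_width* helpers below appear to serve the
 * no-interpolation (integer-pel) cases: they are straight block copies,
 * specialized per block width so the loads and stores line up with the
 * HEVC block heights (2, 4, 6, 8 and multiples thereof).
 */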

static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}
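
/*
 * Commentary: the common_hz_8t_* functions below are the horizontal 8-tap
 * filters for uni prediction. The recurring sequence is: XORI_B*_128 flips
 * the MSB of each pixel so unsigned bytes can enter signed dot products
 * ((uint8_t)x ^ 0x80, reinterpreted as int8_t, equals x - 128); the
 * HORIZ_8TAP_* macros do the shuffles and multiply-adds; SRARI_H* is the
 * rounding right shift by 6; SAT_SH* saturates to the signed 8-bit range;
 * and PCKEV_XORI128_UB packs halfwords back to bytes while undoing the
 * -128 bias.
 */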

static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
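
/*
 * Commentary: the 12-column variant splits each row into an 8-column part
 * (mask00 and its offsets, single-row shuffles) and a 4-column part (mask0
 * and its offsets, which filter two rows per vector register).
 */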

static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
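
/*
 * Commentary: in the 24- and 48-column cases the filter window crosses the
 * 16-byte load boundaries. Masks 4..7 (mask0 + 8 ... mask0 + 14) contain
 * indices above 15, which the byte shuffle resolves from its second source
 * vector, so the straddling pixels are gathered from two adjacent loads
 * without extra unaligned reloads.
 */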

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}
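
/*
 * Commentary: the common_vt_8t_* functions below are the vertical
 * counterparts. Seven rows are preloaded and each iteration of the row loop
 * brings in the next four (or eight) rows. Vertically adjacent rows are
 * interleaved byte-wise (ILVR/ILVL), so one pixel pair
 * {src[y][x], src[y+1][x]} meets one coefficient pair {f[k], f[k+1]} per
 * signed-byte dot product, and the 8-tap sum takes four DOTP/DPADD steps;
 * the row window then slides down for the next iteration.
 */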

static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
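
/*
 * Commentary: the 12-column vertical case filters a full 16-column block
 * (right and left interleaves) and then stores each row as an 8-byte plus a
 * 4-byte fragment via general-purpose registers.
 */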

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
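
/*
 * Commentary: the generic vertical filter below handles widths that are
 * multiples of 16 by processing the block in 16-column stripes, re-running
 * the sliding-row pipeline once per stripe. The 24/32/48/64-column entry
 * points that follow wrap it (the 24-column one adds a separate 8-column
 * run for the remainder).
 */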

static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}
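
/*
 * Commentary: the hevc_hv_uni_8t_* functions below handle the 2-D
 * (horizontal then vertical) 8-tap case. The horizontal stage produces
 * 16-bit intermediates that are kept unrounded; the vertical stage
 * accumulates them at 32 bits and shifts down by 6 twice (once per stage,
 * the second time with rounding) before packing, which is meant to match
 * the precision of the scalar two-stage reference.
 */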

static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}
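
/*
 * Commentary: the width-multiple-of-8 2-D filter below operates on 8-column
 * stripes, producing two output rows per iteration of its inner loop. The
 * seven horizontally filtered rows dst0..dst6 are carried between
 * iterations, so each new output row costs a single horizontal filter
 * evaluation before the vertical pass.
 */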

static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_r, dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_l, dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;
            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
            out = PCKEV_XORI128_UB(dst0, dst1);
            ST_D2(out, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst0 = dst2;
            dst1 = dst3;
            dst2 = dst4;
            dst3 = dst5;
            dst4 = dst6;
            dst5 = dst7;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}

static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
    v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
    v8i16 dst1413_r, dst87_l, filter_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l;

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src_tmp, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src_tmp += 2 * src_stride;

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4,
                   dst2, dst1, dst10_r,
                   dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                   dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r >>= 6;
        dst1_l >>= 6;
        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
        out0 = PCKEV_XORI128_UB(dst0, dst1);
        ST_D2(out0, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        dst0 = dst2;
        dst1 = dst3;
        dst2 = dst4;
        dst3 = dst5;
        dst4 = dst6;
        dst5 = dst7;
        dst6 = dst8;
    }

    src += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
                   vec11);
        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
                   vec14, vec15);

        dst117 =
                 HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}

static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}

static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}

static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}

static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}

static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
    v16u8 out;
    v8i16 filt, res0;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    res0 = __msa_srari_h(res0, 6);
    res0 = __msa_sat_s_h(res0, 7);
    out = PCKEV_XORI128_UB(res0, res0);
    ST_W2(out, 0, 1, dst, dst_stride);
}

static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

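/* 4-tap horizontal paths: taps are applied as signed byte dot products,
 * the 16-bit sums are rounded (SRARI by 6) and saturated, then packed to
 * bytes with the 128 bias restored by PCKEV_XORI128_UB. */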
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (4 == height) {
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out4, out5;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                               filt1, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out4 = PCKEV_XORI128_UB(out0, out1);
    out5 = PCKEV_XORI128_UB(out2, out3);
    ST_W2(out4, 0, 2, dst, dst_stride);
    ST_H2(out4, 2, 6, dst + 4, dst_stride);
    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0,
                               mask1, filt0,
                               filt1, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out4 = PCKEV_XORI128_UB(out0, out1);
    out5 = PCKEV_XORI128_UB(out2, out3);
    ST_W2(out4, 0, 2, dst, dst_stride);
    ST_H2(out4, 2, 6, dst + 4, dst_stride);
    ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}

static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, vec0, vec1, vec2, vec3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        XORI_B2_128_SB(src0, src1);
        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
        SRARI_H2_SH(vec0, vec1, 6);
        SAT_SH2_SH(vec0, vec1, 7);
        out = PCKEV_XORI128_UB(vec0, vec1);
        ST_D2(out, 0, 1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 == height) || (6 == height)) {
        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    } else {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}

static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask2 = LD_SB(&ff_hevc_mask_arr[32]);

    src -= 1;

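    /* 12 columns are split as 8 + 4: mask0/mask1 cover the left 8 columns
     * of one row, while mask2/mask3 gather the remaining 4 columns of two
     * rows into a single vector. */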
    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
        SRARI_H2_SH(out0, out1, 6);
        SAT_SH2_SH(out0, out1, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out2, out3, out4, out5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
                     out2, out3, out4, out5);
        SRARI_H4_SH(out2, out3, out4, out5, 6);
        SAT_SH4_SH(out2, out3, out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        tmp1 = PCKEV_XORI128_UB(out4, out5);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 out;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m,
                     vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out4, out5, out6, out7);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *dst1 = dst + 16;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 tmp0, tmp1;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask00 = mask0 + 8;
    mask11 = mask0 + 10;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        /* 8 width */
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
        dst1 += (4 *
                   dst_stride);
    }
}

static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out4, out5, out6, out7);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                     out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;
    v16u8 out;
    v8i16 filt, out10;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    out10 =
            __msa_srari_h(out10, 6);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST_W2(out, 0, 1, dst, dst_stride);
}

static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
    }
}

static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;

    src -= src_stride;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);

    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
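    /* Round (>> 6 with rounding), saturate and pack; 6-wide rows are
     * stored as a 4-byte word plus a 2-byte halfword. */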
    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);

    dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
    dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);

    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}

static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
    v16u8 out;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST_D2(out, 0, 1, dst, dst_stride);
}

static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);

        XORI_B3_128_SB(src3, src4, src5);
        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
        tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
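        /* Rows 0/1 are packed into tmp0, row 2 (duplicated) into tmp2,
         * then stored as three 64-bit copies. */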
        XORI_B2_128_SH(tmp0, tmp2);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
        dst += dst_stride;
        SD(out2, dst);
        dst += dst_stride;

        src2 = src5;
        vec0 = vec3;
        vec2 = vec4;
    }
}

static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
        out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}

static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (6 == height) {
        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                                 filter, height);
    }
}

static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16u8 out0, out1;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec;

    src -= (1 * src_stride);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
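        /* Columns 0..7 are filtered from the right (low-half) interleaves;
         * the extra columns 8..11 come from the left-half interleaves
         * packed pairwise into src2110/src4332/src6554. */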

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);

        SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_H2_SH(dst0_l, dst1_l, 6);
        SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SH2_SH(dst0_l, dst1_l, 7);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}

static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1,
                tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}

static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v16u8 out;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = 8; loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 8 width */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        SD(out0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        SD(out1, dst + 16);
        dst += dst_stride;

        /* 16 width */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);

        /* 8 width */
        out2_r =
                 HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
    }
}

static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt;
    v16i8 filt0, filt1;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 16 width */
        SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst + dst_stride);

        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        /* next 16 width */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);

        /* next 16 width */
        SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
        out = PCKEV_XORI128_UB(out2_r, out2_l);
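        /* store the second 16-column half at dst + 16 */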
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out3_r, out3_l);
        ST_UB(out, dst + 16 + dst_stride);

        dst += 2 * dst_stride;

        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}

static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_srari_h(tmp, 6);
    tmp = __msa_sat_s_h(tmp, 7);
    out = PCKEV_XORI128_UB(tmp, tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}

static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, tmp0, tmp1;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0,
                                filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2_r =
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
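
/* Width 4: dispatch on height (2, 4, or any multiple of 8). */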
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}

static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
    SRARI_H2_SH(tmp4, tmp5, 6);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_SH2_SH(tmp4, tmp5, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    out2 = PCKEV_XORI128_UB(tmp4, tmp5);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
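
/* 8x2: five input rows filtered horizontally in one pass, two output
 * rows from the vertical 4-tap stage. */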
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    SAT_SH2_SH(out0_r, out1_r, 7);
    out = PCKEV_XORI128_UB(out0_r, out1_r);
    ST_D2(out, 0, 1, dst, dst_stride);
}
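
/* (8 * width8mult) x 4: processes the block in 8-pixel-wide stripes;
 * each stripe needs seven input rows to produce four output rows. */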
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
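
/* 8x6, fully unrolled: nine input rows filtered horizontally, six
 * output rows from the vertical stage. */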
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH2_SH(out4_r, out5_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);

    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
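
/* General 8-pixel-stripe kernel: height a multiple of 4, total width
 * 8 * width8mult. A three-row horizontal-filter history (the dst10/dst21
 * interleaves) is kept alive across the row loop, so each iteration only
 * has to filter four new rows. */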
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height,
                                           int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}
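
/* Width 8: dispatch on height (2, 4, 6, or any multiple of 4). */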
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
    }
}
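
/* Width 12: an 8-wide column pass (mask0/mask1, four rows per iteration)
 * followed by a 4-wide pass over the remaining columns (mask2/mask3,
 * eight rows per iteration). Note the hard-coded loop counts (4 * 4 and
 * 2 * 8 rows): this path assumes a height of 16. */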
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;

    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
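
/* Widths 16, 24 and 32 reuse the 8-wide stripe kernels with
 * width8mult = 2, 3 and 4 respectively. */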
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
                                   filter_y, 2);
    } else {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 2);
    }
}

static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);
}

static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
}
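
/* Glue macros: expand to the ff_hevc_put_hevc_uni_* entry points that the
 * MIPS HEVC DSP init code plugs into the decoder's function tables.
 * UNI_MC_COPY maps plain pixel copies, UNI_MC maps one-dimensional
 * filtering (DIR1 = hz/vt, filter index taken from mx or my), and
 * UNI_MC_HV maps the two-dimensional kernels above. */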
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY

#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC

#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}

UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV