1/* 2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "libavutil/mips/generic_macros_msa.h" 22#include "libavcodec/mips/hevcdsp_mips.h" 23#include "libavcodec/mips/hevc_macros_msa.h" 24 25static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { 26 /* 8 width cases */ 27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 28 /* 4 width cases */ 29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 30}; 31 32static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride, 33 int16_t *dst, int32_t dst_stride, 34 int32_t height) 35{ 36 v16i8 zero = { 0 }; 37 38 if (2 == height) { 39 v16i8 src0, src1; 40 v8i16 in0; 41 42 LD_SB2(src, src_stride, src0, src1); 43 44 src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0); 45 in0 = (v8i16) __msa_ilvr_b(zero, src0); 46 in0 <<= 6; 47 ST_D2(in0, 0, 1, dst, dst_stride); 48 } else if (4 == height) { 49 v16i8 src0, src1, src2, src3; 50 v8i16 in0, in1; 51 52 LD_SB4(src, src_stride, src0, src1, src2, src3); 53 54 ILVR_W2_SB(src1, src0, src3, src2, src0, src1); 55 ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 56 in0 <<= 6; 57 in1 <<= 6; 58 ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride); 59 } else if (0 == height % 8) { 60 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 61 v8i16 in0, in1, in2, in3; 62 uint32_t loop_cnt; 63 64 for (loop_cnt = (height >> 3); loop_cnt--;) { 65 LD_SB8(src, src_stride, 66 src0, src1, src2, src3, src4, src5, src6, src7); 67 src += (8 * src_stride); 68 69 ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 70 src0, src1, src2, src3); 71 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 72 in0, in1, in2, in3); 73 SLLI_4V(in0, in1, in2, in3, 6); 74 ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 75 dst += (8 * dst_stride); 76 } 77 } 78} 79 80static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride, 81 int16_t *dst, int32_t dst_stride, 82 int32_t height) 83{ 84 uint32_t loop_cnt; 85 v16i8 zero = { 0 }; 86 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 87 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 88 89 for (loop_cnt = (height >> 3); loop_cnt--;) { 90 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 91 src += (8 * src_stride); 92 93 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 94 in0, in1, in2, in3); 95 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 96 in4, in5, in6, in7); 97 SLLI_4V(in0, in1, in2, in3, 6); 98 SLLI_4V(in4, in5, in6, in7, 6); 99 ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride); 100 dst += (8 * dst_stride); 101 } 102} 103 104static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride, 105 int16_t *dst, int32_t dst_stride, 106 int32_t height) 107{ 108 v16i8 zero = { 0 }; 109 110 if (2 == height) { 111 v16i8 src0, src1; 112 v8i16 in0, in1; 113 114 LD_SB2(src, src_stride, src0, src1); 115 116 ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 117 in0 <<= 6; 118 in1 <<= 6; 119 ST_SH2(in0, in1, dst, dst_stride); 120 } else if (4 == height) { 121 v16i8 src0, src1, src2, src3; 122 v8i16 in0, in1, in2, in3; 123 124 LD_SB4(src, src_stride, src0, src1, src2, src3); 125 126 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 127 in0, in1, in2, in3); 128 SLLI_4V(in0, in1, in2, in3, 6); 129 ST_SH4(in0, in1, in2, in3, dst, dst_stride); 130 } else if (6 == height) { 131 v16i8 src0, src1, src2, src3, src4, src5; 132 v8i16 in0, in1, in2, in3, in4, in5; 133 134 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5); 135 136 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 137 in0, in1, in2, in3); 138 ILVR_B2_SH(zero, src4, zero, src5, in4, in5); 139 SLLI_4V(in0, in1, in2, in3, 6); 140 in4 <<= 6; 141 in5 <<= 6; 142 ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride); 143 } else if (0 == height % 8) { 144 uint32_t loop_cnt; 145 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 146 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 147 148 for (loop_cnt = (height >> 3); loop_cnt--;) { 149 LD_SB8(src, src_stride, 150 src0, src1, src2, src3, src4, src5, src6, src7); 151 src += (8 * src_stride); 152 153 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 154 in0, in1, in2, in3); 155 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 156 in4, in5, in6, in7); 157 SLLI_4V(in0, in1, in2, in3, 6); 158 SLLI_4V(in4, in5, in6, in7, 6); 159 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride); 160 dst += (8 * dst_stride); 161 } 162 } 163} 164 165static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride, 166 int16_t *dst, int32_t dst_stride, 167 int32_t height) 168{ 169 uint32_t loop_cnt; 170 v16i8 zero = { 0 }; 171 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 172 v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r; 173 174 for (loop_cnt = (height >> 3); loop_cnt--;) { 175 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 176 src += (8 * src_stride); 177 178 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 179 in0_r, in1_r, in2_r, in3_r); 180 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 181 ILVL_W2_SB(src1, src0, src3, src2, src0, src1); 182 ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 183 in0 <<= 6; 184 in1 <<= 6; 185 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 186 ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride); 187 dst += (4 * dst_stride); 188 189 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 190 in0_r, in1_r, in2_r, in3_r); 191 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 192 ILVL_W2_SB(src5, src4, src7, src6, src0, src1); 193 ILVR_B2_SH(zero, src0, zero, src1, in0, in1); 194 in0 <<= 6; 195 in1 <<= 6; 196 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 197 ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride); 198 dst += (4 * dst_stride); 199 } 200} 201 202static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride, 203 int16_t *dst, int32_t dst_stride, 204 int32_t height) 205{ 206 v16i8 zero = { 0 }; 207 208 if (4 == height) { 209 v16i8 src0, src1, src2, src3; 210 v8i16 in0_r, in1_r, in2_r, in3_r; 211 v8i16 in0_l, in1_l, in2_l, in3_l; 212 213 LD_SB4(src, src_stride, src0, src1, src2, src3); 214 215 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 216 in0_r, in1_r, in2_r, in3_r); 217 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 218 in0_l, in1_l, in2_l, in3_l); 219 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 220 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 221 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 222 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 223 } else if (12 == height) { 224 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 225 v16i8 src8, src9, src10, src11; 226 v8i16 in0_r, in1_r, in2_r, in3_r; 227 v8i16 in0_l, in1_l, in2_l, in3_l; 228 229 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 230 src += (8 * src_stride); 231 LD_SB4(src, src_stride, src8, src9, src10, src11); 232 233 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 234 in0_r, in1_r, in2_r, in3_r); 235 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 236 in0_l, in1_l, in2_l, in3_l); 237 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 238 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 239 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 240 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 241 dst += (4 * dst_stride); 242 243 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 244 in0_r, in1_r, in2_r, in3_r); 245 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 246 in0_l, in1_l, in2_l, in3_l); 247 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 248 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 249 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 250 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 251 dst += (4 * dst_stride); 252 253 ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, 254 in0_r, in1_r, in2_r, in3_r); 255 ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11, 256 in0_l, in1_l, in2_l, in3_l); 257 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 258 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 259 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 260 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 261 } else if (0 == (height % 8)) { 262 uint32_t loop_cnt; 263 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 264 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 265 266 for (loop_cnt = (height >> 3); loop_cnt--;) { 267 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, 268 src7); 269 src += (8 * src_stride); 270 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, 271 in1_r, in2_r, in3_r); 272 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, 273 in1_l, in2_l, in3_l); 274 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 275 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 276 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 277 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 278 dst += (4 * dst_stride); 279 280 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, 281 in1_r, in2_r, in3_r); 282 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, 283 in1_l, in2_l, in3_l); 284 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 285 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 286 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 287 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 288 dst += (4 * dst_stride); 289 } 290 } 291} 292 293static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride, 294 int16_t *dst, int32_t dst_stride, 295 int32_t height) 296{ 297 uint32_t loop_cnt; 298 v16i8 zero = { 0 }; 299 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 300 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 301 302 for (loop_cnt = (height >> 2); loop_cnt--;) { 303 LD_SB4(src, src_stride, src0, src1, src2, src3); 304 LD_SB4((src + 16), src_stride, src4, src5, src6, src7); 305 src += (4 * src_stride); 306 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r, 307 in2_r, in3_r); 308 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l, 309 in2_l, in3_l); 310 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 311 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 312 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride); 313 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride); 314 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r, 315 in2_r, in3_r); 316 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 317 ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride); 318 dst += (4 * dst_stride); 319 } 320} 321 322static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride, 323 int16_t *dst, int32_t dst_stride, 324 int32_t height) 325{ 326 uint32_t loop_cnt; 327 v16i8 zero = { 0 }; 328 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 329 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 330 331 for (loop_cnt = (height >> 2); loop_cnt--;) { 332 LD_SB4(src, src_stride, src0, src2, src4, src6); 333 LD_SB4((src + 16), src_stride, src1, src3, src5, src7); 334 src += (4 * src_stride); 335 336 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r, 337 in2_r, in3_r); 338 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l, 339 in2_l, in3_l); 340 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 341 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 342 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 343 dst += dst_stride; 344 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8); 345 dst += dst_stride; 346 347 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r, 348 in2_r, in3_r); 349 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l, 350 in2_l, in3_l); 351 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 352 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 353 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 354 dst += dst_stride; 355 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8); 356 dst += dst_stride; 357 } 358} 359 360static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride, 361 int16_t *dst, int32_t dst_stride, 362 int32_t height) 363{ 364 uint32_t loop_cnt; 365 v16i8 zero = { 0 }; 366 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 367 v16i8 src8, src9, src10, src11; 368 v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r; 369 v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l; 370 371 for (loop_cnt = (height >> 2); loop_cnt--;) { 372 LD_SB3(src, 16, src0, src1, src2); 373 src += src_stride; 374 LD_SB3(src, 16, src3, src4, src5); 375 src += src_stride; 376 LD_SB3(src, 16, src6, src7, src8); 377 src += src_stride; 378 LD_SB3(src, 16, src9, src10, src11); 379 src += src_stride; 380 381 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 382 in0_r, in1_r, in2_r, in3_r); 383 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 384 in0_l, in1_l, in2_l, in3_l); 385 ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r); 386 ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l); 387 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 388 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 389 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6); 390 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8); 391 dst += dst_stride; 392 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8); 393 dst += dst_stride; 394 395 ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9, 396 in0_r, in1_r, in2_r, in3_r); 397 ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9, 398 in0_l, in1_l, in2_l, in3_l); 399 ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r); 400 ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l); 401 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 402 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 403 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6); 404 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8); 405 dst += dst_stride; 406 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8); 407 dst += dst_stride; 408 } 409} 410 411static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride, 412 int16_t *dst, int32_t dst_stride, 413 int32_t height) 414{ 415 uint32_t loop_cnt; 416 v16i8 zero = { 0 }; 417 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 418 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l; 419 420 for (loop_cnt = (height >> 1); loop_cnt--;) { 421 LD_SB4(src, 16, src0, src1, src2, src3); 422 src += src_stride; 423 LD_SB4(src, 16, src4, src5, src6, src7); 424 src += src_stride; 425 426 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 427 in0_r, in1_r, in2_r, in3_r); 428 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, 429 in0_l, in1_l, in2_l, in3_l); 430 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 431 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 432 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 433 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8); 434 dst += dst_stride; 435 436 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 437 in0_r, in1_r, in2_r, in3_r); 438 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, 439 in0_l, in1_l, in2_l, in3_l); 440 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6); 441 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6); 442 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8); 443 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8); 444 dst += dst_stride; 445 } 446} 447 448static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, 449 int16_t *dst, int32_t dst_stride, 450 const int8_t *filter, int32_t height) 451{ 452 uint32_t loop_cnt; 453 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 454 v8i16 filt0, filt1, filt2, filt3; 455 v16i8 mask1, mask2, mask3; 456 v16i8 vec0, vec1, vec2, vec3; 457 v8i16 dst0, dst1, dst2, dst3; 458 v8i16 filter_vec, const_vec; 459 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 460 461 src -= 3; 462 const_vec = __msa_ldi_h(128); 463 const_vec <<= 6; 464 465 filter_vec = LD_SH(filter); 466 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 467 468 mask1 = mask0 + 2; 469 mask2 = mask0 + 4; 470 mask3 = mask0 + 6; 471 472 for (loop_cnt = (height >> 3); loop_cnt--;) { 473 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 474 src += (8 * src_stride); 475 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 476 477 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, 478 vec0, vec1, vec2, vec3); 479 dst0 = const_vec; 480 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 481 dst0, dst0, dst0, dst0); 482 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3, 483 vec0, vec1, vec2, vec3); 484 dst1 = const_vec; 485 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 486 dst1, dst1, dst1, dst1); 487 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3, 488 vec0, vec1, vec2, vec3); 489 dst2 = const_vec; 490 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 491 dst2, dst2, dst2, dst2); 492 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3, 493 vec0, vec1, vec2, vec3); 494 dst3 = const_vec; 495 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 496 dst3, dst3, dst3, dst3); 497 498 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 499 dst += (8 * dst_stride); 500 } 501} 502 503static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, 504 int16_t *dst, int32_t dst_stride, 505 const int8_t *filter, int32_t height) 506{ 507 uint32_t loop_cnt; 508 v16i8 src0, src1, src2, src3; 509 v8i16 filt0, filt1, filt2, filt3; 510 v16i8 mask1, mask2, mask3; 511 v16i8 vec0, vec1, vec2, vec3; 512 v8i16 dst0, dst1, dst2, dst3; 513 v8i16 filter_vec, const_vec; 514 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 515 516 src -= 3; 517 const_vec = __msa_ldi_h(128); 518 const_vec <<= 6; 519 520 filter_vec = LD_SH(filter); 521 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 522 523 mask1 = mask0 + 2; 524 mask2 = mask0 + 4; 525 mask3 = mask0 + 6; 526 527 for (loop_cnt = (height >> 2); loop_cnt--;) { 528 LD_SB4(src, src_stride, src0, src1, src2, src3); 529 src += (4 * src_stride); 530 XORI_B4_128_SB(src0, src1, src2, src3); 531 532 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 533 vec0, vec1, vec2, vec3); 534 dst0 = const_vec; 535 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 536 dst0, dst0, dst0, dst0); 537 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 538 vec0, vec1, vec2, vec3); 539 dst1 = const_vec; 540 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 541 dst1, dst1, dst1, dst1); 542 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 543 vec0, vec1, vec2, vec3); 544 dst2 = const_vec; 545 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 546 dst2, dst2, dst2, dst2); 547 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 548 vec0, vec1, vec2, vec3); 549 dst3 = const_vec; 550 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 551 dst3, dst3, dst3, dst3); 552 553 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 554 dst += (4 * dst_stride); 555 } 556} 557 558static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, 559 int16_t *dst, int32_t dst_stride, 560 const int8_t *filter, int32_t height) 561{ 562 uint32_t loop_cnt; 563 int64_t res0, res1, res2, res3; 564 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 565 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 566 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 567 v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5; 568 v8i16 filter_vec, const_vec; 569 570 src -= 3; 571 const_vec = __msa_ldi_h(128); 572 const_vec <<= 6; 573 574 filter_vec = LD_SH(filter); 575 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 576 577 mask0 = LD_SB(ff_hevc_mask_arr); 578 mask1 = mask0 + 2; 579 mask2 = mask0 + 4; 580 mask3 = mask0 + 6; 581 mask4 = LD_SB(ff_hevc_mask_arr + 16); 582 mask5 = mask4 + 2; 583 mask6 = mask4 + 4; 584 mask7 = mask4 + 6; 585 586 for (loop_cnt = 4; loop_cnt--;) { 587 LD_SB4(src, src_stride, src0, src1, src2, src3); 588 LD_SB4(src + 8, src_stride, src4, src5, src6, src7); 589 src += (4 * src_stride); 590 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 591 592 dst0 = const_vec; 593 dst1 = const_vec; 594 dst2 = const_vec; 595 dst3 = const_vec; 596 dst4 = const_vec; 597 dst5 = const_vec; 598 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 599 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 600 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5); 601 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 602 dst1, dst2, dst3); 603 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5); 604 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 605 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 606 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5); 607 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 608 dst1, dst2, dst3); 609 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5); 610 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 611 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 612 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5); 613 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 614 dst1, dst2, dst3); 615 DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5); 616 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1); 617 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3); 618 VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5); 619 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 620 dst1, dst2, dst3); 621 DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5); 622 623 res0 = __msa_copy_s_d((v2i64) dst4, 0); 624 res1 = __msa_copy_s_d((v2i64) dst4, 1); 625 res2 = __msa_copy_s_d((v2i64) dst5, 0); 626 res3 = __msa_copy_s_d((v2i64) dst5, 1); 627 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 628 SD4(res0, res1, res2, res3, (dst + 8), dst_stride); 629 dst += (4 * dst_stride); 630 } 631} 632 633static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, 634 int16_t *dst, int32_t dst_stride, 635 const int8_t *filter, int32_t height) 636{ 637 uint32_t loop_cnt; 638 v16i8 src0, src1, src2, src3; 639 v8i16 filt0, filt1, filt2, filt3; 640 v16i8 mask1, mask2, mask3; 641 v16i8 vec0, vec1, vec2, vec3; 642 v8i16 dst0, dst1, dst2, dst3; 643 v8i16 filter_vec, const_vec; 644 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 645 646 src -= 3; 647 const_vec = __msa_ldi_h(128); 648 const_vec <<= 6; 649 650 filter_vec = LD_SH(filter); 651 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 652 653 mask1 = mask0 + 2; 654 mask2 = mask0 + 4; 655 mask3 = mask0 + 6; 656 657 for (loop_cnt = (height >> 1); loop_cnt--;) { 658 LD_SB2(src, src_stride, src0, src2); 659 LD_SB2(src + 8, src_stride, src1, src3); 660 src += (2 * src_stride); 661 XORI_B4_128_SB(src0, src1, src2, src3); 662 663 dst0 = const_vec; 664 dst1 = const_vec; 665 dst2 = const_vec; 666 dst3 = const_vec; 667 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1); 668 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3); 669 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 670 dst1, dst2, dst3); 671 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1); 672 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3); 673 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 674 dst1, dst2, dst3); 675 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1); 676 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3); 677 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 678 dst1, dst2, dst3); 679 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1); 680 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3); 681 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 682 dst1, dst2, dst3); 683 684 ST_SH2(dst0, dst2, dst, dst_stride); 685 ST_SH2(dst1, dst3, dst + 8, dst_stride); 686 dst += (2 * dst_stride); 687 } 688} 689 690static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, 691 int16_t *dst, int32_t dst_stride, 692 const int8_t *filter, int32_t height) 693{ 694 uint32_t loop_cnt; 695 v16i8 src0, src1, src2, src3; 696 v8i16 filt0, filt1, filt2, filt3; 697 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 698 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 699 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 700 v8i16 filter_vec, const_vec; 701 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 702 703 src -= 3; 704 filter_vec = LD_SH(filter); 705 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 706 707 mask1 = mask0 + 2; 708 mask2 = mask0 + 4; 709 mask3 = mask0 + 6; 710 mask4 = mask0 + 8; 711 mask5 = mask0 + 10; 712 mask6 = mask0 + 12; 713 mask7 = mask0 + 14; 714 715 const_vec = __msa_ldi_h(128); 716 const_vec <<= 6; 717 718 for (loop_cnt = (height >> 1); loop_cnt--;) { 719 LD_SB2(src, 16, src0, src1); 720 src += src_stride; 721 LD_SB2(src, 16, src2, src3); 722 src += src_stride; 723 XORI_B4_128_SB(src0, src1, src2, src3); 724 725 dst0 = const_vec; 726 dst1 = const_vec; 727 dst2 = const_vec; 728 dst3 = const_vec; 729 dst4 = const_vec; 730 dst5 = const_vec; 731 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 732 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 733 VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5); 734 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 735 dst1, dst2, dst3); 736 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5); 737 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 738 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 739 VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5); 740 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 741 dst1, dst2, dst3); 742 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5); 743 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1); 744 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3); 745 VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5); 746 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 747 dst1, dst2, dst3); 748 DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5); 749 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1); 750 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3); 751 VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5); 752 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 753 dst1, dst2, dst3); 754 DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5); 755 756 ST_SH2(dst0, dst1, dst, 8); 757 ST_SH(dst2, dst + 16); 758 dst += dst_stride; 759 ST_SH2(dst3, dst4, dst, 8); 760 ST_SH(dst5, dst + 16); 761 dst += dst_stride; 762 } 763} 764 765static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, 766 int16_t *dst, int32_t dst_stride, 767 const int8_t *filter, int32_t height) 768{ 769 uint32_t loop_cnt; 770 v16i8 src0, src1, src2; 771 v8i16 filt0, filt1, filt2, filt3; 772 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 773 v16i8 vec0, vec1, vec2, vec3; 774 v8i16 dst0, dst1, dst2, dst3; 775 v8i16 filter_vec, const_vec; 776 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 777 778 src -= 3; 779 filter_vec = LD_SH(filter); 780 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 781 782 mask1 = mask0 + 2; 783 mask2 = mask0 + 4; 784 mask3 = mask0 + 6; 785 mask4 = mask0 + 8; 786 mask5 = mask0 + 10; 787 mask6 = mask0 + 12; 788 mask7 = mask0 + 14; 789 790 const_vec = __msa_ldi_h(128); 791 const_vec <<= 6; 792 793 for (loop_cnt = height; loop_cnt--;) { 794 LD_SB2(src, 16, src0, src1); 795 src2 = LD_SB(src + 24); 796 src += src_stride; 797 XORI_B3_128_SB(src0, src1, src2); 798 799 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 800 vec0, vec1, vec2, vec3); 801 dst0 = const_vec; 802 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 803 dst0, dst0, dst0, dst0); 804 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 805 vec0, vec1, vec2, vec3); 806 dst1 = const_vec; 807 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 808 dst1, dst1, dst1, dst1); 809 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 810 vec0, vec1, vec2, vec3); 811 dst2 = const_vec; 812 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 813 dst2, dst2, dst2, dst2); 814 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 815 vec0, vec1, vec2, vec3); 816 dst3 = const_vec; 817 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 818 dst3, dst3, dst3, dst3); 819 820 ST_SH4(dst0, dst1, dst2, dst3, dst, 8); 821 dst += dst_stride; 822 } 823} 824 825static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, 826 int16_t *dst, int32_t dst_stride, 827 const int8_t *filter, int32_t height) 828{ 829 uint32_t loop_cnt; 830 v16i8 src0, src1, src2, src3; 831 v8i16 filt0, filt1, filt2, filt3; 832 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 833 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 834 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 835 v8i16 filter_vec, const_vec; 836 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 837 838 src -= 3; 839 filter_vec = LD_SH(filter); 840 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 841 842 mask1 = mask0 + 2; 843 mask2 = mask0 + 4; 844 mask3 = mask0 + 6; 845 mask4 = mask0 + 8; 846 mask5 = mask0 + 10; 847 mask6 = mask0 + 12; 848 mask7 = mask0 + 14; 849 850 const_vec = __msa_ldi_h(128); 851 const_vec <<= 6; 852 853 for (loop_cnt = height; loop_cnt--;) { 854 LD_SB3(src, 16, src0, src1, src2); 855 src3 = LD_SB(src + 40); 856 src += src_stride; 857 XORI_B4_128_SB(src0, src1, src2, src3); 858 859 dst0 = const_vec; 860 dst1 = const_vec; 861 dst2 = const_vec; 862 dst3 = const_vec; 863 dst4 = const_vec; 864 dst5 = const_vec; 865 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1); 866 VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3); 867 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 868 dst1, dst2, dst3); 869 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1); 870 VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3); 871 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 872 dst1, dst2, dst3); 873 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1); 874 VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3); 875 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0, 876 dst1, dst2, dst3); 877 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1); 878 VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3); 879 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0, 880 dst1, dst2, dst3); 881 ST_SH4(dst0, dst1, dst2, dst3, dst, 8); 882 883 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5); 884 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5); 885 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5); 886 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5); 887 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5); 888 DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5); 889 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5); 890 DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5); 891 ST_SH2(dst4, dst5, (dst + 32), 8); 892 dst += dst_stride; 893 } 894} 895 896static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, 897 int16_t *dst, int32_t dst_stride, 898 const int8_t *filter, int32_t height) 899{ 900 uint32_t loop_cnt; 901 v16i8 src0, src1, src2, src3, src4; 902 v8i16 filt0, filt1, filt2, filt3; 903 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7; 904 v16i8 vec0, vec1, vec2, vec3; 905 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 906 v8i16 filter_vec, const_vec; 907 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 908 909 src -= 3; 910 911 filter_vec = LD_SH(filter); 912 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 913 914 mask1 = mask0 + 2; 915 mask2 = mask0 + 4; 916 mask3 = mask0 + 6; 917 mask4 = mask0 + 8; 918 mask5 = mask0 + 10; 919 mask6 = mask0 + 12; 920 mask7 = mask0 + 14; 921 922 const_vec = __msa_ldi_h(128); 923 const_vec <<= 6; 924 925 for (loop_cnt = height; loop_cnt--;) { 926 LD_SB4(src, 16, src0, src1, src2, src3); 927 src4 = LD_SB(src + 56); 928 src += src_stride; 929 XORI_B5_128_SB(src0, src1, src2, src3, src4); 930 931 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 932 vec0, vec1, vec2, vec3); 933 dst0 = const_vec; 934 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 935 dst0, dst0, dst0, dst0); 936 ST_SH(dst0, dst); 937 938 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, 939 vec0, vec1, vec2, vec3); 940 dst1 = const_vec; 941 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 942 dst1, dst1, dst1, dst1); 943 ST_SH(dst1, dst + 8); 944 945 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 946 vec0, vec1, vec2, vec3); 947 dst2 = const_vec; 948 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 949 dst2, dst2, dst2, dst2); 950 ST_SH(dst2, dst + 16); 951 952 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7, 953 vec0, vec1, vec2, vec3); 954 dst3 = const_vec; 955 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 956 dst3, dst3, dst3, dst3); 957 ST_SH(dst3, dst + 24); 958 959 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 960 vec0, vec1, vec2, vec3); 961 dst4 = const_vec; 962 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 963 dst4, dst4, dst4, dst4); 964 ST_SH(dst4, dst + 32); 965 966 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, 967 vec0, vec1, vec2, vec3); 968 dst5 = const_vec; 969 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 970 dst5, dst5, dst5, dst5); 971 ST_SH(dst5, dst + 40); 972 973 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 974 vec0, vec1, vec2, vec3); 975 dst6 = const_vec; 976 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 977 dst6, dst6, dst6, dst6); 978 ST_SH(dst6, dst + 48); 979 980 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 981 vec0, vec1, vec2, vec3); 982 dst7 = const_vec; 983 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 984 dst7, dst7, dst7, dst7); 985 ST_SH(dst7, dst + 56); 986 dst += dst_stride; 987 } 988} 989 990static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, 991 int16_t *dst, int32_t dst_stride, 992 const int8_t *filter, int32_t height) 993{ 994 int32_t loop_cnt; 995 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 996 v16i8 src9, src10, src11, src12, src13, src14; 997 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 998 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 999 v16i8 src1110_r, src1211_r, src1312_r, src1413_r; 1000 v16i8 src2110, src4332, src6554, src8776, src10998; 1001 v16i8 src12111110, src14131312; 1002 v8i16 dst10, dst32, dst54, dst76; 1003 v8i16 filt0, filt1, filt2, filt3; 1004 v8i16 filter_vec, const_vec; 1005 1006 src -= (3 * src_stride); 1007 1008 const_vec = __msa_ldi_h(128); 1009 const_vec <<= 6; 1010 1011 filter_vec = LD_SH(filter); 1012 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1013 1014 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1015 src += (7 * src_stride); 1016 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1017 src10_r, src32_r, src54_r, src21_r); 1018 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1019 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 1020 src2110, src4332, src6554); 1021 XORI_B3_128_SB(src2110, src4332, src6554); 1022 1023 for (loop_cnt = (height >> 3); loop_cnt--;) { 1024 LD_SB8(src, src_stride, 1025 src7, src8, src9, src10, src11, src12, src13, src14); 1026 src += (8 * src_stride); 1027 1028 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1029 src76_r, src87_r, src98_r, src109_r); 1030 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, 1031 src1110_r, src1211_r, src1312_r, src1413_r); 1032 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, 1033 src1211_r, src1110_r, src1413_r, src1312_r, 1034 src8776, src10998, src12111110, src14131312); 1035 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312); 1036 1037 dst10 = const_vec; 1038 DPADD_SB4_SH(src2110, src4332, src6554, src8776, 1039 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10); 1040 dst32 = const_vec; 1041 DPADD_SB4_SH(src4332, src6554, src8776, src10998, 1042 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32); 1043 dst54 = const_vec; 1044 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, 1045 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54); 1046 dst76 = const_vec; 1047 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, 1048 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76); 1049 1050 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 1051 dst += (8 * dst_stride); 1052 1053 src2110 = src10998; 1054 src4332 = src12111110; 1055 src6554 = src14131312; 1056 src6 = src14; 1057 } 1058} 1059 1060static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, 1061 int16_t *dst, int32_t dst_stride, 1062 const int8_t *filter, int32_t height) 1063{ 1064 int32_t loop_cnt; 1065 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1066 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1067 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1068 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1069 v8i16 filter_vec, const_vec; 1070 v8i16 filt0, filt1, filt2, filt3; 1071 1072 src -= (3 * src_stride); 1073 const_vec = __msa_ldi_h(128); 1074 const_vec <<= 6; 1075 1076 filter_vec = LD_SH(filter); 1077 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1078 1079 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1080 src += (7 * src_stride); 1081 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1082 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1083 src10_r, src32_r, src54_r, src21_r); 1084 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1085 1086 for (loop_cnt = (height >> 2); loop_cnt--;) { 1087 LD_SB4(src, src_stride, src7, src8, src9, src10); 1088 src += (4 * src_stride); 1089 XORI_B4_128_SB(src7, src8, src9, src10); 1090 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1091 src76_r, src87_r, src98_r, src109_r); 1092 1093 dst0_r = const_vec; 1094 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1095 filt0, filt1, filt2, filt3, 1096 dst0_r, dst0_r, dst0_r, dst0_r); 1097 dst1_r = const_vec; 1098 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1099 filt0, filt1, filt2, filt3, 1100 dst1_r, dst1_r, dst1_r, dst1_r); 1101 dst2_r = const_vec; 1102 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1103 filt0, filt1, filt2, filt3, 1104 dst2_r, dst2_r, dst2_r, dst2_r); 1105 dst3_r = const_vec; 1106 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1107 filt0, filt1, filt2, filt3, 1108 dst3_r, dst3_r, dst3_r, dst3_r); 1109 1110 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 1111 dst += (4 * dst_stride); 1112 1113 src10_r = src54_r; 1114 src32_r = src76_r; 1115 src54_r = src98_r; 1116 src21_r = src65_r; 1117 src43_r = src87_r; 1118 src65_r = src109_r; 1119 src6 = src10; 1120 } 1121} 1122 1123static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, 1124 int16_t *dst, int32_t dst_stride, 1125 const int8_t *filter, int32_t height) 1126{ 1127 int32_t loop_cnt; 1128 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1129 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1130 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1131 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1132 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1133 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1134 v16i8 src2110, src4332, src6554, src8776, src10998; 1135 v8i16 dst0_l, dst1_l; 1136 v8i16 filter_vec, const_vec; 1137 v8i16 filt0, filt1, filt2, filt3; 1138 1139 src -= (3 * src_stride); 1140 const_vec = __msa_ldi_h(128); 1141 const_vec <<= 6; 1142 1143 filter_vec = LD_SH(filter); 1144 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1145 1146 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1147 src += (7 * src_stride); 1148 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1149 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1150 src10_r, src32_r, src54_r, src21_r); 1151 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1152 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1153 src10_l, src32_l, src54_l, src21_l); 1154 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1155 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l, 1156 src2110, src4332, src6554); 1157 1158 for (loop_cnt = (height >> 2); loop_cnt--;) { 1159 LD_SB4(src, src_stride, src7, src8, src9, src10); 1160 src += (4 * src_stride); 1161 XORI_B4_128_SB(src7, src8, src9, src10); 1162 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1163 src76_r, src87_r, src98_r, src109_r); 1164 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1165 src76_l, src87_l, src98_l, src109_l); 1166 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998); 1167 1168 dst0_r = const_vec; 1169 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1170 filt0, filt1, filt2, filt3, 1171 dst0_r, dst0_r, dst0_r, dst0_r); 1172 dst1_r = const_vec; 1173 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1174 filt0, filt1, filt2, filt3, 1175 dst1_r, dst1_r, dst1_r, dst1_r); 1176 dst2_r = const_vec; 1177 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1178 filt0, filt1, filt2, filt3, 1179 dst2_r, dst2_r, dst2_r, dst2_r); 1180 dst3_r = const_vec; 1181 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1182 filt0, filt1, filt2, filt3, 1183 dst3_r, dst3_r, dst3_r, dst3_r); 1184 dst0_l = const_vec; 1185 DPADD_SB4_SH(src2110, src4332, src6554, src8776, 1186 filt0, filt1, filt2, filt3, 1187 dst0_l, dst0_l, dst0_l, dst0_l); 1188 dst1_l = const_vec; 1189 DPADD_SB4_SH(src4332, src6554, src8776, src10998, 1190 filt0, filt1, filt2, filt3, 1191 dst1_l, dst1_l, dst1_l, dst1_l); 1192 1193 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 1194 ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride); 1195 dst += (4 * dst_stride); 1196 1197 src10_r = src54_r; 1198 src32_r = src76_r; 1199 src54_r = src98_r; 1200 src21_r = src65_r; 1201 src43_r = src87_r; 1202 src65_r = src109_r; 1203 src2110 = src6554; 1204 src4332 = src8776; 1205 src6554 = src10998; 1206 src6 = src10; 1207 } 1208} 1209 1210static void hevc_vt_8t_16multx4mult_msa(uint8_t *src, 1211 int32_t src_stride, 1212 int16_t *dst, 1213 int32_t dst_stride, 1214 const int8_t *filter, 1215 int32_t height, 1216 int32_t width) 1217{ 1218 uint8_t *src_tmp; 1219 int16_t *dst_tmp; 1220 int32_t loop_cnt, cnt; 1221 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1222 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 1223 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 1224 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 1225 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; 1226 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; 1227 v8i16 dst0_l, dst1_l, dst2_l, dst3_l; 1228 v8i16 filter_vec, const_vec; 1229 v8i16 filt0, filt1, filt2, filt3; 1230 1231 src -= (3 * src_stride); 1232 const_vec = __msa_ldi_h(128); 1233 const_vec <<= 6; 1234 1235 filter_vec = LD_SH(filter); 1236 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1237 1238 for (cnt = width >> 4; cnt--;) { 1239 src_tmp = src; 1240 dst_tmp = dst; 1241 1242 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1243 src_tmp += (7 * src_stride); 1244 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1245 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1246 src10_r, src32_r, src54_r, src21_r); 1247 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); 1248 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, 1249 src10_l, src32_l, src54_l, src21_l); 1250 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); 1251 1252 for (loop_cnt = (height >> 2); loop_cnt--;) { 1253 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); 1254 src_tmp += (4 * src_stride); 1255 XORI_B4_128_SB(src7, src8, src9, src10); 1256 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1257 src76_r, src87_r, src98_r, src109_r); 1258 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 1259 src76_l, src87_l, src98_l, src109_l); 1260 1261 dst0_r = const_vec; 1262 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r, 1263 filt0, filt1, filt2, filt3, 1264 dst0_r, dst0_r, dst0_r, dst0_r); 1265 dst1_r = const_vec; 1266 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r, 1267 filt0, filt1, filt2, filt3, 1268 dst1_r, dst1_r, dst1_r, dst1_r); 1269 dst2_r = const_vec; 1270 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r, 1271 filt0, filt1, filt2, filt3, 1272 dst2_r, dst2_r, dst2_r, dst2_r); 1273 dst3_r = const_vec; 1274 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r, 1275 filt0, filt1, filt2, filt3, 1276 dst3_r, dst3_r, dst3_r, dst3_r); 1277 dst0_l = const_vec; 1278 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l, 1279 filt0, filt1, filt2, filt3, 1280 dst0_l, dst0_l, dst0_l, dst0_l); 1281 dst1_l = const_vec; 1282 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l, 1283 filt0, filt1, filt2, filt3, 1284 dst1_l, dst1_l, dst1_l, dst1_l); 1285 dst2_l = const_vec; 1286 DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l, 1287 filt0, filt1, filt2, filt3, 1288 dst2_l, dst2_l, dst2_l, dst2_l); 1289 dst3_l = const_vec; 1290 DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l, 1291 filt0, filt1, filt2, filt3, 1292 dst3_l, dst3_l, dst3_l, dst3_l); 1293 1294 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); 1295 ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride); 1296 dst_tmp += (4 * dst_stride); 1297 1298 src10_r = src54_r; 1299 src32_r = src76_r; 1300 src54_r = src98_r; 1301 src21_r = src65_r; 1302 src43_r = src87_r; 1303 src65_r = src109_r; 1304 src10_l = src54_l; 1305 src32_l = src76_l; 1306 src54_l = src98_l; 1307 src21_l = src65_l; 1308 src43_l = src87_l; 1309 src65_l = src109_l; 1310 src6 = src10; 1311 } 1312 1313 src += 16; 1314 dst += 16; 1315 } 1316} 1317 1318static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, 1319 int16_t *dst, int32_t dst_stride, 1320 const int8_t *filter, int32_t height) 1321{ 1322 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1323 filter, height, 16); 1324} 1325 1326static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, 1327 int16_t *dst, int32_t dst_stride, 1328 const int8_t *filter, int32_t height) 1329{ 1330 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1331 filter, height, 16); 1332 hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, 1333 filter, height); 1334} 1335 1336static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, 1337 int16_t *dst, int32_t dst_stride, 1338 const int8_t *filter, int32_t height) 1339{ 1340 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1341 filter, height, 32); 1342} 1343 1344static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, 1345 int16_t *dst, int32_t dst_stride, 1346 const int8_t *filter, int32_t height) 1347{ 1348 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1349 filter, height, 48); 1350} 1351 1352static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, 1353 int16_t *dst, int32_t dst_stride, 1354 const int8_t *filter, int32_t height) 1355{ 1356 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride, 1357 filter, height, 64); 1358} 1359 1360static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride, 1361 int16_t *dst, int32_t dst_stride, 1362 const int8_t *filter_x, const int8_t *filter_y, 1363 int32_t height) 1364{ 1365 uint32_t loop_cnt; 1366 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1367 v8i16 filt0, filt1, filt2, filt3; 1368 v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1369 v16i8 mask1, mask2, mask3; 1370 v8i16 filter_vec, const_vec; 1371 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1372 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1373 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1374 v4i32 dst0_r, dst1_r, dst2_r, dst3_r; 1375 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 1376 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 1377 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1378 1379 src -= ((3 * src_stride) + 3); 1380 filter_vec = LD_SH(filter_x); 1381 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1382 1383 filter_vec = LD_SH(filter_y); 1384 UNPCK_R_SB_SH(filter_vec, filter_vec); 1385 1386 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1387 1388 mask1 = mask0 + 2; 1389 mask2 = mask0 + 4; 1390 mask3 = mask0 + 6; 1391 1392 const_vec = __msa_ldi_h(128); 1393 const_vec <<= 6; 1394 1395 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1396 src += (7 * src_stride); 1397 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1398 1399 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1400 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1401 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3, 1402 vec8, vec9, vec10, vec11); 1403 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3, 1404 vec12, vec13, vec14, vec15); 1405 dst30 = const_vec; 1406 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1407 dst30, dst30, dst30, dst30); 1408 dst41 = const_vec; 1409 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1410 dst41, dst41, dst41, dst41); 1411 dst52 = const_vec; 1412 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1413 dst52, dst52, dst52, dst52); 1414 dst63 = const_vec; 1415 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, 1416 dst63, dst63, dst63, dst63); 1417 1418 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1419 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1420 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1421 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1422 1423 for (loop_cnt = height >> 2; loop_cnt--;) { 1424 LD_SB4(src, src_stride, src7, src8, src9, src10); 1425 src += (4 * src_stride); 1426 XORI_B4_128_SB(src7, src8, src9, src10); 1427 1428 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3, 1429 vec0, vec1, vec2, vec3); 1430 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3, 1431 vec4, vec5, vec6, vec7); 1432 dst97 = const_vec; 1433 dst108 = const_vec; 1434 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1435 dst97, dst97, dst97, dst97); 1436 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1437 dst108, dst108, dst108, dst108); 1438 1439 dst76_r = __msa_ilvr_h(dst97, dst66); 1440 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1441 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1442 dst98_r = __msa_ilvr_h(dst66, dst108); 1443 1444 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1445 filt_h0, filt_h1, filt_h2, filt_h3); 1446 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, 1447 filt_h0, filt_h1, filt_h2, filt_h3); 1448 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, 1449 filt_h0, filt_h1, filt_h2, filt_h3); 1450 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, 1451 filt_h0, filt_h1, filt_h2, filt_h3); 1452 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1453 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r); 1454 ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride); 1455 dst += (4 * dst_stride); 1456 1457 dst10_r = dst54_r; 1458 dst32_r = dst76_r; 1459 dst54_r = dst98_r; 1460 dst21_r = dst65_r; 1461 dst43_r = dst87_r; 1462 dst65_r = dst109_r; 1463 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1464 } 1465} 1466 1467static void hevc_hv_8t_8multx1mult_msa(uint8_t *src, 1468 int32_t src_stride, 1469 int16_t *dst, 1470 int32_t dst_stride, 1471 const int8_t *filter_x, 1472 const int8_t *filter_y, 1473 int32_t height, int32_t width) 1474{ 1475 uint32_t loop_cnt, cnt; 1476 uint8_t *src_tmp; 1477 int16_t *dst_tmp; 1478 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1479 v8i16 filt0, filt1, filt2, filt3; 1480 v8i16 filt_h0, filt_h1, filt_h2, filt_h3; 1481 v16i8 mask1, mask2, mask3; 1482 v8i16 filter_vec, const_vec; 1483 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1484 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1485 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1486 v4i32 dst0_r, dst0_l; 1487 v8i16 dst10_r, dst32_r, dst54_r, dst76_r; 1488 v8i16 dst10_l, dst32_l, dst54_l, dst76_l; 1489 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1490 1491 src -= ((3 * src_stride) + 3); 1492 filter_vec = LD_SH(filter_x); 1493 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1494 1495 filter_vec = LD_SH(filter_y); 1496 UNPCK_R_SB_SH(filter_vec, filter_vec); 1497 1498 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1499 1500 mask1 = mask0 + 2; 1501 mask2 = mask0 + 4; 1502 mask3 = mask0 + 6; 1503 1504 const_vec = __msa_ldi_h(128); 1505 const_vec <<= 6; 1506 1507 for (cnt = width >> 3; cnt--;) { 1508 src_tmp = src; 1509 dst_tmp = dst; 1510 1511 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1512 src_tmp += (7 * src_stride); 1513 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1514 1515 /* row 0 row 1 row 2 row 3 */ 1516 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, 1517 vec0, vec1, vec2, vec3); 1518 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, 1519 vec4, vec5, vec6, vec7); 1520 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, 1521 vec8, vec9, vec10, vec11); 1522 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, 1523 vec12, vec13, vec14, vec15); 1524 dst0 = const_vec; 1525 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1526 dst0, dst0, dst0, dst0); 1527 dst1 = const_vec; 1528 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1529 dst1, dst1, dst1, dst1); 1530 dst2 = const_vec; 1531 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1532 dst2, dst2, dst2, dst2); 1533 dst3 = const_vec; 1534 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, 1535 dst3, dst3, dst3, dst3); 1536 1537 /* row 4 row 5 row 6 */ 1538 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, 1539 vec0, vec1, vec2, vec3); 1540 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, 1541 vec4, vec5, vec6, vec7); 1542 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, 1543 vec8, vec9, vec10, vec11); 1544 dst4 = const_vec; 1545 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1546 dst4, dst4, dst4, dst4); 1547 dst5 = const_vec; 1548 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, 1549 dst5, dst5, dst5, dst5); 1550 dst6 = const_vec; 1551 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, 1552 dst6, dst6, dst6, dst6); 1553 1554 for (loop_cnt = height; loop_cnt--;) { 1555 src7 = LD_SB(src_tmp); 1556 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1557 src_tmp += src_stride; 1558 1559 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, 1560 vec0, vec1, vec2, vec3); 1561 dst7 = const_vec; 1562 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, 1563 dst7, dst7, dst7, dst7); 1564 1565 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1566 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1567 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1568 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1569 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, 1570 filt_h0, filt_h1, filt_h2, filt_h3); 1571 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, 1572 filt_h0, filt_h1, filt_h2, filt_h3); 1573 dst0_r >>= 6; 1574 dst0_l >>= 6; 1575 1576 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 1577 ST_SW(dst0_r, dst_tmp); 1578 dst_tmp += dst_stride; 1579 1580 dst0 = dst1; 1581 dst1 = dst2; 1582 dst2 = dst3; 1583 dst3 = dst4; 1584 dst4 = dst5; 1585 dst5 = dst6; 1586 dst6 = dst7; 1587 } 1588 1589 src += 8; 1590 dst += 8; 1591 } 1592} 1593 1594static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride, 1595 int16_t *dst, int32_t dst_stride, 1596 const int8_t *filter_x, const int8_t *filter_y, 1597 int32_t height) 1598{ 1599 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1600 filter_x, filter_y, height, 8); 1601} 1602 1603static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride, 1604 int16_t *dst, int32_t dst_stride, 1605 const int8_t *filter_x, const int8_t *filter_y, 1606 int32_t height) 1607{ 1608 uint32_t loop_cnt; 1609 uint8_t *src_tmp; 1610 int16_t *dst_tmp; 1611 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1612 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7; 1613 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1614 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 1615 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3; 1616 v8i16 filter_vec, const_vec; 1617 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 1618 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108; 1619 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 1620 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 1621 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r; 1622 1623 src -= ((3 * src_stride) + 3); 1624 filter_vec = LD_SH(filter_x); 1625 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1626 1627 filter_vec = LD_SH(filter_y); 1628 UNPCK_R_SB_SH(filter_vec, filter_vec); 1629 1630 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); 1631 1632 mask0 = LD_SB(ff_hevc_mask_arr); 1633 mask1 = mask0 + 2; 1634 mask2 = mask0 + 4; 1635 mask3 = mask0 + 6; 1636 1637 const_vec = __msa_ldi_h(128); 1638 const_vec <<= 6; 1639 1640 src_tmp = src; 1641 dst_tmp = dst; 1642 1643 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); 1644 src_tmp += (7 * src_stride); 1645 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1646 1647 /* row 0 row 1 row 2 row 3 */ 1648 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1649 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1650 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1651 vec11); 1652 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14, 1653 vec15); 1654 dst0 = const_vec; 1655 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0, 1656 dst0, dst0); 1657 dst1 = const_vec; 1658 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1, 1659 dst1, dst1); 1660 dst2 = const_vec; 1661 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2, 1662 dst2, dst2, dst2); 1663 dst3 = const_vec; 1664 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3, 1665 dst3, dst3, dst3); 1666 1667 /* row 4 row 5 row 6 */ 1668 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); 1669 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); 1670 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10, 1671 vec11); 1672 dst4 = const_vec; 1673 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4, 1674 dst4, dst4); 1675 dst5 = const_vec; 1676 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5, 1677 dst5, dst5); 1678 dst6 = const_vec; 1679 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6, 1680 dst6, dst6, dst6); 1681 1682 for (loop_cnt = height; loop_cnt--;) { 1683 src7 = LD_SB(src_tmp); 1684 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128); 1685 src_tmp += src_stride; 1686 1687 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2, 1688 vec3); 1689 dst7 = const_vec; 1690 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7, 1691 dst7, dst7, dst7); 1692 1693 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 1694 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 1695 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 1696 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 1697 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1698 filt_h1, filt_h2, filt_h3); 1699 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0, 1700 filt_h1, filt_h2, filt_h3); 1701 dst0_r >>= 6; 1702 dst0_l >>= 6; 1703 1704 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); 1705 ST_SW(dst0_r, dst_tmp); 1706 dst_tmp += dst_stride; 1707 1708 dst0 = dst1; 1709 dst1 = dst2; 1710 dst2 = dst3; 1711 dst3 = dst4; 1712 dst4 = dst5; 1713 dst5 = dst6; 1714 dst6 = dst7; 1715 } 1716 1717 src += 8; 1718 dst += 8; 1719 1720 mask4 = LD_SB(ff_hevc_mask_arr + 16); 1721 mask5 = mask4 + 2; 1722 mask6 = mask4 + 4; 1723 mask7 = mask4 + 6; 1724 1725 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 1726 src += (7 * src_stride); 1727 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 1728 1729 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3); 1730 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7); 1731 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10, 1732 vec11); 1733 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14, 1734 vec15); 1735 dst30 = const_vec; 1736 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30, 1737 dst30, dst30, dst30); 1738 dst41 = const_vec; 1739 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41, 1740 dst41, dst41, dst41); 1741 dst52 = const_vec; 1742 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52, 1743 dst52, dst52, dst52); 1744 dst63 = const_vec; 1745 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63, 1746 dst63, dst63, dst63); 1747 1748 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r); 1749 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r); 1750 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r); 1751 1752 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); 1753 1754 for (loop_cnt = height >> 2; loop_cnt--;) { 1755 LD_SB4(src, src_stride, src7, src8, src9, src10); 1756 src += (4 * src_stride); 1757 XORI_B4_128_SB(src7, src8, src9, src10); 1758 1759 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2, 1760 vec3); 1761 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6, 1762 vec7); 1763 dst97 = const_vec; 1764 dst108 = const_vec; 1765 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97, 1766 dst97, dst97, dst97); 1767 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108, 1768 dst108, dst108, dst108); 1769 1770 dst76_r = __msa_ilvr_h(dst97, dst66); 1771 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r); 1772 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1); 1773 dst98_r = __msa_ilvr_h(dst66, dst108); 1774 1775 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0, 1776 filt_h1, filt_h2, filt_h3); 1777 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0, 1778 filt_h1, filt_h2, filt_h3); 1779 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0, 1780 filt_h1, filt_h2, filt_h3); 1781 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0, 1782 filt_h1, filt_h2, filt_h3); 1783 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 1784 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r); 1785 ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride); 1786 dst += (4 * dst_stride); 1787 1788 dst10_r = dst54_r; 1789 dst32_r = dst76_r; 1790 dst54_r = dst98_r; 1791 dst21_r = dst65_r; 1792 dst43_r = dst87_r; 1793 dst65_r = dst109_r; 1794 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1); 1795 } 1796} 1797 1798static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride, 1799 int16_t *dst, int32_t dst_stride, 1800 const int8_t *filter_x, const int8_t *filter_y, 1801 int32_t height) 1802{ 1803 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1804 filter_x, filter_y, height, 16); 1805} 1806 1807static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride, 1808 int16_t *dst, int32_t dst_stride, 1809 const int8_t *filter_x, const int8_t *filter_y, 1810 int32_t height) 1811{ 1812 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1813 filter_x, filter_y, height, 24); 1814} 1815 1816static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride, 1817 int16_t *dst, int32_t dst_stride, 1818 const int8_t *filter_x, const int8_t *filter_y, 1819 int32_t height) 1820{ 1821 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1822 filter_x, filter_y, height, 32); 1823} 1824 1825static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride, 1826 int16_t *dst, int32_t dst_stride, 1827 const int8_t *filter_x, const int8_t *filter_y, 1828 int32_t height) 1829{ 1830 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1831 filter_x, filter_y, height, 48); 1832} 1833 1834static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride, 1835 int16_t *dst, int32_t dst_stride, 1836 const int8_t *filter_x, const int8_t *filter_y, 1837 int32_t height) 1838{ 1839 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride, 1840 filter_x, filter_y, height, 64); 1841} 1842 1843static void hevc_hz_4t_4x2_msa(uint8_t *src, 1844 int32_t src_stride, 1845 int16_t *dst, 1846 int32_t dst_stride, 1847 const int8_t *filter) 1848{ 1849 v8i16 filt0, filt1; 1850 v16i8 src0, src1; 1851 v16i8 mask1, vec0, vec1; 1852 v8i16 dst0; 1853 v8i16 filter_vec, const_vec; 1854 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1855 1856 src -= 1; 1857 1858 filter_vec = LD_SH(filter); 1859 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1860 1861 mask1 = mask0 + 2; 1862 1863 const_vec = __msa_ldi_h(128); 1864 const_vec <<= 6; 1865 1866 LD_SB2(src, src_stride, src0, src1); 1867 XORI_B2_128_SB(src0, src1); 1868 1869 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1870 dst0 = const_vec; 1871 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 1872 1873 ST_D2(dst0, 0, 1, dst, dst_stride); 1874} 1875 1876static void hevc_hz_4t_4x4_msa(uint8_t *src, 1877 int32_t src_stride, 1878 int16_t *dst, 1879 int32_t dst_stride, 1880 const int8_t *filter) 1881{ 1882 v8i16 filt0, filt1; 1883 v16i8 src0, src1, src2, src3; 1884 v16i8 mask1, vec0, vec1; 1885 v8i16 dst0, dst1; 1886 v8i16 filter_vec, const_vec; 1887 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1888 1889 src -= 1; 1890 1891 filter_vec = LD_SH(filter); 1892 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1893 1894 mask1 = mask0 + 2; 1895 1896 const_vec = __msa_ldi_h(128); 1897 const_vec <<= 6; 1898 1899 LD_SB4(src, src_stride, src0, src1, src2, src3); 1900 XORI_B4_128_SB(src0, src1, src2, src3); 1901 1902 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1903 dst0 = const_vec; 1904 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 1905 1906 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 1907 dst1 = const_vec; 1908 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 1909 1910 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); 1911} 1912 1913static void hevc_hz_4t_4x8multiple_msa(uint8_t *src, 1914 int32_t src_stride, 1915 int16_t *dst, 1916 int32_t dst_stride, 1917 const int8_t *filter, 1918 int32_t height) 1919{ 1920 uint32_t loop_cnt; 1921 v8i16 filt0, filt1; 1922 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1923 v16i8 mask1, vec0, vec1; 1924 v8i16 dst0, dst1, dst2, dst3; 1925 v8i16 filter_vec, const_vec; 1926 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 1927 1928 src -= 1; 1929 1930 filter_vec = LD_SH(filter); 1931 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 1932 1933 mask1 = mask0 + 2; 1934 1935 const_vec = __msa_ldi_h(128); 1936 const_vec <<= 6; 1937 1938 for (loop_cnt = (height >> 3); loop_cnt--;) { 1939 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 1940 src += (8 * src_stride); 1941 1942 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 1943 1944 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 1945 dst0 = const_vec; 1946 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 1947 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); 1948 dst1 = const_vec; 1949 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 1950 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1); 1951 dst2 = const_vec; 1952 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 1953 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1); 1954 dst3 = const_vec; 1955 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 1956 1957 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 1958 dst += (8 * dst_stride); 1959 } 1960} 1961 1962static void hevc_hz_4t_4w_msa(uint8_t *src, 1963 int32_t src_stride, 1964 int16_t *dst, 1965 int32_t dst_stride, 1966 const int8_t *filter, 1967 int32_t height) 1968{ 1969 if (2 == height) { 1970 hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 1971 } else if (4 == height) { 1972 hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); 1973 } else if (0 == height % 8) { 1974 hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride, 1975 filter, height); 1976 } 1977} 1978 1979static void hevc_hz_4t_6w_msa(uint8_t *src, 1980 int32_t src_stride, 1981 int16_t *dst, 1982 int32_t dst_stride, 1983 const int8_t *filter, 1984 int32_t height) 1985{ 1986 uint32_t loop_cnt; 1987 uint64_t dst_val0, dst_val1, dst_val2, dst_val3; 1988 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; 1989 v8i16 filt0, filt1, dst0, dst1, dst2, dst3; 1990 v16i8 src0, src1, src2, src3; 1991 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 1992 v16i8 mask1; 1993 v16i8 vec0, vec1; 1994 v8i16 filter_vec, const_vec; 1995 1996 src -= 1; 1997 1998 filter_vec = LD_SH(filter); 1999 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2000 2001 mask1 = mask0 + 2; 2002 2003 const_vec = __msa_ldi_h(128); 2004 const_vec <<= 6; 2005 2006 for (loop_cnt = 2; loop_cnt--;) { 2007 LD_SB4(src, src_stride, src0, src1, src2, src3); 2008 src += (4 * src_stride); 2009 2010 XORI_B4_128_SB(src0, src1, src2, src3); 2011 2012 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2013 dst0 = const_vec; 2014 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2015 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2016 dst1 = const_vec; 2017 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2018 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2019 dst2 = const_vec; 2020 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2021 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2022 dst3 = const_vec; 2023 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2024 2025 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0); 2026 dst_val1 = __msa_copy_u_d((v2i64) dst1, 0); 2027 dst_val2 = __msa_copy_u_d((v2i64) dst2, 0); 2028 dst_val3 = __msa_copy_u_d((v2i64) dst3, 0); 2029 2030 dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2); 2031 dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2); 2032 dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2); 2033 dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2); 2034 2035 SD(dst_val0, dst); 2036 SW(dst_val_int0, dst + 4); 2037 dst += dst_stride; 2038 SD(dst_val1, dst); 2039 SW(dst_val_int1, dst + 4); 2040 dst += dst_stride; 2041 SD(dst_val2, dst); 2042 SW(dst_val_int2, dst + 4); 2043 dst += dst_stride; 2044 SD(dst_val3, dst); 2045 SW(dst_val_int3, dst + 4); 2046 dst += dst_stride; 2047 } 2048} 2049 2050static void hevc_hz_4t_8x2multiple_msa(uint8_t *src, 2051 int32_t src_stride, 2052 int16_t *dst, 2053 int32_t dst_stride, 2054 const int8_t *filter, 2055 int32_t height) 2056{ 2057 uint32_t loop_cnt; 2058 v8i16 filt0, filt1, dst0, dst1; 2059 v16i8 src0, src1; 2060 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2061 v16i8 mask1; 2062 v16i8 vec0, vec1; 2063 v8i16 filter_vec, const_vec; 2064 2065 src -= 1; 2066 2067 filter_vec = LD_SH(filter); 2068 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2069 2070 mask1 = mask0 + 2; 2071 2072 const_vec = __msa_ldi_h(128); 2073 const_vec <<= 6; 2074 2075 for (loop_cnt = (height >> 1); loop_cnt--;) { 2076 LD_SB2(src, src_stride, src0, src1); 2077 src += (2 * src_stride); 2078 2079 XORI_B2_128_SB(src0, src1); 2080 2081 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2082 dst0 = const_vec; 2083 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2084 2085 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2086 dst1 = const_vec; 2087 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2088 2089 ST_SH2(dst0, dst1, dst, dst_stride); 2090 dst += (2 * dst_stride); 2091 } 2092} 2093 2094static void hevc_hz_4t_8x4multiple_msa(uint8_t *src, 2095 int32_t src_stride, 2096 int16_t *dst, 2097 int32_t dst_stride, 2098 const int8_t *filter, 2099 int32_t height) 2100{ 2101 uint32_t loop_cnt; 2102 v8i16 filt0, filt1; 2103 v16i8 src0, src1, src2, src3; 2104 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2105 v16i8 mask1; 2106 v16i8 vec0, vec1; 2107 v8i16 dst0, dst1, dst2, dst3; 2108 v8i16 filter_vec, const_vec; 2109 2110 src -= 1; 2111 2112 filter_vec = LD_SH(filter); 2113 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2114 2115 mask1 = mask0 + 2; 2116 2117 const_vec = __msa_ldi_h(128); 2118 const_vec <<= 6; 2119 2120 for (loop_cnt = (height >> 2); loop_cnt--;) { 2121 LD_SB4(src, src_stride, src0, src1, src2, src3); 2122 src += (4 * src_stride); 2123 2124 XORI_B4_128_SB(src0, src1, src2, src3); 2125 2126 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2127 dst0 = const_vec; 2128 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2129 2130 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2131 dst1 = const_vec; 2132 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2133 2134 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2135 dst2 = const_vec; 2136 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2137 2138 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2139 dst3 = const_vec; 2140 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2141 2142 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 2143 dst += (4 * dst_stride); 2144 } 2145} 2146 2147static void hevc_hz_4t_8w_msa(uint8_t *src, 2148 int32_t src_stride, 2149 int16_t *dst, 2150 int32_t dst_stride, 2151 const int8_t *filter, 2152 int32_t height) 2153{ 2154 if (2 == height || 6 == height) { 2155 hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride, 2156 filter, height); 2157 } else { 2158 hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, 2159 filter, height); 2160 } 2161} 2162 2163static void hevc_hz_4t_12w_msa(uint8_t *src, 2164 int32_t src_stride, 2165 int16_t *dst, 2166 int32_t dst_stride, 2167 const int8_t *filter, 2168 int32_t height) 2169{ 2170 uint32_t loop_cnt; 2171 v8i16 filt0, filt1; 2172 v16i8 src0, src1, src2, src3; 2173 v16i8 mask1; 2174 v16i8 vec0, vec1; 2175 v8i16 dst0, dst1, dst2, dst3, dst4, dst5; 2176 v8i16 filter_vec, const_vec; 2177 v16i8 mask3; 2178 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2179 v16i8 mask2 = { 2180 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 2181 }; 2182 2183 src -= 1; 2184 2185 filter_vec = LD_SH(filter); 2186 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2187 2188 mask1 = mask0 + 2; 2189 mask3 = mask2 + 2; 2190 2191 const_vec = __msa_ldi_h(128); 2192 const_vec <<= 6; 2193 2194 for (loop_cnt = (height >> 2); loop_cnt--;) { 2195 LD_SB4(src, src_stride, src0, src1, src2, src3); 2196 src += (4 * src_stride); 2197 XORI_B4_128_SB(src0, src1, src2, src3); 2198 2199 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2200 dst0 = const_vec; 2201 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2202 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2203 dst1 = const_vec; 2204 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2205 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2206 dst2 = const_vec; 2207 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2208 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2209 dst3 = const_vec; 2210 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2211 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 2212 dst4 = const_vec; 2213 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); 2214 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); 2215 dst5 = const_vec; 2216 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); 2217 2218 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); 2219 ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride); 2220 dst += (4 * dst_stride); 2221 } 2222} 2223 2224static void hevc_hz_4t_16w_msa(uint8_t *src, 2225 int32_t src_stride, 2226 int16_t *dst, 2227 int32_t dst_stride, 2228 const int8_t *filter, 2229 int32_t height) 2230{ 2231 uint32_t loop_cnt; 2232 v16i8 src0, src1, src2, src3; 2233 v16i8 src4, src5, src6, src7; 2234 v8i16 filt0, filt1; 2235 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2236 v16i8 mask1; 2237 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2238 v16i8 vec0, vec1; 2239 v8i16 filter_vec, const_vec; 2240 2241 src -= 1; 2242 2243 filter_vec = LD_SH(filter); 2244 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2245 2246 mask1 = mask0 + 2; 2247 2248 const_vec = __msa_ldi_h(128); 2249 const_vec <<= 6; 2250 2251 for (loop_cnt = (height >> 2); loop_cnt--;) { 2252 LD_SB4(src, src_stride, src0, src2, src4, src6); 2253 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 2254 src += (4 * src_stride); 2255 2256 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2257 2258 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2259 dst0 = const_vec; 2260 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2261 2262 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2263 dst1 = const_vec; 2264 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2265 2266 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2267 dst2 = const_vec; 2268 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2269 2270 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2271 dst3 = const_vec; 2272 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2273 2274 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2275 dst4 = const_vec; 2276 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4); 2277 2278 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 2279 dst5 = const_vec; 2280 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5); 2281 2282 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 2283 dst6 = const_vec; 2284 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6); 2285 2286 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 2287 dst7 = const_vec; 2288 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7); 2289 2290 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride); 2291 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride); 2292 dst += (4 * dst_stride); 2293 } 2294} 2295 2296static void hevc_hz_4t_24w_msa(uint8_t *src, 2297 int32_t src_stride, 2298 int16_t *dst, 2299 int32_t dst_stride, 2300 const int8_t *filter, 2301 int32_t height) 2302{ 2303 uint32_t loop_cnt; 2304 int16_t *dst_tmp = dst + 16; 2305 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 2306 v8i16 filt0, filt1; 2307 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2308 v16i8 mask1, mask00, mask11; 2309 v16i8 vec0, vec1; 2310 v8i16 dst0, dst1, dst2, dst3; 2311 v8i16 filter_vec, const_vec; 2312 2313 src -= 1; 2314 2315 filter_vec = LD_SH(filter); 2316 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2317 2318 mask1 = mask0 + 2; 2319 mask00 = mask0 + 8; 2320 mask11 = mask0 + 10; 2321 2322 const_vec = __msa_ldi_h(128); 2323 const_vec <<= 6; 2324 2325 for (loop_cnt = (height >> 2); loop_cnt--;) { 2326 /* 16 width */ 2327 LD_SB4(src, src_stride, src0, src2, src4, src6); 2328 LD_SB4(src + 16, src_stride, src1, src3, src5, src7); 2329 src += (4 * src_stride); 2330 2331 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); 2332 2333 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 2334 dst0 = const_vec; 2335 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2336 2337 VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1); 2338 dst1 = const_vec; 2339 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2340 2341 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1); 2342 dst2 = const_vec; 2343 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2344 2345 VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1); 2346 dst3 = const_vec; 2347 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2348 2349 ST_SH2(dst0, dst1, dst, 8); 2350 dst += dst_stride; 2351 ST_SH2(dst2, dst3, dst, 8); 2352 dst += dst_stride; 2353 2354 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1); 2355 dst0 = const_vec; 2356 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2357 2358 VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1); 2359 dst1 = const_vec; 2360 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2361 2362 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1); 2363 dst2 = const_vec; 2364 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2365 2366 VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1); 2367 dst3 = const_vec; 2368 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2369 2370 ST_SH2(dst0, dst1, dst, 8); 2371 dst += dst_stride; 2372 ST_SH2(dst2, dst3, dst, 8); 2373 dst += dst_stride; 2374 2375 /* 8 width */ 2376 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); 2377 dst0 = const_vec; 2378 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 2379 2380 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 2381 dst1 = const_vec; 2382 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1); 2383 2384 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1); 2385 dst2 = const_vec; 2386 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2); 2387 2388 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 2389 dst3 = const_vec; 2390 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 2391 2392 ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); 2393 dst_tmp += (4 * dst_stride); 2394 } 2395} 2396 2397static void hevc_hz_4t_32w_msa(uint8_t *src, 2398 int32_t src_stride, 2399 int16_t *dst, 2400 int32_t dst_stride, 2401 const int8_t *filter, 2402 int32_t height) 2403{ 2404 uint32_t loop_cnt; 2405 v16i8 src0, src1, src2; 2406 v8i16 filt0, filt1; 2407 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 2408 v16i8 mask1, mask2, mask3; 2409 v8i16 dst0, dst1, dst2, dst3; 2410 v16i8 vec0, vec1, vec2, vec3; 2411 v8i16 filter_vec, const_vec; 2412 2413 src -= 1; 2414 2415 filter_vec = LD_SH(filter); 2416 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2417 2418 const_vec = __msa_ldi_h(128); 2419 const_vec <<= 6; 2420 2421 mask1 = mask0 + 2; 2422 mask2 = mask0 + 8; 2423 mask3 = mask0 + 10; 2424 2425 for (loop_cnt = height; loop_cnt--;) { 2426 LD_SB2(src, 16, src0, src1); 2427 src2 = LD_SB(src + 24); 2428 src += src_stride; 2429 2430 XORI_B3_128_SB(src0, src1, src2); 2431 2432 dst0 = const_vec; 2433 dst1 = const_vec; 2434 dst2 = const_vec; 2435 dst3 = const_vec; 2436 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1); 2437 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3); 2438 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, 2439 dst1, dst2, dst3); 2440 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1); 2441 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3); 2442 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, 2443 dst1, dst2, dst3); 2444 ST_SH4(dst0, dst1, dst2, dst3, dst, 8); 2445 dst += dst_stride; 2446 } 2447} 2448 2449static void hevc_vt_4t_4x2_msa(uint8_t *src, 2450 int32_t src_stride, 2451 int16_t *dst, 2452 int32_t dst_stride, 2453 const int8_t *filter) 2454{ 2455 v16i8 src0, src1, src2, src3, src4; 2456 v16i8 src10_r, src32_r, src21_r, src43_r; 2457 v16i8 src2110, src4332; 2458 v8i16 dst10; 2459 v8i16 filt0, filt1; 2460 v8i16 filter_vec, const_vec; 2461 2462 src -= src_stride; 2463 2464 const_vec = __msa_ldi_h(128); 2465 const_vec <<= 6; 2466 2467 filter_vec = LD_SH(filter); 2468 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2469 2470 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 2471 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 2472 src10_r, src21_r, src32_r, src43_r); 2473 2474 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 2475 XORI_B2_128_SB(src2110, src4332); 2476 dst10 = const_vec; 2477 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2478 2479 ST_D2(dst10, 0, 1, dst, dst_stride); 2480} 2481 2482static void hevc_vt_4t_4x4_msa(uint8_t *src, 2483 int32_t src_stride, 2484 int16_t *dst, 2485 int32_t dst_stride, 2486 const int8_t *filter, 2487 int32_t height) 2488{ 2489 v16i8 src0, src1, src2, src3, src4, src5, src6; 2490 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; 2491 v16i8 src2110, src4332, src6554; 2492 v8i16 dst10, dst32; 2493 v8i16 filt0, filt1; 2494 v8i16 filter_vec, const_vec; 2495 2496 src -= src_stride; 2497 2498 const_vec = __msa_ldi_h(128); 2499 const_vec <<= 6; 2500 2501 filter_vec = LD_SH(filter); 2502 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2503 2504 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 2505 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, 2506 src10_r, src21_r, src32_r, src43_r); 2507 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2508 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 2509 src2110, src4332, src6554); 2510 XORI_B3_128_SB(src2110, src4332, src6554); 2511 dst10 = const_vec; 2512 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2513 dst32 = const_vec; 2514 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2515 2516 ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride); 2517} 2518 2519static void hevc_vt_4t_4x8_msa(uint8_t *src, 2520 int32_t src_stride, 2521 int16_t *dst, 2522 int32_t dst_stride, 2523 const int8_t *filter, 2524 int32_t height) 2525{ 2526 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2527 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; 2528 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; 2529 v16i8 src2110, src4332, src6554, src8776, src10998; 2530 v8i16 dst10, dst32, dst54, dst76; 2531 v8i16 filt0, filt1; 2532 v8i16 filter_vec, const_vec; 2533 2534 src -= src_stride; 2535 const_vec = __msa_ldi_h(128); 2536 const_vec <<= 6; 2537 2538 filter_vec = LD_SH(filter); 2539 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2540 2541 LD_SB3(src, src_stride, src0, src1, src2); 2542 src += (3 * src_stride); 2543 2544 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2545 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2546 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2547 2548 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2549 src += (8 * src_stride); 2550 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, 2551 src32_r, src43_r, src54_r, src65_r); 2552 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, 2553 src76_r, src87_r, src98_r, src109_r); 2554 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2555 src98_r, src4332, src6554, src8776, src10998); 2556 XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2557 dst10 = const_vec; 2558 dst32 = const_vec; 2559 dst54 = const_vec; 2560 dst76 = const_vec; 2561 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2562 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2563 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2564 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2565 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2566} 2567 2568static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride, 2569 int16_t *dst, int32_t dst_stride, 2570 const int8_t *filter, int32_t height) 2571{ 2572 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 2573 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 2574 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; 2575 v16i8 src10998; 2576 v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec; 2577 2578 src -= src_stride; 2579 const_vec = __msa_ldi_h(128); 2580 const_vec <<= 6; 2581 2582 filter_vec = LD_SH(filter); 2583 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2584 2585 LD_SB3(src, src_stride, src0, src1, src2); 2586 src += (3 * src_stride); 2587 2588 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2589 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); 2590 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); 2591 2592 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2593 src += (8 * src_stride); 2594 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, 2595 src54_r, src65_r); 2596 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 2597 src87_r, src98_r, src109_r); 2598 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2599 src98_r, src4332, src6554, src8776, src10998); 2600 XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2601 2602 dst10 = const_vec; 2603 dst32 = const_vec; 2604 dst54 = const_vec; 2605 dst76 = const_vec; 2606 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2607 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2608 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2609 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2610 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2611 dst += (8 * dst_stride); 2612 2613 src2 = src10; 2614 src2110 = src10998; 2615 2616 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 2617 src += (8 * src_stride); 2618 2619 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r, 2620 src54_r, src65_r); 2621 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, 2622 src87_r, src98_r, src109_r); 2623 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r, 2624 src98_r, src4332, src6554, src8776, src10998); 2625 XORI_B4_128_SB(src4332, src6554, src8776, src10998); 2626 2627 dst10 = const_vec; 2628 dst32 = const_vec; 2629 dst54 = const_vec; 2630 dst76 = const_vec; 2631 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); 2632 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32); 2633 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54); 2634 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76); 2635 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 2636} 2637 2638static void hevc_vt_4t_4w_msa(uint8_t *src, 2639 int32_t src_stride, 2640 int16_t *dst, 2641 int32_t dst_stride, 2642 const int8_t *filter, 2643 int32_t height) 2644{ 2645 if (2 == height) { 2646 hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); 2647 } else if (4 == height) { 2648 hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height); 2649 } else if (8 == height) { 2650 hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height); 2651 } else if (16 == height) { 2652 hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height); 2653 } 2654} 2655 2656static void hevc_vt_4t_6w_msa(uint8_t *src, 2657 int32_t src_stride, 2658 int16_t *dst, 2659 int32_t dst_stride, 2660 const int8_t *filter, 2661 int32_t height) 2662{ 2663 int32_t loop_cnt; 2664 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3; 2665 uint64_t dst_val0, dst_val1, dst_val2, dst_val3; 2666 v16i8 src0, src1, src2, src3, src4; 2667 v16i8 src10_r, src32_r, src21_r, src43_r; 2668 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2669 v8i16 filt0, filt1; 2670 v8i16 filter_vec, const_vec; 2671 2672 src -= src_stride; 2673 const_vec = __msa_ldi_h(128); 2674 const_vec <<= 6; 2675 2676 filter_vec = LD_SH(filter); 2677 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2678 2679 LD_SB3(src, src_stride, src0, src1, src2); 2680 src += (3 * src_stride); 2681 XORI_B3_128_SB(src0, src1, src2); 2682 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2683 2684 for (loop_cnt = (height >> 2); loop_cnt--;) { 2685 LD_SB2(src, src_stride, src3, src4); 2686 src += (2 * src_stride); 2687 XORI_B2_128_SB(src3, src4); 2688 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2689 2690 dst0_r = const_vec; 2691 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2692 dst1_r = const_vec; 2693 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2694 2695 LD_SB2(src, src_stride, src1, src2); 2696 src += (2 * src_stride); 2697 XORI_B2_128_SB(src1, src2); 2698 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 2699 2700 dst2_r = const_vec; 2701 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r); 2702 dst3_r = const_vec; 2703 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r); 2704 2705 dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0); 2706 dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0); 2707 dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0); 2708 dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0); 2709 2710 dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2); 2711 dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2); 2712 dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2); 2713 dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2); 2714 2715 SD(dst_val0, dst); 2716 SW(dst_val_int0, dst + 4); 2717 dst += dst_stride; 2718 SD(dst_val1, dst); 2719 SW(dst_val_int1, dst + 4); 2720 dst += dst_stride; 2721 SD(dst_val2, dst); 2722 SW(dst_val_int2, dst + 4); 2723 dst += dst_stride; 2724 SD(dst_val3, dst); 2725 SW(dst_val_int3, dst + 4); 2726 dst += dst_stride; 2727 } 2728} 2729 2730static void hevc_vt_4t_8x2_msa(uint8_t *src, 2731 int32_t src_stride, 2732 int16_t *dst, 2733 int32_t dst_stride, 2734 const int8_t *filter) 2735{ 2736 v16i8 src0, src1, src2, src3, src4; 2737 v16i8 src10_r, src32_r, src21_r, src43_r; 2738 v8i16 dst0_r, dst1_r; 2739 v8i16 filt0, filt1; 2740 v8i16 filter_vec, const_vec; 2741 2742 src -= src_stride; 2743 const_vec = __msa_ldi_h(128); 2744 const_vec <<= 6; 2745 2746 filter_vec = LD_SH(filter); 2747 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2748 2749 LD_SB3(src, src_stride, src0, src1, src2); 2750 src += (3 * src_stride); 2751 XORI_B3_128_SB(src0, src1, src2); 2752 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2753 2754 LD_SB2(src, src_stride, src3, src4); 2755 XORI_B2_128_SB(src3, src4); 2756 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2757 dst0_r = const_vec; 2758 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2759 dst1_r = const_vec; 2760 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2761 2762 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2763} 2764 2765static void hevc_vt_4t_8x6_msa(uint8_t *src, 2766 int32_t src_stride, 2767 int16_t *dst, 2768 int32_t dst_stride, 2769 const int8_t *filter) 2770{ 2771 v16i8 src0, src1, src2, src3, src4; 2772 v16i8 src10_r, src32_r, src21_r, src43_r; 2773 v8i16 dst0_r, dst1_r; 2774 v8i16 filt0, filt1; 2775 v8i16 filter_vec, const_vec; 2776 2777 src -= src_stride; 2778 const_vec = __msa_ldi_h(128); 2779 const_vec <<= 6; 2780 2781 filter_vec = LD_SH(filter); 2782 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2783 2784 LD_SB3(src, src_stride, src0, src1, src2); 2785 src += (3 * src_stride); 2786 XORI_B3_128_SB(src0, src1, src2); 2787 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2788 2789 LD_SB2(src, src_stride, src3, src4); 2790 src += (2 * src_stride); 2791 XORI_B2_128_SB(src3, src4); 2792 2793 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2794 dst0_r = const_vec; 2795 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2796 dst1_r = const_vec; 2797 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2798 2799 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2800 dst += (2 * dst_stride); 2801 2802 LD_SB2(src, src_stride, src1, src2); 2803 src += (2 * src_stride); 2804 XORI_B2_128_SB(src1, src2); 2805 2806 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); 2807 dst0_r = const_vec; 2808 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 2809 dst1_r = const_vec; 2810 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 2811 2812 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2813 dst += (2 * dst_stride); 2814 2815 LD_SB2(src, src_stride, src3, src4); 2816 XORI_B2_128_SB(src3, src4); 2817 2818 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2819 dst0_r = const_vec; 2820 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2821 dst1_r = const_vec; 2822 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2823 2824 ST_SH2(dst0_r, dst1_r, dst, dst_stride); 2825} 2826 2827static void hevc_vt_4t_8x4multiple_msa(uint8_t *src, 2828 int32_t src_stride, 2829 int16_t *dst, 2830 int32_t dst_stride, 2831 const int8_t *filter, 2832 int32_t height) 2833{ 2834 int32_t loop_cnt; 2835 v16i8 src0, src1, src2, src3, src4, src5, src6; 2836 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r; 2837 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2838 v8i16 filt0, filt1; 2839 v8i16 filter_vec, const_vec; 2840 2841 src -= src_stride; 2842 const_vec = __msa_ldi_h(128); 2843 const_vec <<= 6; 2844 2845 filter_vec = LD_SH(filter); 2846 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2847 2848 LD_SB3(src, src_stride, src0, src1, src2); 2849 src += (3 * src_stride); 2850 XORI_B3_128_SB(src0, src1, src2); 2851 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2852 2853 for (loop_cnt = (height >> 2); loop_cnt--;) { 2854 LD_SB4(src, src_stride, src3, src4, src5, src6); 2855 src += (4 * src_stride); 2856 XORI_B4_128_SB(src3, src4, src5, src6); 2857 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2858 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2859 dst0_r = const_vec; 2860 dst1_r = const_vec; 2861 dst2_r = const_vec; 2862 dst3_r = const_vec; 2863 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2864 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2865 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 2866 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); 2867 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 2868 dst += (4 * dst_stride); 2869 2870 src2 = src6; 2871 src10_r = src54_r; 2872 src21_r = src65_r; 2873 } 2874} 2875 2876static void hevc_vt_4t_8w_msa(uint8_t *src, 2877 int32_t src_stride, 2878 int16_t *dst, 2879 int32_t dst_stride, 2880 const int8_t *filter, 2881 int32_t height) 2882{ 2883 if (2 == height) { 2884 hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter); 2885 } else if (6 == height) { 2886 hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter); 2887 } else { 2888 hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride, 2889 filter, height); 2890 } 2891} 2892 2893static void hevc_vt_4t_12w_msa(uint8_t *src, 2894 int32_t src_stride, 2895 int16_t *dst, 2896 int32_t dst_stride, 2897 const int8_t *filter, 2898 int32_t height) 2899{ 2900 int32_t loop_cnt; 2901 v16i8 src0, src1, src2, src3, src4, src5, src6; 2902 v16i8 src10_r, src32_r, src21_r, src43_r; 2903 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 2904 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l; 2905 v16i8 src2110, src4332; 2906 v16i8 src54_r, src65_r, src6554; 2907 v8i16 dst0_l, dst1_l; 2908 v8i16 filt0, filt1; 2909 v8i16 filter_vec, const_vec; 2910 2911 src -= (1 * src_stride); 2912 const_vec = __msa_ldi_h(128); 2913 const_vec <<= 6; 2914 2915 filter_vec = LD_SH(filter); 2916 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2917 2918 LD_SB3(src, src_stride, src0, src1, src2); 2919 src += (3 * src_stride); 2920 XORI_B3_128_SB(src0, src1, src2); 2921 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2922 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2923 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); 2924 2925 for (loop_cnt = 4; loop_cnt--;) { 2926 LD_SB2(src, src_stride, src3, src4); 2927 src += (2 * src_stride); 2928 LD_SB2(src, src_stride, src5, src6); 2929 src += (2 * src_stride); 2930 XORI_B2_128_SB(src3, src4); 2931 XORI_B2_128_SB(src5, src6); 2932 2933 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2934 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2935 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l); 2936 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r); 2937 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l); 2938 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); 2939 2940 dst0_r = const_vec; 2941 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 2942 dst1_r = const_vec; 2943 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 2944 dst2_r = const_vec; 2945 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r); 2946 dst3_r = const_vec; 2947 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r); 2948 dst0_l = const_vec; 2949 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l); 2950 dst1_l = const_vec; 2951 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l); 2952 2953 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 2954 ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride); 2955 dst += (4 * dst_stride); 2956 2957 src2 = src6; 2958 src10_r = src54_r; 2959 src21_r = src65_r; 2960 src2110 = src6554; 2961 } 2962} 2963 2964static void hevc_vt_4t_16w_msa(uint8_t *src, 2965 int32_t src_stride, 2966 int16_t *dst, 2967 int32_t dst_stride, 2968 const int8_t *filter, 2969 int32_t height) 2970{ 2971 int32_t loop_cnt; 2972 v16i8 src0, src1, src2, src3, src4, src5; 2973 v16i8 src10_r, src32_r, src21_r, src43_r; 2974 v16i8 src10_l, src32_l, src21_l, src43_l; 2975 v8i16 dst0_r, dst1_r, dst0_l, dst1_l; 2976 v8i16 filt0, filt1; 2977 v8i16 filter_vec, const_vec; 2978 2979 src -= src_stride; 2980 const_vec = __msa_ldi_h(128); 2981 const_vec <<= 6; 2982 2983 filter_vec = LD_SH(filter); 2984 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 2985 2986 LD_SB3(src, src_stride, src0, src1, src2); 2987 src += (3 * src_stride); 2988 XORI_B3_128_SB(src0, src1, src2); 2989 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 2990 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 2991 2992 for (loop_cnt = (height >> 2); loop_cnt--;) { 2993 LD_SB2(src, src_stride, src3, src4); 2994 src += (2 * src_stride); 2995 XORI_B2_128_SB(src3, src4); 2996 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 2997 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 2998 dst0_r = const_vec; 2999 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3000 dst0_l = const_vec; 3001 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3002 dst1_r = const_vec; 3003 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3004 dst1_l = const_vec; 3005 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3006 ST_SH2(dst0_r, dst0_l, dst, 8); 3007 dst += dst_stride; 3008 ST_SH2(dst1_r, dst1_l, dst, 8); 3009 dst += dst_stride; 3010 3011 LD_SB2(src, src_stride, src5, src2); 3012 src += (2 * src_stride); 3013 XORI_B2_128_SB(src5, src2); 3014 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3015 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3016 dst0_r = const_vec; 3017 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3018 dst0_l = const_vec; 3019 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3020 dst1_r = const_vec; 3021 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3022 dst1_l = const_vec; 3023 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3024 ST_SH2(dst0_r, dst0_l, dst, 8); 3025 dst += dst_stride; 3026 ST_SH2(dst1_r, dst1_l, dst, 8); 3027 dst += dst_stride; 3028 } 3029} 3030 3031static void hevc_vt_4t_24w_msa(uint8_t *src, 3032 int32_t src_stride, 3033 int16_t *dst, 3034 int32_t dst_stride, 3035 const int8_t *filter, 3036 int32_t height) 3037{ 3038 int32_t loop_cnt; 3039 v16i8 src0, src1, src2, src3, src4, src5; 3040 v16i8 src6, src7, src8, src9, src10, src11; 3041 v16i8 src10_r, src32_r, src76_r, src98_r; 3042 v16i8 src21_r, src43_r, src87_r, src109_r; 3043 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3044 v16i8 src10_l, src32_l, src21_l, src43_l; 3045 v8i16 dst0_l, dst1_l; 3046 v8i16 filt0, filt1; 3047 v8i16 filter_vec, const_vec; 3048 3049 src -= src_stride; 3050 const_vec = __msa_ldi_h(128); 3051 const_vec <<= 6; 3052 3053 filter_vec = LD_SH(filter); 3054 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3055 3056 LD_SB3(src, src_stride, src0, src1, src2); 3057 XORI_B3_128_SB(src0, src1, src2); 3058 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3059 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3060 3061 LD_SB3(src + 16, src_stride, src6, src7, src8); 3062 src += (3 * src_stride); 3063 XORI_B3_128_SB(src6, src7, src8); 3064 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3065 3066 for (loop_cnt = (height >> 2); loop_cnt--;) { 3067 LD_SB2(src, src_stride, src3, src4); 3068 XORI_B2_128_SB(src3, src4); 3069 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3070 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3071 3072 LD_SB2(src + 16, src_stride, src9, src10); 3073 src += (2 * src_stride); 3074 XORI_B2_128_SB(src9, src10); 3075 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3076 3077 dst0_r = const_vec; 3078 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3079 dst0_l = const_vec; 3080 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3081 dst1_r = const_vec; 3082 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3083 dst1_l = const_vec; 3084 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3085 dst2_r = const_vec; 3086 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3087 dst3_r = const_vec; 3088 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3089 3090 ST_SH2(dst0_r, dst0_l, dst, 8); 3091 ST_SH(dst2_r, dst + 16); 3092 dst += dst_stride; 3093 ST_SH2(dst1_r, dst1_l, dst, 8); 3094 ST_SH(dst3_r, dst + 16); 3095 dst += dst_stride; 3096 3097 LD_SB2(src, src_stride, src5, src2); 3098 XORI_B2_128_SB(src5, src2); 3099 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3100 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3101 3102 LD_SB2(src + 16, src_stride, src11, src8); 3103 src += (2 * src_stride); 3104 XORI_B2_128_SB(src11, src8); 3105 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 3106 3107 dst0_r = const_vec; 3108 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3109 dst0_l = const_vec; 3110 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3111 dst1_r = const_vec; 3112 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3113 dst1_l = const_vec; 3114 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3115 dst2_r = const_vec; 3116 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); 3117 dst3_r = const_vec; 3118 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); 3119 3120 ST_SH2(dst0_r, dst0_l, dst, 8); 3121 ST_SH(dst2_r, dst + 16); 3122 dst += dst_stride; 3123 ST_SH2(dst1_r, dst1_l, dst, 8); 3124 ST_SH(dst3_r, dst + 16); 3125 dst += dst_stride; 3126 } 3127} 3128 3129static void hevc_vt_4t_32w_msa(uint8_t *src, 3130 int32_t src_stride, 3131 int16_t *dst, 3132 int32_t dst_stride, 3133 const int8_t *filter, 3134 int32_t height) 3135{ 3136 int32_t loop_cnt; 3137 v16i8 src0, src1, src2, src3, src4, src5; 3138 v16i8 src6, src7, src8, src9, src10, src11; 3139 v16i8 src10_r, src32_r, src76_r, src98_r; 3140 v16i8 src21_r, src43_r, src87_r, src109_r; 3141 v8i16 dst0_r, dst1_r, dst2_r, dst3_r; 3142 v16i8 src10_l, src32_l, src76_l, src98_l; 3143 v16i8 src21_l, src43_l, src87_l, src109_l; 3144 v8i16 dst0_l, dst1_l, dst2_l, dst3_l; 3145 v8i16 filt0, filt1; 3146 v8i16 filter_vec, const_vec; 3147 3148 src -= src_stride; 3149 const_vec = __msa_ldi_h(128); 3150 const_vec <<= 6; 3151 3152 filter_vec = LD_SH(filter); 3153 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3154 3155 LD_SB3(src, src_stride, src0, src1, src2); 3156 XORI_B3_128_SB(src0, src1, src2); 3157 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); 3158 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); 3159 3160 LD_SB3(src + 16, src_stride, src6, src7, src8); 3161 src += (3 * src_stride); 3162 XORI_B3_128_SB(src6, src7, src8); 3163 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); 3164 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l); 3165 3166 for (loop_cnt = (height >> 2); loop_cnt--;) { 3167 LD_SB2(src, src_stride, src3, src4); 3168 XORI_B2_128_SB(src3, src4); 3169 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); 3170 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l); 3171 3172 LD_SB2(src + 16, src_stride, src9, src10); 3173 src += (2 * src_stride); 3174 XORI_B2_128_SB(src9, src10); 3175 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r); 3176 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l); 3177 3178 dst0_r = const_vec; 3179 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r); 3180 dst0_l = const_vec; 3181 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l); 3182 dst1_r = const_vec; 3183 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r); 3184 dst1_l = const_vec; 3185 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l); 3186 dst2_r = const_vec; 3187 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r); 3188 dst2_l = const_vec; 3189 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l); 3190 dst3_r = const_vec; 3191 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r); 3192 dst3_l = const_vec; 3193 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l); 3194 3195 ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); 3196 dst += dst_stride; 3197 ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); 3198 dst += dst_stride; 3199 3200 LD_SB2(src, src_stride, src5, src2); 3201 XORI_B2_128_SB(src5, src2); 3202 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); 3203 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l); 3204 3205 LD_SB2(src + 16, src_stride, src11, src8); 3206 src += (2 * src_stride); 3207 XORI_B2_128_SB(src11, src8); 3208 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r); 3209 ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l); 3210 3211 dst0_r = const_vec; 3212 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r); 3213 dst0_l = const_vec; 3214 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l); 3215 dst1_r = const_vec; 3216 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r); 3217 dst1_l = const_vec; 3218 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l); 3219 dst2_r = const_vec; 3220 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r); 3221 dst2_l = const_vec; 3222 DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l); 3223 dst3_r = const_vec; 3224 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r); 3225 dst3_l = const_vec; 3226 DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l); 3227 3228 ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); 3229 dst += dst_stride; 3230 ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8); 3231 dst += dst_stride; 3232 } 3233} 3234 3235static void hevc_hv_4t_4x2_msa(uint8_t *src, 3236 int32_t src_stride, 3237 int16_t *dst, 3238 int32_t dst_stride, 3239 const int8_t *filter_x, 3240 const int8_t *filter_y) 3241{ 3242 v16i8 src0, src1, src2, src3, src4; 3243 v8i16 filt0, filt1; 3244 v8i16 filt_h0, filt_h1; 3245 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3246 v16i8 mask1; 3247 v8i16 filter_vec, const_vec; 3248 v16i8 vec0, vec1, vec2, vec3, vec4, vec5; 3249 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43; 3250 v4i32 dst0, dst1; 3251 3252 src -= (src_stride + 1); 3253 filter_vec = LD_SH(filter_x); 3254 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3255 3256 filter_vec = LD_SH(filter_y); 3257 UNPCK_R_SB_SH(filter_vec, filter_vec); 3258 3259 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3260 3261 mask1 = mask0 + 2; 3262 3263 const_vec = __msa_ldi_h(128); 3264 const_vec <<= 6; 3265 3266 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3267 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3268 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1); 3269 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3); 3270 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5); 3271 3272 dst20 = const_vec; 3273 dst31 = const_vec; 3274 dst42 = const_vec; 3275 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20); 3276 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31); 3277 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42); 3278 ILVRL_H2_SH(dst31, dst20, dst10, dst32); 3279 ILVRL_H2_SH(dst42, dst31, dst21, dst43); 3280 3281 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3282 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3283 dst0 >>= 6; 3284 dst1 >>= 6; 3285 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0); 3286 ST_D2(dst0, 0, 1, dst, dst_stride); 3287} 3288 3289static void hevc_hv_4t_4x4_msa(uint8_t *src, 3290 int32_t src_stride, 3291 int16_t *dst, 3292 int32_t dst_stride, 3293 const int8_t *filter_x, 3294 const int8_t *filter_y) 3295{ 3296 v16i8 src0, src1, src2, src3, src4, src5, src6; 3297 v8i16 filt0, filt1; 3298 v8i16 filt_h0, filt_h1; 3299 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3300 v16i8 mask1; 3301 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3302 v8i16 filter_vec, const_vec; 3303 v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65; 3304 v4i32 dst0, dst1, dst2, dst3; 3305 3306 src -= (src_stride + 1); 3307 3308 filter_vec = LD_SH(filter_x); 3309 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3310 3311 filter_vec = LD_SH(filter_y); 3312 UNPCK_R_SB_SH(filter_vec, filter_vec); 3313 3314 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3315 3316 mask1 = mask0 + 2; 3317 3318 const_vec = __msa_ldi_h(128); 3319 const_vec <<= 6; 3320 3321 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3322 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 3323 3324 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1); 3325 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3); 3326 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5); 3327 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7); 3328 3329 dst30 = const_vec; 3330 dst41 = const_vec; 3331 dst52 = const_vec; 3332 dst63 = const_vec; 3333 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30); 3334 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41); 3335 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52); 3336 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63); 3337 3338 ILVRL_H2_SH(dst41, dst30, dst10, dst43); 3339 ILVRL_H2_SH(dst52, dst41, dst21, dst54); 3340 ILVRL_H2_SH(dst63, dst52, dst32, dst65); 3341 3342 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1); 3343 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1); 3344 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1); 3345 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1); 3346 SRA_4V(dst0, dst1, dst2, dst3, 6); 3347 PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2); 3348 ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride); 3349} 3350 3351 3352static void hevc_hv_4t_4multx8mult_msa(uint8_t *src, 3353 int32_t src_stride, 3354 int16_t *dst, 3355 int32_t dst_stride, 3356 const int8_t *filter_x, 3357 const int8_t *filter_y, 3358 int32_t height) 3359{ 3360 uint32_t loop_cnt; 3361 v16i8 src0, src1, src2, src3, src4, src5, src6; 3362 v16i8 src7, src8, src9, src10; 3363 v8i16 filt0, filt1; 3364 v8i16 filt_h0, filt_h1; 3365 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16); 3366 v16i8 mask1; 3367 v8i16 filter_vec, const_vec; 3368 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3369 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106; 3370 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r; 3371 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r; 3372 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 3373 3374 src -= (src_stride + 1); 3375 filter_vec = LD_SH(filter_x); 3376 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3377 3378 filter_vec = LD_SH(filter_y); 3379 UNPCK_R_SB_SH(filter_vec, filter_vec); 3380 3381 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3382 3383 mask1 = mask0 + 2; 3384 3385 const_vec = __msa_ldi_h(128); 3386 const_vec <<= 6; 3387 3388 LD_SB3(src, src_stride, src0, src1, src2); 3389 src += (3 * src_stride); 3390 XORI_B3_128_SB(src0, src1, src2); 3391 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); 3392 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3); 3393 dst10 = const_vec; 3394 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10); 3395 dst21 = const_vec; 3396 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21); 3397 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 3398 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 3399 3400 for (loop_cnt = height >> 3; loop_cnt--;) { 3401 LD_SB8(src, src_stride, 3402 src3, src4, src5, src6, src7, src8, src9, src10); 3403 src += (8 * src_stride); 3404 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3405 3406 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1); 3407 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3); 3408 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5); 3409 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7); 3410 3411 dst73 = const_vec; 3412 dst84 = const_vec; 3413 dst95 = const_vec; 3414 dst106 = const_vec; 3415 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73); 3416 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84); 3417 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95); 3418 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106); 3419 3420 dst32_r = __msa_ilvr_h(dst73, dst22); 3421 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 3422 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 3423 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 3424 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 3425 dst76_r = __msa_ilvr_h(dst22, dst106); 3426 3427 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3428 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3429 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3430 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3431 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3432 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3433 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3434 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3435 SRA_4V(dst0, dst1, dst2, dst3, 6); 3436 SRA_4V(dst4, dst5, dst6, dst7, 6); 3437 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, 3438 dst0, dst1, dst2, dst3); 3439 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 3440 dst += (8 * dst_stride); 3441 3442 dst10_r = dst98_r; 3443 dst21_r = dst109_r; 3444 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 3445 } 3446} 3447 3448static void hevc_hv_4t_4w_msa(uint8_t *src, 3449 int32_t src_stride, 3450 int16_t *dst, 3451 int32_t dst_stride, 3452 const int8_t *filter_x, 3453 const int8_t *filter_y, 3454 int32_t height) 3455{ 3456 if (2 == height) { 3457 hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride, 3458 filter_x, filter_y); 3459 } else if (4 == height) { 3460 hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride, 3461 filter_x, filter_y); 3462 } else if (0 == (height % 8)) { 3463 hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride, 3464 filter_x, filter_y, height); 3465 } 3466} 3467 3468static void hevc_hv_4t_6w_msa(uint8_t *src, 3469 int32_t src_stride, 3470 int16_t *dst, 3471 int32_t dst_stride, 3472 const int8_t *filter_x, 3473 const int8_t *filter_y, 3474 int32_t height) 3475{ 3476 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 3477 v8i16 filt0, filt1; 3478 v8i16 filt_h0, filt_h1; 3479 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3480 v16i8 mask1; 3481 v8i16 filter_vec, const_vec; 3482 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3483 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9; 3484 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 3485 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r; 3486 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l; 3487 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l; 3488 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l; 3489 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r; 3490 v4i32 dst0_l, dst1_l, dst2_l, dst3_l; 3491 3492 src -= (src_stride + 1); 3493 filter_vec = LD_SH(filter_x); 3494 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3495 3496 filter_vec = LD_SH(filter_y); 3497 UNPCK_R_SB_SH(filter_vec, filter_vec); 3498 3499 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3500 3501 mask1 = mask0 + 2; 3502 3503 const_vec = __msa_ldi_h(128); 3504 const_vec <<= 6; 3505 3506 LD_SB3(src, src_stride, src0, src1, src2); 3507 src += (3 * src_stride); 3508 XORI_B3_128_SB(src0, src1, src2); 3509 3510 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3511 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3512 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3513 3514 dsth0 = const_vec; 3515 dsth1 = const_vec; 3516 dsth2 = const_vec; 3517 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0); 3518 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1); 3519 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2); 3520 3521 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l); 3522 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l); 3523 3524 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10); 3525 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 3526 3527 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3528 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 3529 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 3530 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 3531 3532 dsth3 = const_vec; 3533 dsth4 = const_vec; 3534 dsth5 = const_vec; 3535 dsth6 = const_vec; 3536 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3); 3537 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4); 3538 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5); 3539 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6); 3540 3541 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); 3542 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3); 3543 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5); 3544 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7); 3545 3546 dsth7 = const_vec; 3547 dsth8 = const_vec; 3548 dsth9 = const_vec; 3549 dsth10 = const_vec; 3550 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7); 3551 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8); 3552 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9); 3553 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10); 3554 3555 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l); 3556 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l); 3557 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l); 3558 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l); 3559 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l); 3560 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l); 3561 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l); 3562 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l); 3563 3564 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l); 3565 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l); 3566 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l); 3567 3568 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3569 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3570 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3571 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3572 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3573 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3574 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 3575 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 3576 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1); 3577 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1); 3578 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1); 3579 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1); 3580 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6); 3581 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6); 3582 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6); 3583 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); 3584 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3); 3585 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5); 3586 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); 3587 ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride); 3588 dst += 4 * dst_stride; 3589 ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride); 3590 ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride); 3591} 3592 3593static void hevc_hv_4t_8x2_msa(uint8_t *src, 3594 int32_t src_stride, 3595 int16_t *dst, 3596 int32_t dst_stride, 3597 const int8_t *filter_x, 3598 const int8_t *filter_y) 3599{ 3600 v16i8 src0, src1, src2, src3, src4; 3601 v8i16 filt0, filt1; 3602 v8i16 filt_h0, filt_h1; 3603 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3604 v16i8 mask1; 3605 v8i16 filter_vec, const_vec; 3606 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 3607 v8i16 dst0, dst1, dst2, dst3, dst4; 3608 v4i32 dst0_r, dst0_l, dst1_r, dst1_l; 3609 v8i16 dst10_r, dst32_r, dst21_r, dst43_r; 3610 v8i16 dst10_l, dst32_l, dst21_l, dst43_l; 3611 3612 src -= (src_stride + 1); 3613 3614 filter_vec = LD_SH(filter_x); 3615 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3616 3617 filter_vec = LD_SH(filter_y); 3618 UNPCK_R_SB_SH(filter_vec, filter_vec); 3619 3620 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3621 3622 mask1 = mask0 + 2; 3623 3624 const_vec = __msa_ldi_h(128); 3625 const_vec <<= 6; 3626 3627 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3628 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3629 3630 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3631 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3632 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3633 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 3634 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 3635 3636 dst0 = const_vec; 3637 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 3638 dst1 = const_vec; 3639 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 3640 dst2 = const_vec; 3641 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 3642 dst3 = const_vec; 3643 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3); 3644 dst4 = const_vec; 3645 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4); 3646 3647 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 3648 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 3649 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 3650 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 3651 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3652 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 3653 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3654 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 3655 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 3656 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3657 ST_SW2(dst0_r, dst1_r, dst, dst_stride); 3658} 3659 3660static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride, 3661 int16_t *dst, int32_t dst_stride, 3662 const int8_t *filter_x, 3663 const int8_t *filter_y, int32_t width8mult) 3664{ 3665 int32_t cnt; 3666 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1; 3667 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3668 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec; 3669 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6; 3670 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 3671 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 3672 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 3673 3674 src -= (src_stride + 1); 3675 3676 filter_vec = LD_SH(filter_x); 3677 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3678 3679 filter_vec = LD_SH(filter_y); 3680 UNPCK_R_SB_SH(filter_vec, filter_vec); 3681 3682 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3683 3684 mask0 = LD_SB(ff_hevc_mask_arr); 3685 mask1 = mask0 + 2; 3686 3687 const_vec = __msa_ldi_h(128); 3688 const_vec <<= 6; 3689 3690 for (cnt = width8mult; cnt--;) { 3691 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 3692 src += 8; 3693 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 3694 3695 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3696 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3697 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3698 3699 dst0 = const_vec; 3700 dst1 = const_vec; 3701 dst2 = const_vec; 3702 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 3703 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 3704 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 3705 3706 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 3707 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 3708 3709 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3710 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 3711 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 3712 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 3713 dst3 = const_vec; 3714 dst4 = const_vec; 3715 dst5 = const_vec; 3716 dst6 = const_vec; 3717 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 3718 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4); 3719 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5); 3720 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6); 3721 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 3722 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 3723 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 3724 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 3725 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3726 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 3727 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3728 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 3729 3730 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3731 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 3732 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3733 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 3734 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 3735 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 3736 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); 3737 PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r); 3738 3739 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); 3740 dst += 8; 3741 } 3742} 3743 3744static void hevc_hv_4t_8x6_msa(uint8_t *src, 3745 int32_t src_stride, 3746 int16_t *dst, 3747 int32_t dst_stride, 3748 const int8_t *filter_x, 3749 const int8_t *filter_y) 3750{ 3751 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 3752 v8i16 filt0, filt1; 3753 v8i16 filt_h0, filt_h1; 3754 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3755 v16i8 mask1; 3756 v8i16 filter_vec, const_vec; 3757 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 3758 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 3759 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; 3760 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 3761 v4i32 dst4_r, dst4_l, dst5_r, dst5_l; 3762 v8i16 dst10_r, dst32_r, dst10_l, dst32_l; 3763 v8i16 dst21_r, dst43_r, dst21_l, dst43_l; 3764 v8i16 dst54_r, dst54_l, dst65_r, dst65_l; 3765 v8i16 dst76_r, dst76_l, dst87_r, dst87_l; 3766 3767 src -= (src_stride + 1); 3768 3769 filter_vec = LD_SH(filter_x); 3770 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3771 3772 filter_vec = LD_SH(filter_y); 3773 UNPCK_R_SB_SH(filter_vec, filter_vec); 3774 3775 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3776 3777 mask1 = mask0 + 2; 3778 3779 const_vec = __msa_ldi_h(128); 3780 const_vec <<= 6; 3781 3782 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 3783 src += (5 * src_stride); 3784 LD_SB4(src, src_stride, src5, src6, src7, src8); 3785 3786 XORI_B5_128_SB(src0, src1, src2, src3, src4); 3787 XORI_B4_128_SB(src5, src6, src7, src8); 3788 3789 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3790 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3791 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3792 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7); 3793 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9); 3794 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11); 3795 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13); 3796 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15); 3797 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17); 3798 3799 dst0 = const_vec; 3800 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 3801 dst1 = const_vec; 3802 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 3803 dst2 = const_vec; 3804 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 3805 dst3 = const_vec; 3806 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3); 3807 dst4 = const_vec; 3808 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4); 3809 dst5 = const_vec; 3810 DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5); 3811 dst6 = const_vec; 3812 DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6); 3813 dst7 = const_vec; 3814 DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7); 3815 dst8 = const_vec; 3816 DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8); 3817 3818 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 3819 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 3820 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 3821 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 3822 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 3823 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 3824 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l); 3825 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l); 3826 3827 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3828 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 3829 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3830 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 3831 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3832 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 3833 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3834 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 3835 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 3836 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1); 3837 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 3838 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1); 3839 3840 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 3841 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 3842 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6); 3843 3844 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, 3845 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 3846 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r); 3847 3848 ST_SW2(dst0_r, dst1_r, dst, dst_stride); 3849 dst += (2 * dst_stride); 3850 ST_SW2(dst2_r, dst3_r, dst, dst_stride); 3851 dst += (2 * dst_stride); 3852 ST_SW2(dst4_r, dst5_r, dst, dst_stride); 3853} 3854 3855static void hevc_hv_4t_8multx4mult_msa(uint8_t *src, 3856 int32_t src_stride, 3857 int16_t *dst, 3858 int32_t dst_stride, 3859 const int8_t *filter_x, 3860 const int8_t *filter_y, 3861 int32_t height, 3862 int32_t width8mult) 3863{ 3864 uint32_t loop_cnt, cnt; 3865 uint8_t *src_tmp; 3866 int16_t *dst_tmp; 3867 v16i8 src0, src1, src2, src3, src4, src5, src6; 3868 v8i16 filt0, filt1; 3869 v8i16 filt_h0, filt_h1; 3870 v16i8 mask0 = LD_SB(ff_hevc_mask_arr); 3871 v16i8 mask1; 3872 v8i16 filter_vec, const_vec; 3873 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3874 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6; 3875 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 3876 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 3877 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 3878 3879 src -= (src_stride + 1); 3880 3881 filter_vec = LD_SH(filter_x); 3882 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 3883 3884 filter_vec = LD_SH(filter_y); 3885 UNPCK_R_SB_SH(filter_vec, filter_vec); 3886 3887 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 3888 3889 mask1 = mask0 + 2; 3890 3891 const_vec = __msa_ldi_h(128); 3892 const_vec <<= 6; 3893 3894 for (cnt = width8mult; cnt--;) { 3895 src_tmp = src; 3896 dst_tmp = dst; 3897 3898 LD_SB3(src_tmp, src_stride, src0, src1, src2); 3899 src_tmp += (3 * src_stride); 3900 3901 XORI_B3_128_SB(src0, src1, src2); 3902 3903 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 3904 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 3905 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 3906 3907 dst0 = const_vec; 3908 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 3909 dst1 = const_vec; 3910 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 3911 dst2 = const_vec; 3912 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 3913 3914 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 3915 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 3916 3917 for (loop_cnt = height >> 2; loop_cnt--;) { 3918 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 3919 src_tmp += (4 * src_stride); 3920 XORI_B4_128_SB(src3, src4, src5, src6); 3921 3922 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 3923 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 3924 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 3925 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 3926 3927 dst3 = const_vec; 3928 dst4 = const_vec; 3929 dst5 = const_vec; 3930 dst6 = const_vec; 3931 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 3932 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4); 3933 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5); 3934 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6); 3935 3936 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 3937 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 3938 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 3939 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 3940 3941 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 3942 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 3943 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 3944 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 3945 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 3946 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 3947 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 3948 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 3949 3950 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 3951 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 3952 3953 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, 3954 dst2_l, dst2_r, dst3_l, dst3_r, 3955 dst0_r, dst1_r, dst2_r, dst3_r); 3956 3957 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); 3958 dst_tmp += (4 * dst_stride); 3959 3960 dst10_r = dst54_r; 3961 dst10_l = dst54_l; 3962 dst21_r = dst65_r; 3963 dst21_l = dst65_l; 3964 dst2 = dst6; 3965 } 3966 3967 src += 8; 3968 dst += 8; 3969 } 3970} 3971 3972static void hevc_hv_4t_8w_msa(uint8_t *src, 3973 int32_t src_stride, 3974 int16_t *dst, 3975 int32_t dst_stride, 3976 const int8_t *filter_x, 3977 const int8_t *filter_y, 3978 int32_t height) 3979{ 3980 3981 if (2 == height) { 3982 hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride, 3983 filter_x, filter_y); 3984 } else if (4 == height) { 3985 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride, 3986 filter_x, filter_y, 1); 3987 } else if (6 == height) { 3988 hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride, 3989 filter_x, filter_y); 3990 } else if (0 == (height % 4)) { 3991 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 3992 filter_x, filter_y, height, 1); 3993 } 3994} 3995 3996static void hevc_hv_4t_12w_msa(uint8_t *src, 3997 int32_t src_stride, 3998 int16_t *dst, 3999 int32_t dst_stride, 4000 const int8_t *filter_x, 4001 const int8_t *filter_y, 4002 int32_t height) 4003{ 4004 uint32_t loop_cnt; 4005 uint8_t *src_tmp; 4006 int16_t *dst_tmp; 4007 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 4008 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 4009 v16i8 mask0, mask1, mask2, mask3; 4010 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec; 4011 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73; 4012 v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r; 4013 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r; 4014 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l; 4015 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l; 4016 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 4017 4018 src -= (src_stride + 1); 4019 4020 filter_vec = LD_SH(filter_x); 4021 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); 4022 4023 filter_vec = LD_SH(filter_y); 4024 UNPCK_R_SB_SH(filter_vec, filter_vec); 4025 4026 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1); 4027 4028 mask0 = LD_SB(ff_hevc_mask_arr); 4029 mask1 = mask0 + 2; 4030 4031 const_vec = __msa_ldi_h(128); 4032 const_vec <<= 6; 4033 4034 src_tmp = src; 4035 dst_tmp = dst; 4036 4037 LD_SB3(src_tmp, src_stride, src0, src1, src2); 4038 src_tmp += (3 * src_stride); 4039 4040 XORI_B3_128_SB(src0, src1, src2); 4041 4042 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); 4043 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3); 4044 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5); 4045 4046 dst0 = const_vec; 4047 dst1 = const_vec; 4048 dst2 = const_vec; 4049 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0); 4050 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1); 4051 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2); 4052 4053 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l); 4054 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l); 4055 4056 for (loop_cnt = 4; loop_cnt--;) { 4057 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6); 4058 src_tmp += (4 * src_stride); 4059 XORI_B4_128_SB(src3, src4, src5, src6); 4060 4061 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1); 4062 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3); 4063 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5); 4064 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7); 4065 4066 dst3 = const_vec; 4067 dst4 = const_vec; 4068 dst5 = const_vec; 4069 dst6 = const_vec; 4070 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3); 4071 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4); 4072 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5); 4073 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6); 4074 4075 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l); 4076 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l); 4077 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l); 4078 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l); 4079 4080 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4081 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1); 4082 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4083 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1); 4084 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4085 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1); 4086 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4087 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1); 4088 4089 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6); 4090 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6); 4091 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, 4092 dst3_r, dst0_r, dst1_r, dst2_r, dst3_r); 4093 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride); 4094 dst_tmp += (4 * dst_stride); 4095 4096 dst10_r = dst54_r; 4097 dst10_l = dst54_l; 4098 dst21_r = dst65_r; 4099 dst21_l = dst65_l; 4100 dst2 = dst6; 4101 } 4102 4103 src += 8; 4104 dst += 8; 4105 4106 mask2 = LD_SB(ff_hevc_mask_arr + 16); 4107 mask3 = mask2 + 2; 4108 4109 LD_SB3(src, src_stride, src0, src1, src2); 4110 src += (3 * src_stride); 4111 XORI_B3_128_SB(src0, src1, src2); 4112 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); 4113 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3); 4114 dst10 = const_vec; 4115 dst21 = const_vec; 4116 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10); 4117 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21); 4118 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r); 4119 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1); 4120 4121 for (loop_cnt = 2; loop_cnt--;) { 4122 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, 4123 src10); 4124 src += (8 * src_stride); 4125 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10); 4126 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1); 4127 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3); 4128 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5); 4129 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7); 4130 4131 dst73 = const_vec; 4132 dst84 = const_vec; 4133 dst95 = const_vec; 4134 dst106 = const_vec; 4135 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73); 4136 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84); 4137 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95); 4138 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106); 4139 4140 dst32_r = __msa_ilvr_h(dst73, dst22); 4141 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r); 4142 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r); 4143 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r); 4144 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1); 4145 dst76_r = __msa_ilvr_h(dst22, dst106); 4146 4147 tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); 4148 tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1); 4149 tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1); 4150 tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1); 4151 tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1); 4152 tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1); 4153 tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1); 4154 tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1); 4155 4156 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6); 4157 SRA_4V(tmp4, tmp5, tmp6, tmp7, 6); 4158 PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1, 4159 tmp2, tmp3); 4160 ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride); 4161 dst += (8 * dst_stride); 4162 4163 dst10_r = dst98_r; 4164 dst21_r = dst109_r; 4165 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1); 4166 } 4167} 4168 4169static void hevc_hv_4t_16w_msa(uint8_t *src, 4170 int32_t src_stride, 4171 int16_t *dst, 4172 int32_t dst_stride, 4173 const int8_t *filter_x, 4174 const int8_t *filter_y, 4175 int32_t height) 4176{ 4177 if (4 == height) { 4178 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride, 4179 filter_x, filter_y, 2); 4180 } else { 4181 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4182 filter_x, filter_y, height, 2); 4183 } 4184} 4185 4186static void hevc_hv_4t_24w_msa(uint8_t *src, 4187 int32_t src_stride, 4188 int16_t *dst, 4189 int32_t dst_stride, 4190 const int8_t *filter_x, 4191 const int8_t *filter_y, 4192 int32_t height) 4193{ 4194 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4195 filter_x, filter_y, height, 3); 4196} 4197 4198static void hevc_hv_4t_32w_msa(uint8_t *src, 4199 int32_t src_stride, 4200 int16_t *dst, 4201 int32_t dst_stride, 4202 const int8_t *filter_x, 4203 const int8_t *filter_y, 4204 int32_t height) 4205{ 4206 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride, 4207 filter_x, filter_y, height, 4); 4208} 4209 4210#define MC_COPY(WIDTH) \ 4211void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \ 4212 uint8_t *src, \ 4213 ptrdiff_t src_stride, \ 4214 int height, \ 4215 intptr_t mx, \ 4216 intptr_t my, \ 4217 int width) \ 4218{ \ 4219 hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \ 4220} 4221 4222MC_COPY(4); 4223MC_COPY(6); 4224MC_COPY(8); 4225MC_COPY(12); 4226MC_COPY(16); 4227MC_COPY(24); 4228MC_COPY(32); 4229MC_COPY(48); 4230MC_COPY(64); 4231 4232#undef MC_COPY 4233 4234#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4235void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \ 4236 uint8_t *src, \ 4237 ptrdiff_t src_stride, \ 4238 int height, \ 4239 intptr_t mx, \ 4240 intptr_t my, \ 4241 int width) \ 4242{ \ 4243 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4244 \ 4245 hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ 4246 MAX_PB_SIZE, filter, height); \ 4247} 4248 4249MC(qpel, h, 4, 8, hz, mx); 4250MC(qpel, h, 8, 8, hz, mx); 4251MC(qpel, h, 12, 8, hz, mx); 4252MC(qpel, h, 16, 8, hz, mx); 4253MC(qpel, h, 24, 8, hz, mx); 4254MC(qpel, h, 32, 8, hz, mx); 4255MC(qpel, h, 48, 8, hz, mx); 4256MC(qpel, h, 64, 8, hz, mx); 4257 4258MC(qpel, v, 4, 8, vt, my); 4259MC(qpel, v, 8, 8, vt, my); 4260MC(qpel, v, 12, 8, vt, my); 4261MC(qpel, v, 16, 8, vt, my); 4262MC(qpel, v, 24, 8, vt, my); 4263MC(qpel, v, 32, 8, vt, my); 4264MC(qpel, v, 48, 8, vt, my); 4265MC(qpel, v, 64, 8, vt, my); 4266 4267MC(epel, h, 4, 4, hz, mx); 4268MC(epel, h, 6, 4, hz, mx); 4269MC(epel, h, 8, 4, hz, mx); 4270MC(epel, h, 12, 4, hz, mx); 4271MC(epel, h, 16, 4, hz, mx); 4272MC(epel, h, 24, 4, hz, mx); 4273MC(epel, h, 32, 4, hz, mx); 4274 4275MC(epel, v, 4, 4, vt, my); 4276MC(epel, v, 6, 4, vt, my); 4277MC(epel, v, 8, 4, vt, my); 4278MC(epel, v, 12, 4, vt, my); 4279MC(epel, v, 16, 4, vt, my); 4280MC(epel, v, 24, 4, vt, my); 4281MC(epel, v, 32, 4, vt, my); 4282 4283#undef MC 4284 4285#define MC_HV(PEL, WIDTH, TAP) \ 4286void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst, \ 4287 uint8_t *src, \ 4288 ptrdiff_t src_stride, \ 4289 int height, \ 4290 intptr_t mx, \ 4291 intptr_t my, \ 4292 int width) \ 4293{ \ 4294 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 4295 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 4296 \ 4297 hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \ 4298 filter_x, filter_y, height); \ 4299} 4300 4301MC_HV(qpel, 4, 8); 4302MC_HV(qpel, 8, 8); 4303MC_HV(qpel, 12, 8); 4304MC_HV(qpel, 16, 8); 4305MC_HV(qpel, 24, 8); 4306MC_HV(qpel, 32, 8); 4307MC_HV(qpel, 48, 8); 4308MC_HV(qpel, 64, 8); 4309 4310MC_HV(epel, 4, 4); 4311MC_HV(epel, 6, 4); 4312MC_HV(epel, 8, 4); 4313MC_HV(epel, 12, 4); 4314MC_HV(epel, 16, 4); 4315MC_HV(epel, 24, 4); 4316MC_HV(epel, 32, 4); 4317 4318#undef MC_HV 4319