1/* 2 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "libavutil/mips/generic_macros_msa.h" 22#include "h264dsp_mips.h" 23 24static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride, 25 int32_t log2_denom, int32_t src_weight, 26 int32_t offset_in) 27{ 28 uint32_t tp0, tp1, offset_val; 29 v16u8 zero = { 0 }; 30 v16u8 src0 = { 0 }; 31 v8i16 src0_r, tmp0, wgt, denom, offset; 32 33 offset_val = (unsigned) offset_in << log2_denom; 34 35 wgt = __msa_fill_h(src_weight); 36 offset = __msa_fill_h(offset_val); 37 denom = __msa_fill_h(log2_denom); 38 39 LW2(data, stride, tp0, tp1); 40 INSERT_W2_UB(tp0, tp1, src0); 41 src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0); 42 tmp0 = wgt * src0_r; 43 tmp0 = __msa_adds_s_h(tmp0, offset); 44 tmp0 = __msa_maxi_s_h(tmp0, 0); 45 tmp0 = __msa_srlr_h(tmp0, denom); 46 tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7); 47 src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); 48 ST_W2(src0, 0, 1, data, stride); 49} 50 51static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride, 52 int32_t log2_denom, int32_t src_weight, 53 int32_t offset_in) 54{ 55 uint32_t tp0, tp1, tp2, tp3, offset_val; 56 v16u8 src0 = { 0 }; 57 v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset; 58 59 offset_val = (unsigned) offset_in << log2_denom; 60 61 wgt = __msa_fill_h(src_weight); 62 offset = __msa_fill_h(offset_val); 63 denom = __msa_fill_h(log2_denom); 64 65 LW4(data, stride, tp0, tp1, tp2, tp3); 66 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 67 UNPCK_UB_SH(src0, src0_r, src1_r); 68 MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1); 69 ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1); 70 MAXI_SH2_SH(tmp0, tmp1, 0); 71 tmp0 = __msa_srlr_h(tmp0, denom); 72 tmp1 = __msa_srlr_h(tmp1, denom); 73 SAT_UH2_SH(tmp0, tmp1, 7); 74 src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 75 ST_W4(src0, 0, 1, 2, 3, data, stride); 76} 77 78static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride, 79 int32_t log2_denom, int32_t src_weight, 80 int32_t offset_in) 81{ 82 uint32_t tp0, tp1, tp2, tp3, offset_val; 83 v16u8 src0 = { 0 }, src1 = { 0 }; 84 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; 85 v8i16 wgt, denom, offset; 86 87 offset_val = (unsigned) offset_in << log2_denom; 88 89 wgt = __msa_fill_h(src_weight); 90 offset = __msa_fill_h(offset_val); 91 denom = __msa_fill_h(log2_denom); 92 93 LW4(data, stride, tp0, tp1, tp2, tp3); 94 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 95 LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 96 INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 97 UNPCK_UB_SH(src0, src0_r, src1_r); 98 UNPCK_UB_SH(src1, src2_r, src3_r); 99 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 100 tmp3); 101 ADDS_SH4_SH(tmp0, offset, tmp1, 
offset, tmp2, offset, tmp3, offset, tmp0, 102 tmp1, tmp2, tmp3); 103 MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); 104 SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom); 105 SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 106 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 107 ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride); 108} 109 110static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride, 111 int32_t log2_denom, int32_t src_weight, 112 int32_t offset_in) 113{ 114 uint32_t offset_val; 115 uint64_t tp0, tp1, tp2, tp3; 116 v16u8 src0 = { 0 }, src1 = { 0 }; 117 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; 118 v8i16 wgt, denom, offset; 119 120 offset_val = (unsigned) offset_in << log2_denom; 121 122 wgt = __msa_fill_h(src_weight); 123 offset = __msa_fill_h(offset_val); 124 denom = __msa_fill_h(log2_denom); 125 126 LD4(data, stride, tp0, tp1, tp2, tp3); 127 INSERT_D2_UB(tp0, tp1, src0); 128 INSERT_D2_UB(tp2, tp3, src1); 129 UNPCK_UB_SH(src0, src0_r, src1_r); 130 UNPCK_UB_SH(src1, src2_r, src3_r); 131 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 132 tmp3); 133 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, 134 tmp1, tmp2, tmp3); 135 MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); 136 SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom); 137 SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 138 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 139 ST_D4(src0, src1, 0, 1, 0, 1, data, stride); 140} 141 142static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, 143 int32_t src_weight, int32_t offset_in) 144{ 145 uint32_t offset_val; 146 uint64_t tp0, tp1, tp2, tp3; 147 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 148 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 149 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 150 v8i16 wgt, denom, offset; 151 152 offset_val = (unsigned) offset_in << log2_denom; 153 154 wgt = __msa_fill_h(src_weight); 155 offset = __msa_fill_h(offset_val); 156 denom = __msa_fill_h(log2_denom); 157 158 LD4(data, stride, tp0, tp1, tp2, tp3); 159 INSERT_D2_UB(tp0, tp1, src0); 160 INSERT_D2_UB(tp2, tp3, src1); 161 LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 162 INSERT_D2_UB(tp0, tp1, src2); 163 INSERT_D2_UB(tp2, tp3, src3); 164 UNPCK_UB_SH(src0, src0_r, src1_r); 165 UNPCK_UB_SH(src1, src2_r, src3_r); 166 UNPCK_UB_SH(src2, src4_r, src5_r); 167 UNPCK_UB_SH(src3, src6_r, src7_r); 168 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, 169 tmp3); 170 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6, 171 tmp7); 172 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, 173 tmp1, tmp2, tmp3); 174 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4, 175 tmp5, tmp6, tmp7); 176 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); 177 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); 178 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); 179 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, 180 src2, src3); 181 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride); 182} 183 184static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride, 185 int32_t log2_denom, int32_t src_weight, 186 int32_t offset_in) 187{ 188 uint32_t offset_val, cnt; 189 uint64_t tp0, tp1, tp2, tp3; 190 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 191 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 192 v8i16 
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 193 v8i16 wgt, denom, offset; 194 195 offset_val = (unsigned) offset_in << log2_denom; 196 197 wgt = __msa_fill_h(src_weight); 198 offset = __msa_fill_h(offset_val); 199 denom = __msa_fill_h(log2_denom); 200 201 for (cnt = 2; cnt--;) { 202 LD4(data, stride, tp0, tp1, tp2, tp3); 203 INSERT_D2_UB(tp0, tp1, src0); 204 INSERT_D2_UB(tp2, tp3, src1); 205 LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); 206 INSERT_D2_UB(tp0, tp1, src2); 207 INSERT_D2_UB(tp2, tp3, src3); 208 UNPCK_UB_SH(src0, src0_r, src1_r); 209 UNPCK_UB_SH(src1, src2_r, src3_r); 210 UNPCK_UB_SH(src2, src4_r, src5_r); 211 UNPCK_UB_SH(src3, src6_r, src7_r); 212 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, 213 tmp2, tmp3); 214 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, 215 tmp6, tmp7); 216 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, 217 tmp0, tmp1, tmp2, tmp3); 218 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, 219 tmp4, tmp5, tmp6, tmp7); 220 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); 221 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); 222 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); 223 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, 224 src2, src3); 225 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride); 226 data += 8 * stride; 227 } 228} 229 230static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 231 int32_t log2_denom, int32_t src_weight, 232 int32_t dst_weight, int32_t offset_in) 233{ 234 uint32_t tp0, tp1; 235 v16i8 src_wgt, dst_wgt, wgt, vec0; 236 v16u8 src0 = { 0 }, dst0 = { 0 }; 237 v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255); 238 239 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 240 offset_in += (128 * (src_weight + dst_weight)); 241 242 src_wgt = __msa_fill_b(src_weight); 243 dst_wgt = __msa_fill_b(dst_weight); 244 offset = __msa_fill_h(offset_in); 245 denom = __msa_fill_h(log2_denom + 1); 246 247 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 248 249 LW2(src, stride, tp0, tp1); 250 INSERT_W2_UB(tp0, tp1, src0); 251 LW2(dst, stride, tp0, tp1); 252 INSERT_W2_UB(tp0, tp1, dst0); 253 XORI_B2_128_UB(src0, dst0); 254 vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0); 255 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 256 tmp0 >>= denom; 257 tmp0 = __msa_maxi_s_h(tmp0, 0); 258 tmp0 = __msa_min_s_h(max255, tmp0); 259 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); 260 ST_W2(dst0, 0, 1, dst, stride); 261} 262 263static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 264 int32_t log2_denom, int32_t src_weight, 265 int32_t dst_weight, int32_t offset_in) 266{ 267 uint32_t tp0, tp1, tp2, tp3; 268 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1; 269 v16u8 src0, dst0; 270 v8i16 tmp0, tmp1, denom, offset; 271 272 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 273 offset_in += (128 * (src_weight + dst_weight)); 274 275 src_wgt = __msa_fill_b(src_weight); 276 dst_wgt = __msa_fill_b(dst_weight); 277 offset = __msa_fill_h(offset_in); 278 denom = __msa_fill_h(log2_denom + 1); 279 280 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 281 282 LW4(src, stride, tp0, tp1, tp2, tp3); 283 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 284 LW4(dst, stride, tp0, tp1, tp2, tp3); 285 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 286 XORI_B2_128_UB(src0, dst0); 287 ILVRL_B2_SB(dst0, src0, vec0, vec1); 288 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 289 tmp1 = 
__msa_dpadd_s_h(offset, wgt, vec1); 290 tmp0 >>= denom; 291 tmp1 >>= denom; 292 CLIP_SH2_0_255(tmp0, tmp1); 293 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); 294 ST_W4(dst0, 0, 1, 2, 3, dst, stride); 295} 296 297static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 298 int32_t log2_denom, int32_t src_weight, 299 int32_t dst_weight, int32_t offset_in) 300{ 301 uint32_t tp0, tp1, tp2, tp3; 302 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3; 303 v16u8 src0, src1, dst0, dst1; 304 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset; 305 306 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 307 offset_in += (128 * (src_weight + dst_weight)); 308 309 src_wgt = __msa_fill_b(src_weight); 310 dst_wgt = __msa_fill_b(dst_weight); 311 offset = __msa_fill_h(offset_in); 312 denom = __msa_fill_h(log2_denom + 1); 313 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 314 315 LW4(src, stride, tp0, tp1, tp2, tp3); 316 src += 4 * stride; 317 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); 318 LW4(src, stride, tp0, tp1, tp2, tp3); 319 INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); 320 LW4(dst, stride, tp0, tp1, tp2, tp3); 321 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 322 LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 323 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 324 XORI_B4_128_UB(src0, src1, dst0, dst1); 325 ILVRL_B2_SB(dst0, src0, vec0, vec1); 326 ILVRL_B2_SB(dst1, src1, vec2, vec3); 327 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 328 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 329 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 330 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 331 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 332 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 333 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 334 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride); 335} 336 337static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 338 int32_t log2_denom, int32_t src_weight, 339 int32_t dst_weight, int32_t offset_in) 340{ 341 uint64_t tp0, tp1, tp2, tp3; 342 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3; 343 v16u8 src0, src1, dst0, dst1; 344 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset; 345 346 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 347 offset_in += (128 * (src_weight + dst_weight)); 348 349 src_wgt = __msa_fill_b(src_weight); 350 dst_wgt = __msa_fill_b(dst_weight); 351 offset = __msa_fill_h(offset_in); 352 denom = __msa_fill_h(log2_denom + 1); 353 354 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 355 356 LD4(src, stride, tp0, tp1, tp2, tp3); 357 INSERT_D2_UB(tp0, tp1, src0); 358 INSERT_D2_UB(tp2, tp3, src1); 359 LD4(dst, stride, tp0, tp1, tp2, tp3); 360 INSERT_D2_UB(tp0, tp1, dst0); 361 INSERT_D2_UB(tp2, tp3, dst1); 362 XORI_B4_128_UB(src0, src1, dst0, dst1); 363 ILVRL_B2_SB(dst0, src0, vec0, vec1); 364 ILVRL_B2_SB(dst1, src1, vec2, vec3); 365 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 366 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 367 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 368 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 369 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 370 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); 371 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 372 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride); 373} 374 375static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 376 int32_t log2_denom, int32_t src_weight, 377 int32_t dst_weight, int32_t offset_in) 378{ 379 uint64_t tp0, tp1, tp2, tp3; 380 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 381 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 
382 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset; 383 384 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 385 offset_in += (128 * (src_weight + dst_weight)); 386 387 src_wgt = __msa_fill_b(src_weight); 388 dst_wgt = __msa_fill_b(dst_weight); 389 offset = __msa_fill_h(offset_in); 390 denom = __msa_fill_h(log2_denom + 1); 391 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 392 393 LD4(src, stride, tp0, tp1, tp2, tp3); 394 INSERT_D2_UB(tp0, tp1, src0); 395 INSERT_D2_UB(tp2, tp3, src1); 396 LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3); 397 INSERT_D2_UB(tp0, tp1, src2); 398 INSERT_D2_UB(tp2, tp3, src3); 399 LD4(dst, stride, tp0, tp1, tp2, tp3); 400 INSERT_D2_UB(tp0, tp1, dst0); 401 INSERT_D2_UB(tp2, tp3, dst1); 402 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 403 INSERT_D2_UB(tp0, tp1, dst2); 404 INSERT_D2_UB(tp2, tp3, dst3); 405 XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3); 406 ILVRL_B2_SB(dst0, src0, vec0, vec1); 407 ILVRL_B2_SB(dst1, src1, vec2, vec3); 408 ILVRL_B2_SB(dst2, src2, vec4, vec5); 409 ILVRL_B2_SB(dst3, src3, vec6, vec7); 410 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0); 411 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1); 412 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2); 413 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3); 414 tmp4 = __msa_dpadd_s_h(offset, wgt, vec4); 415 tmp5 = __msa_dpadd_s_h(offset, wgt, vec5); 416 tmp6 = __msa_dpadd_s_h(offset, wgt, vec6); 417 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); 418 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); 419 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); 420 CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 421 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); 422 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); 423 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 424} 425 426static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 427 int32_t log2_denom, int32_t src_weight, 428 int32_t dst_weight, int32_t offset_in) 429{ 430 uint8_t cnt; 431 uint64_t tp0, tp1, tp2, tp3; 432 v16i8 src_wgt, dst_wgt, wgt; 433 v16u8 src0, src1, src2, src3; 434 v16u8 dst0, dst1, dst2, dst3; 435 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 436 v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; 437 v8i16 denom, offset; 438 439 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 440 offset_in += (128 * (src_weight + dst_weight)); 441 442 src_wgt = __msa_fill_b(src_weight); 443 dst_wgt = __msa_fill_b(dst_weight); 444 offset = __msa_fill_h(offset_in); 445 denom = __msa_fill_h(log2_denom + 1); 446 wgt = __msa_ilvev_b(dst_wgt, src_wgt); 447 448 for (cnt = 2; cnt--;) { 449 LD4(src, stride, tp0, tp1, tp2, tp3); 450 src += 4 * stride; 451 INSERT_D2_UB(tp0, tp1, src0); 452 INSERT_D2_UB(tp2, tp3, src1); 453 LD4(src, stride, tp0, tp1, tp2, tp3); 454 src += 4 * stride; 455 INSERT_D2_UB(tp0, tp1, src2); 456 INSERT_D2_UB(tp2, tp3, src3); 457 LD4(dst, stride, tp0, tp1, tp2, tp3); 458 INSERT_D2_UB(tp0, tp1, dst0); 459 INSERT_D2_UB(tp2, tp3, dst1); 460 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3); 461 INSERT_D2_UB(tp0, tp1, dst2); 462 INSERT_D2_UB(tp2, tp3, dst3); 463 XORI_B4_128_UB(src0, src1, src2, src3); 464 XORI_B4_128_UB(dst0, dst1, dst2, dst3); 465 ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, 466 vec0, vec2, vec4, vec6); 467 ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, 468 vec1, vec3, vec5, vec7); 469 470 temp0 = __msa_dpadd_s_h(offset, wgt, vec0); 471 temp1 = __msa_dpadd_s_h(offset, wgt, vec1); 472 temp2 = 
__msa_dpadd_s_h(offset, wgt, vec2); 473 temp3 = __msa_dpadd_s_h(offset, wgt, vec3); 474 temp4 = __msa_dpadd_s_h(offset, wgt, vec4); 475 temp5 = __msa_dpadd_s_h(offset, wgt, vec5); 476 temp6 = __msa_dpadd_s_h(offset, wgt, vec6); 477 temp7 = __msa_dpadd_s_h(offset, wgt, vec7); 478 479 SRA_4V(temp0, temp1, temp2, temp3, denom); 480 SRA_4V(temp4, temp5, temp6, temp7, denom); 481 CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); 482 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, 483 dst0, dst1, dst2, dst3); 484 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); 485 dst += 8 * stride; 486 } 487} 488 489#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \ 490 q3_or_p3_org_in, p1_or_q1_org_in, \ 491 p2_or_q2_org_in, q1_or_p1_org_in, \ 492 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \ 493{ \ 494 v8i16 threshold; \ 495 v8i16 const3 = __msa_ldi_h(3); \ 496 \ 497 threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \ 498 threshold += (p1_or_q1_org_in); \ 499 \ 500 (p0_or_q0_out) = threshold << 1; \ 501 (p0_or_q0_out) += (p2_or_q2_org_in); \ 502 (p0_or_q0_out) += (q1_or_p1_org_in); \ 503 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \ 504 \ 505 (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \ 506 (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \ 507 \ 508 (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \ 509 (p2_or_q2_out) += (p3_or_q3_org_in); \ 510 (p2_or_q2_out) += (p3_or_q3_org_in); \ 511 (p2_or_q2_out) += threshold; \ 512 (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \ 513} 514 515/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */ 516#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \ 517 p1_or_q1_org_in, p0_or_q0_out) \ 518{ \ 519 (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \ 520 (p0_or_q0_out) += (p1_or_q1_org_in); \ 521 (p0_or_q0_out) += (p1_or_q1_org_in); \ 522 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \ 523} 524 525#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \ 526 p1_or_q1_org_in, p2_or_q2_org_in, \ 527 negate_tc_in, tc_in, p1_or_q1_out) \ 528{ \ 529 v8i16 clip3, temp; \ 530 \ 531 clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \ 532 (v8u16) q0_or_p0_org_in); \ 533 temp = p1_or_q1_org_in << 1; \ 534 clip3 = clip3 - temp; \ 535 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \ 536 CLIP_SH(clip3, negate_tc_in, tc_in); \ 537 p1_or_q1_out = p1_or_q1_org_in + clip3; \ 538} 539 540#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \ 541 p1_or_q1_org_in, q1_or_p1_org_in, \ 542 negate_threshold_in, threshold_in, \ 543 p0_or_q0_out, q0_or_p0_out) \ 544{ \ 545 v8i16 q0_sub_p0, p1_sub_q1, delta; \ 546 \ 547 q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ 548 p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ 549 q0_sub_p0 <<= 2; \ 550 p1_sub_q1 += 4; \ 551 delta = q0_sub_p0 + p1_sub_q1; \ 552 delta >>= 3; \ 553 \ 554 CLIP_SH(delta, negate_threshold_in, threshold_in); \ 555 \ 556 p0_or_q0_out = p0_or_q0_org_in + delta; \ 557 q0_or_p0_out = q0_or_p0_org_in - delta; \ 558 \ 559 CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \ 560} 561 562#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ 563{ \ 564 uint32_t load0, load1, load2, load3; \ 565 v16u8 src0 = { 0 }; \ 566 v16u8 src1 = { 0 }; \ 567 v16u8 src2 = { 0 }; \ 568 v16u8 src3 = { 0 }; \ 569 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ 570 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ 571 v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \ 572 v8i16 res0_r, res1_r; \ 573 v16i8 zeros 
= { 0 }; \ 574 v16u8 res0, res1; \ 575 \ 576 LW4((src - 2), stride, load0, load1, load2, load3); \ 577 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ 578 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ 579 src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \ 580 src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \ 581 \ 582 TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \ 583 \ 584 p0_asub_q0 = __msa_asub_u_b(src2, src1); \ 585 p1_asub_p0 = __msa_asub_u_b(src1, src0); \ 586 q1_asub_q0 = __msa_asub_u_b(src2, src3); \ 587 \ 588 tc = __msa_fill_h(tc_val); \ 589 \ 590 is_less_than_alpha = (p0_asub_q0 < alpha); \ 591 is_less_than_beta = (p1_asub_p0 < beta); \ 592 is_less_than = is_less_than_alpha & is_less_than_beta; \ 593 is_less_than_beta = (q1_asub_q0 < beta); \ 594 is_less_than = is_less_than_beta & is_less_than; \ 595 \ 596 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ 597 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ 598 \ 599 q0_sub_p0 <<= 2; \ 600 delta = q0_sub_p0 + p1_sub_q1; \ 601 delta = __msa_srari_h(delta, 3); \ 602 \ 603 CLIP_SH(delta, -tc, tc); \ 604 \ 605 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ 606 \ 607 res0_r += delta; \ 608 res1_r -= delta; \ 609 \ 610 CLIP_SH2_0_255(res0_r, res1_r); \ 611 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ 612 \ 613 res0 = __msa_bmnz_v(src1, res0, is_less_than); \ 614 res1 = __msa_bmnz_v(src2, res1, is_less_than); \ 615 \ 616 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ 617} 618 619#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \ 620{ \ 621 v16i8 zero_m = { 0 }; \ 622 \ 623 out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ 624 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \ 625 SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \ 626} 627 628#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ 629{ \ 630 uint32_t load0, load1; \ 631 v16u8 src0 = { 0 }; \ 632 v16u8 src1 = { 0 }; \ 633 v16u8 src2 = { 0 }; \ 634 v16u8 src3 = { 0 }; \ 635 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \ 636 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \ 637 v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \ 638 v16i8 zeros = { 0 }; \ 639 v16u8 res0, res1; \ 640 \ 641 load0 = LW(src - 2); \ 642 load1 = LW(src - 2 + stride); \ 643 \ 644 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \ 645 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \ 646 \ 647 TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \ 648 \ 649 p0_asub_q0 = __msa_asub_u_b(src2, src1); \ 650 p1_asub_p0 = __msa_asub_u_b(src1, src0); \ 651 q1_asub_q0 = __msa_asub_u_b(src2, src3); \ 652 \ 653 tc = __msa_fill_h(tc_val); \ 654 \ 655 is_less_than_alpha = (p0_asub_q0 < alpha); \ 656 is_less_than_beta = (p1_asub_p0 < beta); \ 657 is_less_than = is_less_than_alpha & is_less_than_beta; \ 658 is_less_than_beta = (q1_asub_q0 < beta); \ 659 is_less_than = is_less_than_beta & is_less_than; \ 660 \ 661 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \ 662 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \ 663 \ 664 q0_sub_p0 <<= 2; \ 665 delta = q0_sub_p0 + p1_sub_q1; \ 666 delta = __msa_srari_h(delta, 3); \ 667 CLIP_SH(delta, -tc, tc); \ 668 \ 669 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ 670 \ 671 res0_r += delta; \ 672 res1_r -= delta; \ 673 \ 674 CLIP_SH2_0_255(res0_r, res1_r); \ 675 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \ 676 \ 677 res0 = __msa_bmnz_v(src1, res0, 
is_less_than); \ 678 res1 = __msa_bmnz_v(src2, res1, is_less_than); \ 679 \ 680 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \ 681} 682 683static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data, 684 uint8_t alpha_in, 685 uint8_t beta_in, 686 ptrdiff_t img_width) 687{ 688 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 689 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha; 690 v16u8 p1_org, p0_org, q0_org, q1_org; 691 692 LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org); 693 694 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 695 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 696 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 697 698 is_less_than_alpha = (p0_asub_q0 < alpha_in); 699 is_less_than_beta = (p1_asub_p0 < beta_in); 700 is_less_than = is_less_than_beta & is_less_than_alpha; 701 is_less_than_beta = (q1_asub_q0 < beta_in); 702 is_less_than = is_less_than_beta & is_less_than; 703 704 if (!__msa_test_bz_v(is_less_than)) { 705 v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta; 706 v8i16 p0_r = { 0 }; 707 v8i16 q0_r = { 0 }; 708 v8i16 p0_l = { 0 }; 709 v8i16 q0_l = { 0 }; 710 v16i8 zero = { 0 }; 711 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 712 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 713 v16u8 q2_org = LD_UB(data + (2 * img_width)); 714 v16u8 p2_org = LD_UB(data - (3 * img_width)); 715 v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2); 716 717 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 718 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 719 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 720 721 tmp_flag = (p0_asub_q0 < tmp_flag); 722 723 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 724 is_less_than_beta = (p2_asub_p0 < beta_in); 725 is_less_than_beta = is_less_than_beta & tmp_flag; 726 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 727 is_less_than_beta = is_less_than_beta & is_less_than; 728 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 729 730 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); 731 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); 732 733 /* combine and store */ 734 if (!__msa_test_bz_v(is_less_than_beta)) { 735 v8i16 p3_org_l, p3_org_r; 736 v16u8 p3_org = LD_UB(data - (img_width << 2)); 737 v16u8 p2, p1; 738 v8i16 p2_r = { 0 }; 739 v8i16 p2_l = { 0 }; 740 v8i16 p1_r = { 0 }; 741 v8i16 p1_l = { 0 }; 742 743 ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); 744 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, 745 p2_r, q1_org_r, p0_r, p1_r, p2_r); 746 747 ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); 748 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, 749 p2_l, q1_org_l, p0_l, p1_l, p2_l); 750 751 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); 752 753 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); 754 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); 755 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); 756 757 ST_UB(p1_org, data - (2 * img_width)); 758 ST_UB(p2_org, data - (3 * img_width)); 759 } 760 761 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); 762 AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); 763 764 /* combine */ 765 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); 766 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta); 767 768 ST_UB(p0_org, data - img_width); 769 770 /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */ 771 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 772 is_less_than_beta = (q2_asub_q0 < beta_in); 773 is_less_than_beta 
= is_less_than_beta & tmp_flag; 774 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 775 is_less_than_beta = is_less_than_beta & is_less_than; 776 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 777 778 /* combine and store */ 779 if (!__msa_test_bz_v(is_less_than_beta)) { 780 v8i16 q3_org_r, q3_org_l; 781 v16u8 q3_org = LD_UB(data + (3 * img_width)); 782 v16u8 q1, q2; 783 v8i16 q2_r = { 0 }; 784 v8i16 q2_l = { 0 }; 785 v8i16 q1_r = { 0 }; 786 v8i16 q1_l = { 0 }; 787 788 ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); 789 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, 790 q2_r, p1_org_r, q0_r, q1_r, q2_r); 791 792 ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); 793 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, 794 q2_l, p1_org_l, q0_l, q1_l, q2_l); 795 796 PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); 797 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); 798 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); 799 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); 800 801 ST_UB(q1_org, data + img_width); 802 ST_UB(q2_org, data + 2 * img_width); 803 } 804 805 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); 806 AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); 807 808 /* combine */ 809 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); 810 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); 811 812 ST_UB(q0_org, data); 813 } 814} 815 816static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, 817 uint8_t alpha_in, 818 uint8_t beta_in, 819 ptrdiff_t img_width) 820{ 821 uint8_t *src = data - 4; 822 v16u8 alpha, beta, p0_asub_q0; 823 v16u8 is_less_than_alpha, is_less_than, is_less_than_beta; 824 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; 825 v16u8 p1_asub_p0, q1_asub_q0; 826 827 828 { 829 v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 830 v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 831 832 LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7); 833 LD_UB8(src + (8 * img_width), img_width, 834 row8, row9, row10, row11, row12, row13, row14, row15); 835 836 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, 837 row4, row5, row6, row7, 838 row8, row9, row10, row11, 839 row12, row13, row14, row15, 840 p3_org, p2_org, p1_org, p0_org, 841 q0_org, q1_org, q2_org, q3_org); 842 } 843 844 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 845 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 846 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 847 848 alpha = (v16u8) __msa_fill_b(alpha_in); 849 beta = (v16u8) __msa_fill_b(beta_in); 850 851 is_less_than_alpha = (p0_asub_q0 < alpha); 852 is_less_than_beta = (p1_asub_p0 < beta); 853 is_less_than = is_less_than_beta & is_less_than_alpha; 854 is_less_than_beta = (q1_asub_q0 < beta); 855 is_less_than = is_less_than_beta & is_less_than; 856 857 if (!__msa_test_bz_v(is_less_than)) { 858 v8i16 p0_r = { 0 }; 859 v8i16 q0_r = { 0 }; 860 v8i16 p0_l = { 0 }; 861 v8i16 q0_l = { 0 }; 862 v16i8 zero = { 0 }; 863 v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0; 864 v16u8 negate_is_less_than_beta; 865 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 866 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 867 868 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 869 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 870 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 871 UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l); 872 873 tmp_flag = alpha >> 2; 874 tmp_flag = tmp_flag + 2; 875 tmp_flag = (p0_asub_q0 < tmp_flag); 876 877 
p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 878 is_less_than_beta = (p2_asub_p0 < beta); 879 is_less_than_beta = tmp_flag & is_less_than_beta; 880 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 881 is_less_than_beta = is_less_than_beta & is_less_than; 882 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 883 884 if (!__msa_test_bz_v(is_less_than_beta)) { 885 v16u8 p2, p1; 886 v8i16 p3_org_r, p3_org_l; 887 v8i16 p2_l = { 0 }; 888 v8i16 p2_r = { 0 }; 889 v8i16 p1_l = { 0 }; 890 v8i16 p1_r = { 0 }; 891 892 ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r); 893 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r, 894 p2_r, q1_org_r, p0_r, p1_r, p2_r); 895 896 ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l); 897 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l, 898 p2_l, q1_org_l, p0_l, p1_l, p2_l); 899 900 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2); 901 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta); 902 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); 903 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta); 904 } 905 906 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r); 907 AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l); 908 909 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r); 910 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta); 911 912 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 913 is_less_than_beta = (q2_asub_q0 < beta); 914 915 is_less_than_beta = is_less_than_beta & tmp_flag; 916 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff); 917 918 is_less_than_beta = is_less_than_beta & is_less_than; 919 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 920 921 if (!__msa_test_bz_v(is_less_than_beta)) { 922 v16u8 q1, q2; 923 v8i16 q3_org_r, q3_org_l; 924 v8i16 q1_l = { 0 }; 925 v8i16 q1_r = { 0 }; 926 v8i16 q2_l = { 0 }; 927 v8i16 q2_r = { 0 }; 928 929 ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r); 930 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r, 931 q2_r, p1_org_r, q0_r, q1_r, q2_r); 932 933 ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l); 934 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l, 935 q2_l, p1_org_l, q0_l, q1_l, q2_l); 936 937 PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2); 938 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta); 939 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); 940 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta); 941 } 942 943 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r); 944 AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l); 945 946 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r); 947 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta); 948 949 { 950 v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 951 952 ILVRL_B2_SH(p1_org, p2_org, tp0, tp2); 953 ILVRL_B2_SH(q0_org, p0_org, tp1, tp3); 954 ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5); 955 956 ILVRL_H2_SH(tp1, tp0, tmp3, tmp4); 957 ILVRL_H2_SH(tp3, tp2, tmp6, tmp7); 958 959 src = data - 3; 960 ST_W4(tmp3, 0, 1, 2, 3, src, img_width); 961 ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width); 962 src += 4 * img_width; 963 ST_W4(tmp4, 0, 1, 2, 3, src, img_width); 964 ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width); 965 src += 4 * img_width; 966 967 ST_W4(tmp6, 0, 1, 2, 3, src, img_width); 968 ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width); 969 src += 4 * img_width; 970 ST_W4(tmp7, 0, 1, 2, 3, src, img_width); 971 ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width); 
972 } 973 } 974} 975 976static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, 977 ptrdiff_t stride, 978 int32_t alpha_in, 979 int32_t beta_in) 980{ 981 uint64_t load0, load1; 982 uint32_t out0, out2; 983 uint16_t out1, out3; 984 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; 985 v8u16 dst0_r, dst1_r, dst4_r, dst5_r; 986 v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r; 987 v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y; 988 v8i16 tmp0, tmp1, tmp2, tmp3; 989 v16u8 alpha, beta; 990 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; 991 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; 992 v16u8 is_less_than_beta1, is_less_than_beta2; 993 v16i8 src0 = { 0 }; 994 v16i8 src1 = { 0 }; 995 v16i8 src2 = { 0 }; 996 v16i8 src3 = { 0 }; 997 v16i8 src4 = { 0 }; 998 v16i8 src5 = { 0 }; 999 v16i8 src6 = { 0 }; 1000 v16i8 src7 = { 0 }; 1001 v16i8 zeros = { 0 }; 1002 1003 load0 = LD(src - 4); 1004 load1 = LD(src + stride - 4); 1005 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0); 1006 src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1); 1007 1008 load0 = LD(src + (2 * stride) - 4); 1009 load1 = LD(src + (3 * stride) - 4); 1010 src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0); 1011 src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1); 1012 1013 load0 = LD(src + (4 * stride) - 4); 1014 load1 = LD(src + (5 * stride) - 4); 1015 src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0); 1016 src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1); 1017 1018 load0 = LD(src + (6 * stride) - 4); 1019 load1 = LD(src + (7 * stride) - 4); 1020 src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0); 1021 src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1); 1022 1023 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 1024 src0, src1, src2, src3); 1025 1026 ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2); 1027 ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3); 1028 1029 ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); 1030 ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); 1031 SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5, 1032 8, src0, src2, src4, src7); 1033 1034 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); 1035 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); 1036 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3); 1037 1038 alpha = (v16u8) __msa_fill_b(alpha_in); 1039 beta = (v16u8) __msa_fill_b(beta_in); 1040 1041 is_less_than_alpha = (p0_asub_q0 < alpha); 1042 is_less_than_beta = (p1_asub_p0 < beta); 1043 is_less_than = is_less_than_alpha & is_less_than_beta; 1044 is_less_than_beta = (q1_asub_q0 < beta); 1045 is_less_than = is_less_than & is_less_than_beta; 1046 1047 alpha >>= 2; 1048 alpha += 2; 1049 1050 is_less_than_alpha = (p0_asub_q0 < alpha); 1051 1052 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2); 1053 is_less_than_beta1 = (p2_asub_p0 < beta); 1054 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3); 1055 is_less_than_beta2 = (q2_asub_q0 < beta); 1056 1057 ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1058 src0_r, src1_r, src2_r, src3_r); 1059 ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1060 src4_r, src5_r, src6_r, src7_r); 1061 1062 dst2_x_r = src1_r + src2_r + src3_r; 1063 dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r; 1064 dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3); 1065 dst1_r = src0_r + src1_r + src2_r + src3_r; 1066 dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2); 1067 1068 dst0_r = (2 * src6_r) + (3 * src0_r); 1069 
dst0_r += src1_r + src2_r + src3_r; 1070 dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3); 1071 dst2_y_r = (2 * src1_r) + src2_r + src4_r; 1072 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); 1073 1074 PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y); 1075 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1); 1076 1077 dst3_x_r = src2_r + src3_r + src4_r; 1078 dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r; 1079 dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3); 1080 dst4_r = src2_r + src3_r + src4_r + src5_r; 1081 dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2); 1082 1083 dst5_r = (2 * src7_r) + (3 * src5_r); 1084 dst5_r += src4_r + src3_r + src2_r; 1085 dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3); 1086 dst3_y_r = (2 * src4_r) + src3_r + src1_r; 1087 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2); 1088 1089 PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y); 1090 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2); 1091 1092 dst2_y_r = (2 * src1_r) + src2_r + src4_r; 1093 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2); 1094 dst3_y_r = (2 * src4_r) + src3_r + src1_r; 1095 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2); 1096 1097 PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y); 1098 1099 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha); 1100 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha); 1101 dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than); 1102 dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than); 1103 1104 is_less_than = is_less_than_alpha & is_less_than; 1105 dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r); 1106 is_less_than_beta1 = is_less_than_beta1 & is_less_than; 1107 dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1); 1108 1109 dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); 1110 dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1); 1111 dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r); 1112 is_less_than_beta2 = is_less_than_beta2 & is_less_than; 1113 dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2); 1114 dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r); 1115 dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2); 1116 1117 ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1); 1118 dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4); 1119 ILVRL_H2_SH(dst1, dst0, tmp0, tmp1); 1120 ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); 1121 1122 ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); 1123 SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5); 1124 dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); 1125 dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1); 1126 SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y); 1127 1128 out0 = __msa_copy_u_w((v4i32) dst0, 0); 1129 out1 = __msa_copy_u_h((v8i16) dst0, 2); 1130 out2 = __msa_copy_u_w((v4i32) dst1, 0); 1131 out3 = __msa_copy_u_h((v8i16) dst1, 2); 1132 1133 SW(out0, (src - 3)); 1134 SH(out1, (src + 1)); 1135 src += stride; 1136 SW(out2, (src - 3)); 1137 SH(out3, (src + 1)); 1138 src += stride; 1139 1140 out0 = __msa_copy_u_w((v4i32) dst2_x, 0); 1141 out1 = __msa_copy_u_h((v8i16) dst2_x, 2); 1142 out2 = __msa_copy_u_w((v4i32) dst3_x, 0); 1143 out3 = __msa_copy_u_h((v8i16) dst3_x, 2); 1144 1145 SW(out0, (src - 3)); 1146 SH(out1, (src + 1)); 1147 src += stride; 1148 SW(out2, (src - 3)); 1149 SH(out3, (src + 1)); 1150 src += stride; 1151 1152 out0 = __msa_copy_u_w((v4i32) dst4, 0); 1153 out1 = 
__msa_copy_u_h((v8i16) dst4, 2); 1154 out2 = __msa_copy_u_w((v4i32) dst5, 0); 1155 out3 = __msa_copy_u_h((v8i16) dst5, 2); 1156 1157 SW(out0, (src - 3)); 1158 SH(out1, (src + 1)); 1159 src += stride; 1160 SW(out2, (src - 3)); 1161 SH(out3, (src + 1)); 1162 src += stride; 1163 1164 out0 = __msa_copy_u_w((v4i32) dst2_y, 0); 1165 out1 = __msa_copy_u_h((v8i16) dst2_y, 2); 1166 out2 = __msa_copy_u_w((v4i32) dst3_y, 0); 1167 out3 = __msa_copy_u_h((v8i16) dst3_y, 2); 1168 1169 SW(out0, (src - 3)); 1170 SH(out1, (src + 1)); 1171 src += stride; 1172 SW(out2, (src - 3)); 1173 SH(out3, (src + 1)); 1174} 1175 1176static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, 1177 uint8_t alpha_in, 1178 uint8_t beta_in, 1179 ptrdiff_t img_width) 1180{ 1181 v16u8 alpha, beta; 1182 v16u8 is_less_than; 1183 v8i16 p0_or_q0, q0_or_p0; 1184 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; 1185 v16i8 zero = { 0 }; 1186 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1187 v16u8 is_less_than_alpha, is_less_than_beta; 1188 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1189 1190 alpha = (v16u8) __msa_fill_b(alpha_in); 1191 beta = (v16u8) __msa_fill_b(beta_in); 1192 1193 LD_UB4(data_cb_or_cr - (img_width << 1), img_width, 1194 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org); 1195 1196 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); 1197 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); 1198 q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); 1199 1200 is_less_than_alpha = (p0_asub_q0 < alpha); 1201 is_less_than_beta = (p1_asub_p0 < beta); 1202 is_less_than = is_less_than_beta & is_less_than_alpha; 1203 is_less_than_beta = (q1_asub_q0 < beta); 1204 is_less_than = is_less_than_beta & is_less_than; 1205 1206 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); 1207 1208 if (!__msa_test_bz_v(is_less_than)) { 1209 ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, 1210 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); 1211 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); 1212 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); 1213 PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); 1214 1215 p0_or_q0_org = 1216 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); 1217 q0_or_p0_org = 1218 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); 1219 1220 ST_UB(q0_or_p0_org, data_cb_or_cr); 1221 ST_UB(p0_or_q0_org, data_cb_or_cr - img_width); 1222 } 1223} 1224 1225static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, 1226 uint8_t alpha_in, 1227 uint8_t beta_in, 1228 ptrdiff_t img_width) 1229{ 1230 v8i16 tmp1; 1231 v16u8 alpha, beta, is_less_than; 1232 v8i16 p0_or_q0, q0_or_p0; 1233 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org; 1234 v16i8 zero = { 0 }; 1235 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1236 v16u8 is_less_than_alpha, is_less_than_beta; 1237 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1238 1239 { 1240 v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 1241 1242 LD_UB8((data_cb_or_cr - 2), img_width, 1243 row0, row1, row2, row3, row4, row5, row6, row7); 1244 1245 TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 1246 p1_or_q1_org, p0_or_q0_org, 1247 q0_or_p0_org, q1_or_p1_org); 1248 } 1249 1250 alpha = (v16u8) __msa_fill_b(alpha_in); 1251 beta = (v16u8) __msa_fill_b(beta_in); 1252 1253 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org); 1254 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org); 1255 
q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org); 1256 1257 is_less_than_alpha = (p0_asub_q0 < alpha); 1258 is_less_than_beta = (p1_asub_p0 < beta); 1259 is_less_than = is_less_than_beta & is_less_than_alpha; 1260 is_less_than_beta = (q1_asub_q0 < beta); 1261 is_less_than = is_less_than_beta & is_less_than; 1262 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than); 1263 1264 if (!__msa_test_bz_v(is_less_than)) { 1265 ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org, 1266 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r); 1267 1268 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0); 1269 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0); 1270 1271 /* convert 16 bit output into 8 bit output */ 1272 PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0); 1273 1274 p0_or_q0_org = 1275 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than); 1276 q0_or_p0_org = 1277 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than); 1278 tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org); 1279 1280 data_cb_or_cr -= 1; 1281 ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width); 1282 data_cb_or_cr += 4 * img_width; 1283 ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width); 1284 } 1285} 1286 1287static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride, 1288 uint8_t iAlpha, uint8_t iBeta, 1289 uint8_t* pTc) 1290{ 1291 v16u8 p0, p1, p2, q0, q1, q2; 1292 v16i8 iTc, negiTc, negTc, flags, f; 1293 v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r; 1294 v8i16 tc_l, tc_r, negTc_l, negTc_r; 1295 v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r; 1296 // Use for temporary variable 1297 v8i16 t0, t1, t2, t3; 1298 v16u8 alpha, beta; 1299 v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0; 1300 v16i8 const_1_b = __msa_ldi_b(1); 1301 v8i16 const_1_h = __msa_ldi_h(1); 1302 v8i16 const_4_h = __msa_ldi_h(4); 1303 v8i16 const_not_255_h = __msa_ldi_h(~255); 1304 v16i8 zero = { 0 }; 1305 v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2], 1306 pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2], 1307 pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2], 1308 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] }; 1309 negTc = zero - tc; 1310 iTc = tc; 1311 1312 // Load data from pPix 1313 LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r); 1314 LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r, 1315 p2_l, p2_r, q0_l, q0_r); 1316 TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r, 1317 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, 1318 p2, p1, p0, q0, q1, q2, alpha, beta); 1319 1320 alpha = (v16u8)__msa_fill_b(iAlpha); 1321 beta = (v16u8)__msa_fill_b(iBeta); 1322 1323 bDetaP0Q0 = __msa_asub_u_b(p0, q0); 1324 bDetaP1P0 = __msa_asub_u_b(p1, p0); 1325 bDetaQ1Q0 = __msa_asub_u_b(q1, q0); 1326 bDetaP2P0 = __msa_asub_u_b(p2, p0); 1327 bDetaQ2Q0 = __msa_asub_u_b(q2, q0); 1328 bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha); 1329 bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta); 1330 bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta); 1331 bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta); 1332 bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta); 1333 1334 // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits 1335 ILVRL_B2_SH(zero, p0, p0_r, p0_l); 1336 ILVRL_B2_SH(zero, p1, p1_r, p1_l); 1337 ILVRL_B2_SH(zero, p2, p2_r, p2_l); 1338 ILVRL_B2_SH(zero, q0, q0_r, q0_l); 1339 ILVRL_B2_SH(zero, q1, q1_r, q1_l); 1340 
ILVRL_B2_SH(zero, q2, q2_r, q2_l); 1341 // Signed extend tc, negTc from 8 bits to 16 bits 1342 flags = __msa_clt_s_b(tc, zero); 1343 ILVRL_B2(v8i16, flags, tc, tc_r, tc_l); 1344 flags = __msa_clt_s_b(negTc, zero); 1345 ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l); 1346 1347 f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0; 1348 flags = f & (v16i8)bDetaP2P0; 1349 flags = __msa_ceq_b(flags, zero); 1350 iTc += ((~flags) & const_1_b); 1351 flags = f & (v16i8)bDetaQ2Q0; 1352 flags = __msa_ceq_b(flags, zero); 1353 iTc += ((~flags) & const_1_b); 1354 negiTc = zero - iTc; 1355 // Signed extend iTc, negiTc from 8 bits to 16 bits 1356 flags = __msa_clt_s_b(iTc, zero); 1357 ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l); 1358 flags = __msa_clt_s_b(negiTc, zero); 1359 ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l); 1360 1361 // Calculate the left part 1362 // p1 1363 t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1; 1364 t0 = __msa_max_s_h(negTc_l, t0); 1365 t0 = __msa_min_s_h(tc_l, t0); 1366 t1 = p1_l + t0; 1367 // q1 1368 t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1; 1369 t0 = __msa_max_s_h(negTc_l, t0); 1370 t0 = __msa_min_s_h(tc_l, t0); 1371 t2 = q1_l + t0; 1372 // iDeta 1373 t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3; 1374 t0 = __msa_max_s_h(negiTc_l, t0); 1375 t0 = __msa_min_s_h(iTc_l, t0); 1376 p1_l = t1; 1377 q1_l = t2; 1378 // p0 1379 t1 = p0_l + t0; 1380 t2 = t1 & const_not_255_h; 1381 t3 = __msa_cle_s_h((v8i16)zero, t1); 1382 flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1383 p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1384 // q0 1385 t1 = q0_l - t0; 1386 t2 = t1 & const_not_255_h; 1387 t3 = __msa_cle_s_h((v8i16)zero, t1); 1388 flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1389 q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1390 1391 // Calculate the right part 1392 // p1 1393 t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1; 1394 t0 = __msa_max_s_h(negTc_r, t0); 1395 t0 = __msa_min_s_h(tc_r, t0); 1396 t1 = p1_r + t0; 1397 // q1 1398 t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1; 1399 t0 = __msa_max_s_h(negTc_r, t0); 1400 t0 = __msa_min_s_h(tc_r, t0); 1401 t2 = q1_r + t0; 1402 // iDeta 1403 t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3; 1404 t0 = __msa_max_s_h(negiTc_r, t0); 1405 t0 = __msa_min_s_h(iTc_r, t0); 1406 p1_r = t1; 1407 q1_r = t2; 1408 // p0 1409 t1 = p0_r + t0; 1410 t2 = t1 & const_not_255_h; 1411 t3 = __msa_cle_s_h((v8i16)zero, t1); 1412 flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1413 p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1414 // q0 1415 t1 = q0_r - t0; 1416 t2 = t1 & const_not_255_h; 1417 t3 = __msa_cle_s_h((v8i16)zero, t1); 1418 flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero); 1419 q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags)); 1420 1421 // Combined left and right 1422 PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r, 1423 t0, t1, t2, t3); 1424 flags = (v16i8)__msa_cle_s_b(zero, tc); 1425 flags &= f; 1426 p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags))); 1427 q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags))); 1428 // Using t1, t2 as temporary flags 1429 t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero)))); 1430 p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1)); 1431 t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero)))); 1432 q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2)); 1433 1434 ILVRL_B2_SH(p0, p1, t0, t1); 1435 ILVRL_B2_SH(q1, q0, t2, t3); 1436 ILVRL_H2_UB(t2, t0, p1, p0); 1437 ILVRL_H2_UB(t3, 
t1, q0, q1); 1438 // Store data to pPix 1439 ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride); 1440 ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride); 1441} 1442 1443static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, 1444 uint8_t bs0, uint8_t bs1, 1445 uint8_t bs2, uint8_t bs3, 1446 uint8_t tc0, uint8_t tc1, 1447 uint8_t tc2, uint8_t tc3, 1448 uint8_t alpha_in, 1449 uint8_t beta_in, 1450 ptrdiff_t image_width) 1451{ 1452 v16u8 tmp_vec; 1453 v16u8 bs = { 0 }; 1454 1455 tmp_vec = (v16u8) __msa_fill_b(bs0); 1456 bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec); 1457 tmp_vec = (v16u8) __msa_fill_b(bs1); 1458 bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec); 1459 tmp_vec = (v16u8) __msa_fill_b(bs2); 1460 bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec); 1461 tmp_vec = (v16u8) __msa_fill_b(bs3); 1462 bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec); 1463 1464 if (!__msa_test_bz_v(bs)) { 1465 v16u8 alpha, beta, is_less_than, is_less_than_beta; 1466 v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; 1467 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; 1468 v16u8 is_less_than_alpha, is_bs_greater_than0; 1469 v8i16 p0_r, q0_r, p0_l, q0_l; 1470 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; 1471 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; 1472 v16i8 zero = { 0 }; 1473 v16i8 tc = { 0 }; 1474 1475 tmp_vec = (v16u8) __msa_fill_b(tc0); 1476 tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec); 1477 tmp_vec = (v16u8) __msa_fill_b(tc1); 1478 tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec); 1479 tmp_vec = (v16u8) __msa_fill_b(tc2); 1480 tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec); 1481 tmp_vec = (v16u8) __msa_fill_b(tc3); 1482 tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec); 1483 1484 alpha = (v16u8) __msa_fill_b(alpha_in); 1485 beta = (v16u8) __msa_fill_b(beta_in); 1486 1487 LD_UB5(data - (3 * image_width), image_width, 1488 p2_org, p1_org, p0_org, q0_org, q1_org); 1489 1490 is_bs_greater_than0 = ((v16u8) zero < bs); 1491 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org); 1492 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org); 1493 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org); 1494 1495 is_less_than_alpha = (p0_asub_q0 < alpha); 1496 is_less_than_beta = (p1_asub_p0 < beta); 1497 is_less_than = is_less_than_beta & is_less_than_alpha; 1498 is_less_than_beta = (q1_asub_q0 < beta); 1499 is_less_than = is_less_than_beta & is_less_than; 1500 is_less_than = is_less_than & is_bs_greater_than0; 1501 1502 if (!__msa_test_bz_v(is_less_than)) { 1503 v16i8 sign_negate_tc, negate_tc; 1504 v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r; 1505 v16u8 p2_asub_p0, q2_asub_q0; 1506 1507 q2_org = LD_UB(data + (2 * image_width)); 1508 negate_tc = zero - tc; 1509 sign_negate_tc = __msa_clti_s_b(negate_tc, 0); 1510 1511 ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l); 1512 1513 UNPCK_UB_SH(tc, tc_r, tc_l); 1514 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l); 1515 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l); 1516 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l); 1517 1518 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org); 1519 is_less_than_beta = (p2_asub_p0 < beta); 1520 is_less_than_beta = is_less_than_beta & is_less_than; 1521 1522 if (!__msa_test_bz_v(is_less_than_beta)) { 1523 v16u8 p1; 1524 v8i16 p1_r = { 0 }; 1525 v8i16 p1_l = { 0 }; 1526 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org); 1527 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org); 1528 1529 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, 
p1_org_r, p2_org_r, 1530 negate_tc_r, tc_r, p1_r); 1531 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l, 1532 i16_negatetc_l, tc_l, p1_l); 1533 1534 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r); 1535 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta); 1536 ST_UB(p1_org, data - (2 * image_width)); 1537 1538 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); 1539 tc = tc + (v16i8) is_less_than_beta; 1540 } 1541 1542 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org); 1543 is_less_than_beta = (q2_asub_q0 < beta); 1544 is_less_than_beta = is_less_than_beta & is_less_than; 1545 1546 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org); 1547 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org); 1548 1549 if (!__msa_test_bz_v(is_less_than_beta)) { 1550 v16u8 q1; 1551 v8i16 q1_r = { 0 }; 1552 v8i16 q1_l = { 0 }; 1553 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org); 1554 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org); 1555 1556 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r, 1557 negate_tc_r, tc_r, q1_r); 1558 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l, 1559 i16_negatetc_l, tc_l, q1_l); 1560 1561 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r); 1562 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta); 1563 ST_UB(q1_org, data + image_width); 1564 1565 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1); 1566 tc = tc + (v16i8) is_less_than_beta; 1567 } 1568 { 1569 v16i8 negate_thresh, sign_negate_thresh; 1570 v8i16 threshold_r, threshold_l; 1571 v8i16 negate_thresh_l, negate_thresh_r; 1572 1573 negate_thresh = zero - tc; 1574 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0); 1575 1576 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh, 1577 threshold_r, negate_thresh_r); 1578 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, 1579 negate_thresh_r, threshold_r, p0_r, q0_r); 1580 1581 threshold_l = (v8i16) __msa_ilvl_b(zero, tc); 1582 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh, 1583 negate_thresh); 1584 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l, 1585 negate_thresh_l, threshold_l, p0_l, q0_l); 1586 } 1587 1588 PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0); 1589 1590 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than); 1591 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than); 1592 1593 ST_UB(p0_org, (data - image_width)); 1594 ST_UB(q0_org, data); 1595 } 1596 } 1597} 1598 1599static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride, 1600 int32_t alpha_in, int32_t beta_in, 1601 int8_t *tc0) 1602{ 1603 uint8_t *data = in; 1604 uint32_t out0, out1, out2, out3; 1605 uint64_t load; 1606 uint32_t tc_val; 1607 v16u8 alpha, beta; 1608 v16i8 inp0 = { 0 }; 1609 v16i8 inp1 = { 0 }; 1610 v16i8 inp2 = { 0 }; 1611 v16i8 inp3 = { 0 }; 1612 v16i8 inp4 = { 0 }; 1613 v16i8 inp5 = { 0 }; 1614 v16i8 inp6 = { 0 }; 1615 v16i8 inp7 = { 0 }; 1616 v16i8 src0, src1, src2, src3; 1617 v8i16 src4, src5, src6, src7; 1618 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0; 1619 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; 1620 v16u8 is_less_than_beta1, is_less_than_beta2; 1621 v8i16 tc, tc_orig_r, tc_plus1; 1622 v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 }; 1623 v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1; 1624 v8i16 src2_r, src3_r; 1625 v8i16 p2_r, p1_r, q2_r, q1_r; 1626 v16u8 p2, q2, p0, q0; 1627 v4i32 dst0, dst1; 1628 v16i8 zeros = { 0 }; 1629 1630 alpha = (v16u8) __msa_fill_b(alpha_in); 1631 beta = (v16u8) __msa_fill_b(beta_in); 1632 1633 if (tc0[0] < 0) { 1634 data += (2 * 
static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
                                             int32_t alpha_in, int32_t beta_in,
                                             int8_t *tc0)
{
    uint8_t *data = in;
    uint32_t out0, out1, out2, out3;
    uint64_t load;
    uint32_t tc_val;
    v16u8 alpha, beta;
    v16i8 inp0 = { 0 };
    v16i8 inp1 = { 0 };
    v16i8 inp2 = { 0 };
    v16i8 inp3 = { 0 };
    v16i8 inp4 = { 0 };
    v16i8 inp5 = { 0 };
    v16i8 inp6 = { 0 };
    v16i8 inp7 = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 src4, src5, src6, src7;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
    v16u8 is_less_than_beta1, is_less_than_beta2;
    v8i16 tc, tc_orig_r, tc_plus1;
    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
    v8i16 src2_r, src3_r;
    v8i16 p2_r, p1_r, q2_r, q1_r;
    v16u8 p2, q2, p0, q0;
    v4i32 dst0, dst1;
    v16i8 zeros = { 0 };

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    if (tc0[0] < 0) {
        data += (2 * stride);
    } else {
        load = LD(data - 3);
        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
        load = LD(data - 3 + stride);
        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
        data += (2 * stride);
    }

    if (tc0[1] < 0) {
        data += (2 * stride);
    } else {
        load = LD(data - 3);
        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
        load = LD(data - 3 + stride);
        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
        data += (2 * stride);
    }

    if (tc0[2] < 0) {
        data += (2 * stride);
    } else {
        load = LD(data - 3);
        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
        load = LD(data - 3 + stride);
        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
        data += (2 * stride);
    }

    if (tc0[3] < 0) {
        data += (2 * stride);
    } else {
        load = LD(data - 3);
        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
        load = LD(data - 3 + stride);
        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
        data += (2 * stride);
    }

    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
               src0, src1, src2, src3);

    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);

    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);

    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    is_less_than_beta1 = (p2_asub_p0 < beta);
    is_less_than_beta2 = (q2_asub_q0 < beta);

    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);

    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
    p2_r += p0_add_q0;
    p2_r >>= 1;
    p2_r -= p1_r;
    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
    q2_r += p0_add_q0;
    q2_r >>= 1;
    q2_r -= q1_r;

    tc_val = LW(tc0);
    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
    is_tc_orig1 = tc_orig;
    is_tc_orig2 = tc_orig;
    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
    tc = tc_orig_r;

    CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
    CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);

    p2_r += p1_r;
    q2_r += q1_r;

    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);

    is_tc_orig1 = (zeros < is_tc_orig1);
    is_tc_orig2 = is_tc_orig1;
    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
    is_tc_orig1 = is_less_than & is_tc_orig1;
    is_tc_orig2 = is_less_than & is_tc_orig2;

    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);

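    /* Standard normal-mode delta for p0/q0:
     * ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, clipped to [-tc, tc], where tc
     * is incremented once for each of the p2/q2 spatial-activity conditions
     * that held above. */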
    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
    q0_sub_p0 <<= 2;
    p1_sub_q1 = p1_r - q1_r;
    q0_sub_p0 += p1_sub_q1;
    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);

    tc_plus1 = tc + 1;
    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
                                              (v16i8) is_less_than_beta1);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
    tc_plus1 = tc + 1;
    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
                                              (v16i8) is_less_than_beta2);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);

    CLIP_SH(q0_sub_p0, -tc, tc);

    ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
    src2_r += q0_sub_p0;
    src3_r -= q0_sub_p0;

    CLIP_SH2_0_255(src2_r, src3_r);

    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);

    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);

    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);

    ILVRL_H2_SW(q2, p2, dst0, dst1);

    data = in;

    out0 = __msa_copy_u_w(dst0, 0);
    out1 = __msa_copy_u_w(dst0, 1);
    out2 = __msa_copy_u_w(dst0, 2);
    out3 = __msa_copy_u_w(dst0, 3);

    if (tc0[0] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[1] < 0) {
        data += (2 * stride);
    } else {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
        data += stride;
    }

    out0 = __msa_copy_u_w(dst1, 0);
    out1 = __msa_copy_u_w(dst1, 1);
    out2 = __msa_copy_u_w(dst1, 2);
    out3 = __msa_copy_u_w(dst1, 3);

    if (tc0[2] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[3] >= 0) {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
    }
}

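/* The chroma inter-edge filters below apply the H.264 normal-mode p0/q0
 * update via AVC_LPF_P0Q0; in scalar form the operation is roughly
 *
 *     delta = av_clip((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
 *     p0'   = av_clip_uint8(p0 + delta);
 *     q0'   = av_clip_uint8(q0 - delta);
 *
 * with the result masked by the alpha/beta/bs conditions for each lane. */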
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
{
    v16u8 alpha, beta;
    v8i16 tmp_vec;
    v8i16 bs = { 0 };
    v8i16 tc = { 0 };
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v8i16 p0_r, q0_r;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 tc_r, negate_tc_r;
    v16i8 zero = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB4(data - (img_width << 1), img_width,
               p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);

            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(q0_org, data);
            ST_UB(p0_org, (data - img_width));
        }
    }
}

static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
{
    uint8_t *src;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p0, q0;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16u8 is_bs_greater_than0;
    v8i16 tc_r, negate_tc_r;
    v16i8 negate_tc, sign_negate_tc;
    v16i8 zero = { 0 };
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 tmp1, tmp_vec, bs = { 0 };
    v8i16 tc = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        LD_UB8((data - 2), img_width,
               row0, row1, row2, row3, row4, row5, row6, row7);

        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
                           row4, row5, row6, row7,
                           p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
            src = data - 1;
            ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
            src += 4 * img_width;
            ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
        }
    }
}

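/* Horizontal filters for 4:2:2 chroma: the edge is processed four rows at a
 * time, one tc0[] entry per group of rows; AVC_LPF_H_CHROMA_422 and
 * AVC_LPF_H_2BYTE_CHROMA_422 transpose, filter and return the packed
 * p0'/q0' pairs, which are stored back starting at column -1. */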
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
                                            int32_t alpha_in, int32_t beta_in,
                                            int8_t *tc0)
{
    int32_t col, tc_val;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = (tc0[col] - 1) + 1;

        if (tc_val <= 0) {
            src += (4 * stride);
            continue;
        }

        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
        src += (4 * stride);
    }
}

static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                                  ptrdiff_t stride,
                                                  int32_t alpha_in,
                                                  int32_t beta_in,
                                                  int8_t *tc0)
{
    int32_t col, tc_val;
    int16_t out0, out1;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = (tc0[col] - 1) + 1;

        if (tc_val <= 0) {
            src += 4 * stride;
            continue;
        }

        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);

        out0 = __msa_copy_s_h((v8i16) res, 0);
        out1 = __msa_copy_s_h((v8i16) res, 1);

        SH(out0, (src - 1));
        src += stride;
        SH(out1, (src - 1));
        src += stride;
    }
}

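/* Exported entry points: thin wrappers mapping the ff_h264_* DSP interface
 * onto the MSA helpers above.  A boundary strength of 0 is signalled by a
 * negative tc[] entry, which the wrappers translate into bs = 0 for the
 * helpers that still take explicit bs values. */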
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
//    uint8_t bs0 = 1;
//    uint8_t bs1 = 1;
//    uint8_t bs2 = 1;
//    uint8_t bs3 = 1;
//
//    if (tc[0] < 0)
//        bs0 = 0;
//    if (tc[1] < 0)
//        bs1 = 0;
//    if (tc[2] < 0)
//        bs2 = 0;
//    if (tc[3] < 0)
//        bs3 = 0;
//
//    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
//                                           tc[0], tc[1], tc[2], tc[3],
//                                           alpha, beta, img_width);
    avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, tc);
}

void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           img_width);
}

void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           img_width);
}

void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               img_width);
}

void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               img_width);
}

void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
                                         ptrdiff_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                               ptrdiff_t ystride,
                                               int32_t alpha,
                                               int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
                                          ptrdiff_t ystride,
                                          int32_t alpha,
                                          int32_t beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                ptrdiff_t ystride,
                                                int32_t alpha,
                                                int32_t beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}

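/* Weighted prediction: each pixel is scaled and offset, roughly the scalar
 *
 *     dst = av_clip_uint8((src * weight + (offset << log2_denom)
 *                          + (log2_denom ? (1 << (log2_denom - 1)) : 0))
 *                         >> log2_denom);
 *
 * below, the offset is pre-shifted into the additive constant and the
 * rounding term comes from the rounded right shift (SRLR), with a clamp to
 * zero before the shift and an unsigned saturate to 8 bits afterwards. */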
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset_in)
{
    uint32_t offset_val;
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(weight_src);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
               src2_r, src3_r);
    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
               src2_l, src3_l);
    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
               src6_r, src7_r);
    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
               src6_l, src7_l);
    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
         tmp3);
    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
         tmp7);
    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
         tmp11);
    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
         tmp14, tmp15);
    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
                tmp1, tmp2, tmp3);
    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
                tmp5, tmp6, tmp7);
    ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
                tmp9, tmp10, tmp11);
    ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
                tmp12, tmp13, tmp14, tmp15);
    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
    src += 8 * stride;

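    /* process the remaining 8 rows for 16-tall blocks */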
    if (16 == height) {
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
                   src1_r, src2_r, src3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
                   src1_l, src2_l, src3_l);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
                   src5_r, src6_r, src7_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
                   src5_l, src6_l, src7_l);
        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
             tmp2, tmp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
             tmp6, tmp7);
        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
             tmp10, tmp11);
        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
             tmp14, tmp15);
        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
                    tmp4, tmp5, tmp6, tmp7);
        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
                    tmp8, tmp9, tmp10, tmp11);
        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
                    tmp12, tmp13, tmp14, tmp15);
        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
    }
}

void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
    }
}

void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
    }
}

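/* Bidirectional weighted prediction: both references are combined, roughly
 * the scalar
 *
 *     dst = av_clip_uint8((src * weight_src + dst * weight_dst
 *                          + (((offset + 1) | 1) << log2_denom))
 *                         >> (log2_denom + 1));
 *
 * the extra 128 * (weight_src + weight_dst) folded into the offset below
 * compensates for the XOR-with-128 step that maps the unsigned pixels to
 * signed bytes so that the signed dot-product-add (DPADD_S.H) can be used. */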
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset_in)
{
    v16i8 src_wgt, dst_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    v8i16 denom, offset;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += (128 * (weight_src + weight_dst));

    src_wgt = __msa_fill_b(weight_src);
    dst_wgt = __msa_fill_b(weight_dst);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += 8 * stride;
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
               vec6);
    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
               vec7);
    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
               vec12, vec14);
    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
               vec13, vec15);
    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += 8 * stride;

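    /* second pass over the remaining 8 rows for 16-tall blocks */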
    if (16 == height) {
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
        XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
                   vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
                   vec5, vec7);
        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
                   vec12, vec14);
        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
                   vec13, vec15);
        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    }
}

void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (8 == height) {
        avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    }
}

void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (4 == height) {
        avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    }
}