1/* 2 * Copyright (c) 2015 -2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "libavutil/mips/generic_macros_msa.h" 22#include "libavcodec/mips/hevcdsp_mips.h" 23 24static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, 25 int32_t beta, int32_t *tc, 26 uint8_t *p_is_pcm, uint8_t *q_is_pcm) 27{ 28 uint8_t *p3 = src - (stride << 2); 29 uint8_t *p2 = src - ((stride << 1) + stride); 30 uint8_t *p1 = src - (stride << 1); 31 uint8_t *p0 = src - stride; 32 uint8_t *q0 = src; 33 uint8_t *q1 = src + stride; 34 uint8_t *q2 = src + (stride << 1); 35 uint8_t *q3 = src + (stride << 1) + stride; 36 uint8_t flag0, flag1; 37 int32_t dp00, dq00, dp30, dq30, d00, d30; 38 int32_t d0030, d0434; 39 int32_t dp04, dq04, dp34, dq34, d04, d34; 40 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; 41 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; 42 uint64_t dst_val0, dst_val1; 43 v16u8 dst0, dst1, dst2, dst3, dst4, dst5; 44 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec; 45 v2i64 cmp3; 46 v8u16 temp0, temp1; 47 v8i16 temp2; 48 v8i16 tc_pos, tc_neg; 49 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0; 50 v16i8 zero = { 0 }; 51 v8u16 p3_src, p2_src, p1_src, p0_src, 
q0_src, q1_src, q2_src, q3_src; 52 53 dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]); 54 dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]); 55 dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]); 56 dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]); 57 d00 = dp00 + dq00; 58 d30 = dp30 + dq30; 59 dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]); 60 dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]); 61 dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]); 62 dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]); 63 d04 = dp04 + dq04; 64 d34 = dp34 + dq34; 65 66 p_is_pcm0 = p_is_pcm[0]; 67 p_is_pcm4 = p_is_pcm[1]; 68 q_is_pcm0 = q_is_pcm[0]; 69 q_is_pcm4 = q_is_pcm[1]; 70 71 cmp0 = __msa_fill_d(p_is_pcm0); 72 cmp1 = __msa_fill_d(p_is_pcm4); 73 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 74 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); 75 76 d0030 = (d00 + d30) >= beta; 77 d0434 = (d04 + d34) >= beta; 78 79 cmp0 = (v2i64) __msa_fill_w(d0030); 80 cmp1 = (v2i64) __msa_fill_w(d0434); 81 cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0); 82 cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0); 83 84 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) && 85 (!d0030 || !d0434)) { 86 p3_src = LD_UH(p3); 87 p2_src = LD_UH(p2); 88 p1_src = LD_UH(p1); 89 p0_src = LD_UH(p0); 90 91 cmp0 = __msa_fill_d(q_is_pcm0); 92 cmp1 = __msa_fill_d(q_is_pcm4); 93 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 94 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); 95 96 tc0 = tc[0]; 97 beta30 = beta >> 3; 98 beta20 = beta >> 2; 99 tc250 = ((tc0 * 5 + 1) >> 1); 100 tc4 = tc[1]; 101 tc254 = ((tc4 * 5 + 1) >> 1); 102 103 cmp0 = (v2i64) __msa_fill_h(tc0); 104 cmp1 = (v2i64) __msa_fill_h(tc4); 105 106 ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src, 107 p3_src, p2_src, p1_src, p0_src); 108 q0_src = LD_UH(q0); 109 q1_src = LD_UH(q1); 110 q2_src = LD_UH(q2); 111 q3_src = LD_UH(q3); 112 113 flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 && 114 abs(p0[0] - q0[0]) < tc250; 115 flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 && 116 abs(p0[3] 
- q0[3]) < tc250 && (d00 << 1) < beta20 && 117 (d30 << 1) < beta20); 118 119 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); 120 ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src, 121 q0_src, q1_src, q2_src, q3_src); 122 flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 && 123 abs(p0[4] - q0[4]) < tc254; 124 flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 && 125 abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 && 126 (d34 << 1) < beta20); 127 128 cmp0 = (v2i64) __msa_fill_w(flag0); 129 cmp1 = (v2i64) __msa_fill_w(flag1); 130 cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0); 131 cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0); 132 133 if (flag0 && flag1) { /* strong only */ 134 /* strong filter */ 135 tc_pos <<= 1; 136 tc_neg = -tc_pos; 137 138 /* p part */ 139 temp0 = (p1_src + p0_src + q0_src); 140 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; 141 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 142 temp2 = (v8i16) (temp1 - p2_src); 143 CLIP_SH(temp2, tc_neg, tc_pos); 144 dst0 = (v16u8) (temp2 + (v8i16) p2_src); 145 146 temp1 = temp0 + p2_src; 147 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 148 temp2 = (v8i16) (temp1 - p1_src); 149 CLIP_SH(temp2, tc_neg, tc_pos); 150 dst1 = (v16u8) (temp2 + (v8i16) p1_src); 151 152 temp1 = (temp0 << 1) + p2_src + q1_src; 153 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 154 temp2 = (v8i16) (temp1 - p0_src); 155 CLIP_SH(temp2, tc_neg, tc_pos); 156 dst2 = (v16u8) (temp2 + (v8i16) p0_src); 157 158 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); 159 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); 160 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); 161 162 /* q part */ 163 temp0 = (q1_src + p0_src + q0_src); 164 165 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; 166 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 167 temp2 = (v8i16) (temp1 - q2_src); 168 CLIP_SH(temp2, tc_neg, tc_pos); 169 dst5 = (v16u8) (temp2 + (v8i16) 
q2_src); 170 171 temp1 = temp0 + q2_src; 172 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 173 temp2 = (v8i16) (temp1 - q1_src); 174 CLIP_SH(temp2, tc_neg, tc_pos); 175 dst4 = (v16u8) (temp2 + (v8i16) q1_src); 176 177 temp1 = (temp0 << 1) + p1_src + q2_src; 178 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 179 temp2 = (v8i16) (temp1 - q0_src); 180 CLIP_SH(temp2, tc_neg, tc_pos); 181 dst3 = (v16u8) (temp2 + (v8i16) q0_src); 182 183 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); 184 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec); 185 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); 186 187 /* pack results to 8 bit */ 188 PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1); 189 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 190 191 /* pack src to 8 bit */ 192 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4); 193 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src); 194 195 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3); 196 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3); 197 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3); 198 199 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0); 200 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1); 201 202 ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); 203 SD(dst_val0, p2 + 4 * stride); 204 SD(dst_val1, p2 + 5 * stride); 205 /* strong filter ends */ 206 } else if (flag0 == flag1) { /* weak only */ 207 /* weak filter */ 208 tc_neg = -tc_pos; 209 210 diff0 = (v8i16) (q0_src - p0_src); 211 diff1 = (v8i16) (q1_src - p1_src); 212 diff0 = (diff0 << 3) + diff0; 213 diff1 = (diff1 << 1) + diff1; 214 delta0 = diff0 - diff1; 215 delta0 = __msa_srari_h(delta0, 4); 216 217 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); 218 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); 219 abs_delta0 = (v8u16) abs_delta0 < temp1; 220 221 CLIP_SH(delta0, tc_neg, tc_pos); 222 223 temp2 = (v8i16) (delta0 + p0_src); 224 CLIP_SH_0_255(temp2); 225 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, 226 
(v16u8) p_is_pcm_vec); 227 228 temp2 = (v8i16) (q0_src - delta0); 229 CLIP_SH_0_255(temp2); 230 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 231 (v16u8) q_is_pcm_vec); 232 233 p_is_pcm_vec = ~p_is_pcm_vec; 234 q_is_pcm_vec = ~q_is_pcm_vec; 235 tmp = (beta + (beta >> 1)) >> 3; 236 cmp0 = __msa_fill_d(dp00 + dp30 < tmp); 237 cmp1 = __msa_fill_d(dp04 + dp34 < tmp); 238 cmp0 = __msa_ilvev_d(cmp1, cmp0); 239 cmp0 = __msa_ceqi_d(cmp0, 0); 240 p_is_pcm_vec = p_is_pcm_vec | cmp0; 241 242 cmp0 = __msa_fill_d(dq00 + dq30 < tmp); 243 cmp1 = __msa_fill_d(dq04 + dq34 < tmp); 244 cmp0 = __msa_ilvev_d(cmp1, cmp0); 245 cmp0 = __msa_ceqi_d(cmp0, 0); 246 q_is_pcm_vec = q_is_pcm_vec | cmp0; 247 248 tc_pos >>= 1; 249 tc_neg = -tc_pos; 250 251 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); 252 delta1 -= (v8i16) p1_src; 253 delta1 += delta0; 254 delta1 >>= 1; 255 CLIP_SH(delta1, tc_neg, tc_pos); 256 delta1 = (v8i16) p1_src + (v8i16) delta1; 257 CLIP_SH_0_255(delta1); 258 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, 259 (v16u8) p_is_pcm_vec); 260 261 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src); 262 delta2 = delta2 - (v8i16) q1_src; 263 delta2 = delta2 - delta0; 264 delta2 = delta2 >> 1; 265 CLIP_SH(delta2, tc_neg, tc_pos); 266 delta2 = (v8i16) q1_src + (v8i16) delta2; 267 CLIP_SH_0_255(delta2); 268 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, 269 (v16u8) q_is_pcm_vec); 270 271 dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, 272 (v16u8) abs_delta0); 273 dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, 274 (v16u8) abs_delta0); 275 dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 276 (v16u8) abs_delta0); 277 dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, 278 (v16u8) abs_delta0); 279 /* pack results to 8 bit */ 280 PCKEV_B2_UB(dst2, dst1, dst4, dst3, dst0, dst1); 281 282 /* pack src to 8 bit */ 283 PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3); 284 285 dst0 = __msa_bmz_v(dst0, dst2, 
(v16u8) cmp3); 286 dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3); 287 288 p2 += stride; 289 ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); 290 /* weak filter ends */ 291 } else { /* strong + weak */ 292 /* strong filter */ 293 tc_pos <<= 1; 294 tc_neg = -tc_pos; 295 296 /* p part */ 297 temp0 = (p1_src + p0_src + q0_src); 298 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; 299 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 300 temp2 = (v8i16) (temp1 - p2_src); 301 CLIP_SH(temp2, tc_neg, tc_pos); 302 dst0 = (v16u8) (temp2 + (v8i16) p2_src); 303 304 temp1 = temp0 + p2_src; 305 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 306 temp2 = (v8i16) (temp1 - p1_src); 307 CLIP_SH(temp2, tc_neg, tc_pos); 308 dst1 = (v16u8) (temp2 + (v8i16) p1_src); 309 310 temp1 = (temp0 << 1) + p2_src + q1_src; 311 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 312 temp2 = (v8i16) (temp1 - p0_src); 313 CLIP_SH(temp2, tc_neg, tc_pos); 314 dst2 = (v16u8) (temp2 + (v8i16) p0_src); 315 316 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); 317 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); 318 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); 319 320 /* q part */ 321 temp0 = (q1_src + p0_src + q0_src); 322 323 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; 324 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 325 temp2 = (v8i16) (temp1 - q2_src); 326 CLIP_SH(temp2, tc_neg, tc_pos); 327 dst5 = (v16u8) (temp2 + (v8i16) q2_src); 328 329 temp1 = temp0 + q2_src; 330 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 331 temp2 = (v8i16) (temp1 - q1_src); 332 CLIP_SH(temp2, tc_neg, tc_pos); 333 dst4 = (v16u8) (temp2 + (v8i16) q1_src); 334 335 temp1 = (temp0 << 1) + p1_src + q2_src; 336 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 337 temp2 = (v8i16) (temp1 - q0_src); 338 CLIP_SH(temp2, tc_neg, tc_pos); 339 dst3 = (v16u8) (temp2 + (v8i16) q0_src); 340 341 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); 342 dst4 = __msa_bmz_v(dst4, 
(v16u8) q1_src, (v16u8) q_is_pcm_vec); 343 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); 344 345 /* pack strong results to 8 bit */ 346 PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1); 347 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); 348 /* strong filter ends */ 349 350 /* weak filter */ 351 tc_pos >>= 1; 352 tc_neg = -tc_pos; 353 354 diff0 = (v8i16) (q0_src - p0_src); 355 diff1 = (v8i16) (q1_src - p1_src); 356 diff0 = (diff0 << 3) + diff0; 357 diff1 = (diff1 << 1) + diff1; 358 delta0 = diff0 - diff1; 359 delta0 = __msa_srari_h(delta0, 4); 360 361 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); 362 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); 363 abs_delta0 = (v8u16) abs_delta0 < temp1; 364 365 CLIP_SH(delta0, tc_neg, tc_pos); 366 367 temp2 = (v8i16) (delta0 + p0_src); 368 CLIP_SH_0_255(temp2); 369 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, 370 (v16u8) p_is_pcm_vec); 371 372 temp2 = (v8i16) (q0_src - delta0); 373 CLIP_SH_0_255(temp2); 374 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 375 (v16u8) q_is_pcm_vec); 376 377 p_is_pcm_vec = ~p_is_pcm_vec; 378 q_is_pcm_vec = ~q_is_pcm_vec; 379 tmp = (beta + (beta >> 1)) >> 3; 380 cmp0 = __msa_fill_d(dp00 + dp30 < tmp); 381 cmp1 = __msa_fill_d(dp04 + dp34 < tmp); 382 cmp0 = __msa_ilvev_d(cmp1, cmp0); 383 p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0); 384 385 cmp0 = __msa_fill_d(dq00 + dq30 < tmp); 386 cmp1 = __msa_fill_d(dq04 + dq34 < tmp); 387 cmp0 = __msa_ilvev_d(cmp1, cmp0); 388 q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0); 389 390 tc_pos >>= 1; 391 tc_neg = -tc_pos; 392 393 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); 394 delta1 -= (v8i16) p1_src; 395 delta1 += delta0; 396 delta1 >>= 1; 397 CLIP_SH(delta1, tc_neg, tc_pos); 398 delta1 = (v8i16) p1_src + (v8i16) delta1; 399 CLIP_SH_0_255(delta1); 400 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, 401 (v16u8) p_is_pcm_vec); 402 403 delta2 = (v8i16) __msa_aver_u_h(q0_src, 
q2_src); 404 delta2 = delta2 - (v8i16) q1_src; 405 delta2 = delta2 - delta0; 406 delta2 = delta2 >> 1; 407 CLIP_SH(delta2, tc_neg, tc_pos); 408 delta2 = (v8i16) q1_src + (v8i16) delta2; 409 CLIP_SH_0_255(delta2); 410 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, 411 (v16u8) q_is_pcm_vec); 412 413 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, 414 (v16u8) abs_delta0); 415 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, 416 (v16u8) abs_delta0); 417 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 418 (v16u8) abs_delta0); 419 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, 420 (v16u8) abs_delta0); 421 /* weak filter ends */ 422 423 /* pack weak results to 8 bit */ 424 PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4); 425 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2); 426 427 /* select between weak or strong */ 428 dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2); 429 dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2); 430 dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2); 431 432 /* pack src to 8 bit */ 433 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4); 434 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src); 435 436 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3); 437 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3); 438 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3); 439 440 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0); 441 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1); 442 443 ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); 444 SD(dst_val0, p2 + 4 * stride); 445 SD(dst_val1, p2 + 5 * stride); 446 } 447 } 448} 449 450static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, 451 int32_t beta, int32_t *tc, 452 uint8_t *p_is_pcm, uint8_t *q_is_pcm) 453{ 454 uint8_t *p3 = src; 455 uint8_t *p2 = src + 3 * stride; 456 uint8_t *p1 = src + (stride << 2); 457 uint8_t *p0 = src + 7 * stride; 458 uint8_t flag0, flag1; 459 uint16_t tmp0, tmp1; 460 uint32_t tmp2, tmp3; 461 int32_t dp00, dq00, 
dp30, dq30, d00, d30; 462 int32_t d0030, d0434; 463 int32_t dp04, dq04, dp34, dq34, d04, d34; 464 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; 465 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; 466 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 467 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec; 468 v2i64 cmp3; 469 v8u16 temp0, temp1; 470 v8i16 temp2; 471 v8i16 tc_pos, tc_neg; 472 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0; 473 v16i8 zero = { 0 }; 474 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; 475 476 dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]); 477 dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]); 478 dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]); 479 dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]); 480 d00 = dp00 + dq00; 481 d30 = dp30 + dq30; 482 p_is_pcm0 = p_is_pcm[0]; 483 q_is_pcm0 = q_is_pcm[0]; 484 485 dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]); 486 dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]); 487 dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]); 488 dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]); 489 d04 = dp04 + dq04; 490 d34 = dp34 + dq34; 491 p_is_pcm4 = p_is_pcm[1]; 492 q_is_pcm4 = q_is_pcm[1]; 493 494 cmp0 = __msa_fill_d(p_is_pcm0); 495 cmp1 = __msa_fill_d(p_is_pcm4); 496 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 497 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); 498 499 d0030 = (d00 + d30) >= beta; 500 d0434 = (d04 + d34) >= beta; 501 502 cmp0 = __msa_fill_d(d0030); 503 cmp1 = __msa_fill_d(d0434); 504 cmp3 = __msa_ilvev_d(cmp1, cmp0); 505 cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0); 506 507 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) && 508 (!d0030 || !d0434)) { 509 src -= 4; 510 LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, 511 q2_src, q3_src); 512 513 cmp0 = __msa_fill_d(q_is_pcm0); 514 cmp1 = __msa_fill_d(q_is_pcm4); 515 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 516 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); 517 518 tc0 = tc[0]; 519 beta30 = beta >> 3; 520 beta20 = beta >> 2; 521 
tc250 = ((tc0 * 5 + 1) >> 1); 522 523 tc4 = tc[1]; 524 tc254 = ((tc4 * 5 + 1) >> 1); 525 cmp0 = (v2i64) __msa_fill_h(tc0 << 1); 526 cmp1 = (v2i64) __msa_fill_h(tc4 << 1); 527 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); 528 529 TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, 530 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src, 531 q0_src, q1_src, q2_src, q3_src); 532 533 flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 && 534 abs(p3[-1] - p3[0]) < tc250; 535 flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 && 536 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 && 537 (d30 << 1) < beta20); 538 cmp0 = __msa_fill_d(flag0); 539 ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src, 540 p3_src, p2_src, p1_src, p0_src); 541 542 flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 && 543 abs(p1[-1] - p1[0]) < tc254; 544 flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 && 545 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 && 546 (d34 << 1) < beta20); 547 ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src, 548 q0_src, q1_src, q2_src, q3_src); 549 550 cmp1 = __msa_fill_d(flag1); 551 cmp2 = __msa_ilvev_d(cmp1, cmp0); 552 cmp2 = __msa_ceqi_d(cmp2, 0); 553 554 if (flag0 && flag1) { /* strong only */ 555 /* strong filter */ 556 tc_neg = -tc_pos; 557 558 /* p part */ 559 temp0 = (p1_src + p0_src + q0_src); 560 561 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; 562 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 563 temp2 = (v8i16) (temp1 - p2_src); 564 CLIP_SH(temp2, tc_neg, tc_pos); 565 dst0 = (v16u8) (temp2 + (v8i16) p2_src); 566 567 temp1 = temp0 + p2_src; 568 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 569 temp2 = (v8i16) (temp1 - p1_src); 570 CLIP_SH(temp2, tc_neg, tc_pos); 571 dst1 = (v16u8) (temp2 + (v8i16) p1_src); 572 573 temp1 = (temp0 << 1) + p2_src + q1_src; 574 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 575 temp2 = (v8i16) (temp1 - p0_src); 576 
CLIP_SH(temp2, tc_neg, tc_pos); 577 dst2 = (v16u8) (temp2 + (v8i16) p0_src); 578 579 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); 580 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); 581 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); 582 583 /* q part */ 584 temp0 = (q1_src + p0_src + q0_src); 585 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; 586 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 587 temp2 = (v8i16) (temp1 - q2_src); 588 CLIP_SH(temp2, tc_neg, tc_pos); 589 dst5 = (v16u8) (temp2 + (v8i16) q2_src); 590 591 temp1 = temp0 + q2_src; 592 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 593 temp2 = (v8i16) (temp1 - q1_src); 594 CLIP_SH(temp2, tc_neg, tc_pos); 595 dst4 = (v16u8) (temp2 + (v8i16) q1_src); 596 597 temp1 = (temp0 << 1) + p1_src + q2_src; 598 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 599 temp2 = (v8i16) (temp1 - q0_src); 600 CLIP_SH(temp2, tc_neg, tc_pos); 601 dst3 = (v16u8) (temp2 + (v8i16) q0_src); 602 603 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); 604 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec); 605 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); 606 /* strong filter ends */ 607 } else if (flag0 == flag1) { /* weak only */ 608 /* weak filter */ 609 tc_pos >>= 1; 610 tc_neg = -tc_pos; 611 612 diff0 = (v8i16) (q0_src - p0_src); 613 diff1 = (v8i16) (q1_src - p1_src); 614 diff0 = (diff0 << 3) + diff0; 615 diff1 = (diff1 << 1) + diff1; 616 delta0 = diff0 - diff1; 617 delta0 = __msa_srari_h(delta0, 4); 618 619 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); 620 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); 621 abs_delta0 = (v8u16) abs_delta0 < temp1; 622 623 CLIP_SH(delta0, tc_neg, tc_pos); 624 temp2 = (v8i16) (delta0 + p0_src); 625 CLIP_SH_0_255(temp2); 626 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, 627 (v16u8) p_is_pcm_vec); 628 629 temp2 = (v8i16) (q0_src - delta0); 630 CLIP_SH_0_255(temp2); 631 temp2 = 
(v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 632 (v16u8) q_is_pcm_vec); 633 634 tmp = ((beta + (beta >> 1)) >> 3); 635 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp)); 636 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp)); 637 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 638 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); 639 640 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp)); 641 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp)); 642 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 643 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); 644 645 tc_pos >>= 1; 646 tc_neg = -tc_pos; 647 648 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); 649 delta1 -= (v8i16) p1_src; 650 delta1 += delta0; 651 delta1 >>= 1; 652 CLIP_SH(delta1, tc_neg, tc_pos); 653 delta1 = (v8i16) p1_src + (v8i16) delta1; 654 CLIP_SH_0_255(delta1); 655 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, 656 (v16u8) p_is_pcm_vec); 657 658 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src); 659 delta2 = delta2 - (v8i16) q1_src; 660 delta2 = delta2 - delta0; 661 delta2 = delta2 >> 1; 662 CLIP_SH(delta2, tc_neg, tc_pos); 663 delta2 = (v8i16) q1_src + (v8i16) delta2; 664 CLIP_SH_0_255(delta2); 665 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, 666 (v16u8) q_is_pcm_vec); 667 668 dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, 669 (v16u8) abs_delta0); 670 dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, 671 (v16u8) abs_delta0); 672 dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 673 (v16u8) abs_delta0); 674 dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, 675 (v16u8) abs_delta0); 676 /* weak filter ends */ 677 678 dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3); 679 dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3); 680 dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3); 681 dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3); 682 683 PCKEV_B2_UB(dst2, dst0, dst3, dst1, dst0, dst1); 684 685 /* transpose */ 686 
ILVRL_B2_UB(dst1, dst0, dst4, dst5); 687 ILVRL_H2_UB(dst5, dst4, dst0, dst1); 688 689 src += 2; 690 691 tmp2 = __msa_copy_u_w((v4i32) dst0, 0); 692 tmp3 = __msa_copy_u_w((v4i32) dst0, 1); 693 SW(tmp2, src); 694 src += stride; 695 SW(tmp3, src); 696 src += stride; 697 698 tmp2 = __msa_copy_u_w((v4i32) dst0, 2); 699 tmp3 = __msa_copy_u_w((v4i32) dst0, 3); 700 SW(tmp2, src); 701 src += stride; 702 SW(tmp3, src); 703 src += stride; 704 705 tmp2 = __msa_copy_u_w((v4i32) dst1, 0); 706 tmp3 = __msa_copy_u_w((v4i32) dst1, 1); 707 SW(tmp2, src); 708 src += stride; 709 SW(tmp3, src); 710 src += stride; 711 712 tmp2 = __msa_copy_u_w((v4i32) dst1, 2); 713 tmp3 = __msa_copy_u_w((v4i32) dst1, 3); 714 SW(tmp2, src); 715 src += stride; 716 SW(tmp3, src); 717 718 return; 719 } else { /* strong + weak */ 720 /* strong filter */ 721 tc_neg = -tc_pos; 722 723 /* p part */ 724 temp0 = (p1_src + p0_src + q0_src); 725 726 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; 727 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 728 temp2 = (v8i16) (temp1 - p2_src); 729 CLIP_SH(temp2, tc_neg, tc_pos); 730 dst0 = (v16u8) (temp2 + (v8i16) p2_src); 731 732 temp1 = temp0 + p2_src; 733 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 734 temp2 = (v8i16) (temp1 - p1_src); 735 CLIP_SH(temp2, tc_neg, tc_pos); 736 dst1 = (v16u8) (temp2 + (v8i16) p1_src); 737 738 temp1 = (temp0 << 1) + p2_src + q1_src; 739 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 740 temp2 = (v8i16) (temp1 - p0_src); 741 CLIP_SH(temp2, tc_neg, tc_pos); 742 dst2 = (v16u8) (temp2 + (v8i16) p0_src); 743 744 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); 745 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); 746 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); 747 748 /* q part */ 749 temp0 = (q1_src + p0_src + q0_src); 750 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; 751 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 752 temp2 = (v8i16) (temp1 - q2_src); 753 CLIP_SH(temp2, 
tc_neg, tc_pos); 754 dst5 = (v16u8) (temp2 + (v8i16) q2_src); 755 756 temp1 = temp0 + q2_src; 757 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); 758 temp2 = (v8i16) (temp1 - q1_src); 759 CLIP_SH(temp2, tc_neg, tc_pos); 760 dst4 = (v16u8) (temp2 + (v8i16) q1_src); 761 762 temp1 = (temp0 << 1) + p1_src + q2_src; 763 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); 764 temp2 = (v8i16) (temp1 - q0_src); 765 CLIP_SH(temp2, tc_neg, tc_pos); 766 dst3 = (v16u8) (temp2 + (v8i16) q0_src); 767 768 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); 769 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec); 770 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec); 771 /* strong filter ends */ 772 773 /* weak filter */ 774 tc_pos >>= 1; 775 tc_neg = -tc_pos; 776 777 diff0 = (v8i16) (q0_src - p0_src); 778 diff1 = (v8i16) (q1_src - p1_src); 779 diff0 = (diff0 << 3) + diff0; 780 diff1 = (diff1 << 1) + diff1; 781 delta0 = diff0 - diff1; 782 delta0 = __msa_srari_h(delta0, 4); 783 784 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1)); 785 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); 786 abs_delta0 = (v8u16) abs_delta0 < temp1; 787 788 CLIP_SH(delta0, tc_neg, tc_pos); 789 790 temp2 = (v8i16) (delta0 + p0_src); 791 CLIP_SH_0_255(temp2); 792 temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, 793 (v16u8) p_is_pcm_vec); 794 795 temp2 = (v8i16) (q0_src - delta0); 796 CLIP_SH_0_255(temp2); 797 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 798 (v16u8) q_is_pcm_vec); 799 800 tmp = (beta + (beta >> 1)) >> 3; 801 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp)); 802 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp)); 803 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 804 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); 805 806 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp)); 807 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp)); 808 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 809 q_is_pcm_vec = 
__msa_ceqi_d(q_is_pcm_vec, 0); 810 811 tc_pos >>= 1; 812 tc_neg = -tc_pos; 813 814 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src); 815 delta1 -= (v8i16) p1_src; 816 delta1 += delta0; 817 delta1 >>= 1; 818 CLIP_SH(delta1, tc_neg, tc_pos); 819 delta1 = (v8i16) p1_src + (v8i16) delta1; 820 CLIP_SH_0_255(delta1); 821 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, 822 (v16u8) p_is_pcm_vec); 823 824 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src); 825 delta2 = delta2 - (v8i16) q1_src; 826 delta2 = delta2 - delta0; 827 delta2 = delta2 >> 1; 828 CLIP_SH(delta2, tc_neg, tc_pos); 829 delta2 = (v8i16) q1_src + (v8i16) delta2; 830 CLIP_SH_0_255(delta2); 831 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, 832 (v16u8) q_is_pcm_vec); 833 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, 834 (v16u8) abs_delta0); 835 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, 836 (v16u8) abs_delta0); 837 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, 838 (v16u8) abs_delta0); 839 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src, 840 (v16u8) abs_delta0); 841 /* weak filter ends*/ 842 843 /* select between weak or strong */ 844 dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2); 845 dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2); 846 dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2); 847 dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2); 848 dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2); 849 dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2); 850 } 851 852 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3); 853 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3); 854 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3); 855 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3); 856 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3); 857 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3); 858 859 /* pack results to 8 bit */ 860 PCKEV_B4_UB(dst2, dst0, dst3, dst1, 
dst4, dst4, dst5, dst5, dst0, dst1, 861 dst2, dst3); 862 863 /* transpose */ 864 ILVRL_B2_UB(dst1, dst0, dst4, dst5); 865 ILVRL_B2_UB(dst3, dst2, dst6, dst7); 866 ILVRL_H2_UB(dst5, dst4, dst0, dst1); 867 ILVRL_H2_UB(dst7, dst6, dst2, dst3); 868 869 src += 1; 870 871 tmp2 = __msa_copy_u_w((v4i32) dst0, 0); 872 tmp3 = __msa_copy_u_w((v4i32) dst0, 1); 873 tmp0 = __msa_copy_u_h((v8i16) dst2, 0); 874 tmp1 = __msa_copy_u_h((v8i16) dst2, 2); 875 SW(tmp2, src); 876 SH(tmp0, src + 4); 877 src += stride; 878 SW(tmp3, src); 879 SH(tmp1, src + 4); 880 src += stride; 881 882 tmp2 = __msa_copy_u_w((v4i32) dst0, 2); 883 tmp3 = __msa_copy_u_w((v4i32) dst0, 3); 884 tmp0 = __msa_copy_u_h((v8i16) dst2, 4); 885 tmp1 = __msa_copy_u_h((v8i16) dst2, 6); 886 SW(tmp2, src); 887 SH(tmp0, src + 4); 888 src += stride; 889 SW(tmp3, src); 890 SH(tmp1, src + 4); 891 src += stride; 892 893 tmp2 = __msa_copy_u_w((v4i32) dst1, 0); 894 tmp3 = __msa_copy_u_w((v4i32) dst1, 1); 895 tmp0 = __msa_copy_u_h((v8i16) dst3, 0); 896 tmp1 = __msa_copy_u_h((v8i16) dst3, 2); 897 SW(tmp2, src); 898 SH(tmp0, src + 4); 899 src += stride; 900 SW(tmp3, src); 901 SH(tmp1, src + 4); 902 src += stride; 903 904 tmp2 = __msa_copy_u_w((v4i32) dst1, 2); 905 tmp3 = __msa_copy_u_w((v4i32) dst1, 3); 906 tmp0 = __msa_copy_u_h((v8i16) dst3, 4); 907 tmp1 = __msa_copy_u_h((v8i16) dst3, 6); 908 SW(tmp2, src); 909 SH(tmp0, src + 4); 910 src += stride; 911 SW(tmp3, src); 912 SH(tmp1, src + 4); 913 } 914} 915 916static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride, 917 int32_t *tc, uint8_t *p_is_pcm, 918 uint8_t *q_is_pcm) 919{ 920 uint8_t *p1_ptr = src - (stride << 1); 921 uint8_t *p0_ptr = src - stride; 922 uint8_t *q0_ptr = src; 923 uint8_t *q1_ptr = src + stride; 924 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; 925 v8u16 p1, p0, q0, q1; 926 v8i16 tc_pos, tc_neg; 927 v16i8 zero = { 0 }; 928 v8i16 temp0, temp1, delta; 929 930 if (!(tc[0] <= 0) || !(tc[1] <= 0)) { 931 cmp0 = (v2i64) __msa_fill_h(tc[0]); 932 cmp1 = 
(v2i64) __msa_fill_h(tc[1]); 933 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); 934 tc_neg = -tc_pos; 935 936 cmp0 = __msa_fill_d(p_is_pcm[0]); 937 cmp1 = __msa_fill_d(p_is_pcm[1]); 938 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 939 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0); 940 941 cmp0 = __msa_fill_d(q_is_pcm[0]); 942 cmp1 = __msa_fill_d(q_is_pcm[1]); 943 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0); 944 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0); 945 946 p1 = LD_UH(p1_ptr); 947 p0 = LD_UH(p0_ptr); 948 q0 = LD_UH(q0_ptr); 949 q1 = LD_UH(q1_ptr); 950 951 ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1); 952 953 temp0 = (v8i16) (q0 - p0); 954 temp1 = (v8i16) (p1 - q1); 955 temp0 <<= 2; 956 temp0 += temp1; 957 delta = __msa_srari_h((v8i16) temp0, 3); 958 CLIP_SH(delta, tc_neg, tc_pos); 959 960 temp0 = (v8i16) ((v8i16) p0 + delta); 961 CLIP_SH_0_255(temp0); 962 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, 963 (v16u8) p_is_pcm_vec); 964 965 temp1 = (v8i16) ((v8i16) q0 - delta); 966 CLIP_SH_0_255(temp1); 967 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, 968 (v16u8) q_is_pcm_vec); 969 970 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0); 971 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos); 972 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos); 973 974 temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0); 975 ST_D2(temp0, 0, 1, p0_ptr, stride); 976 } 977} 978 979static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride, 980 int32_t *tc, uint8_t *p_is_pcm, 981 uint8_t *q_is_pcm) 982{ 983 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; 984 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 985 v8u16 p1, p0, q0, q1; 986 v8i16 tc_pos, tc_neg; 987 v16i8 zero = { 0 }; 988 v8i16 temp0, temp1, delta; 989 990 if (!(tc[0] <= 0) || !(tc[1] <= 0)) { 991 cmp0 = (v2i64) __msa_fill_h(tc[0]); 992 cmp1 = (v2i64) __msa_fill_h(tc[1]); 993 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0); 994 
        tc_neg = -tc_pos;

        /* all-ones mask per 64-bit half where the segment is NOT pcm */
        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        /* gather p1/p0/q0/q1 columns via an 8x4 transpose of the 8 rows
         * starting two columns left of the edge */
        src -= 2;
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        /* delta = clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc) */
        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        /* where tc <= 0 (per 64-bit half), restore the originals */
        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        /* re-interleave p0'/q0' pairs and scatter them back as 8
         * halfword stores, one per row, at the edge column */
        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);

        src += 1;
        ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
    }
}

/* HEVC SAO band filter, 4 pixels wide.
 * The 4 active band offsets (sao_offset_val[1..4]) are placed into a
 * 32-entry table spread over two vector registers (offset0/offset1),
 * rotated so they land at band positions sao_left_class..+3.  Each
 * sample's band index is its top 5 bits (>> 3); vshf_b picks the
 * offset, and a 128-bias plus saturating signed add emulates unsigned
 * clipping of sample + offset.
 * Processes 4 rows per iteration; height is assumed to be a multiple
 * of 4 (last group is handled after the loop). */
static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r;
    v16i8 offset, offset_val, mask;
    v16i8 dst0, offset0, offset1;
    v16i8 zero = { 0 };

    /* pack the four 16-bit offsets down to bytes */
    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);

    offset_val = __msa_pckev_b(offset_val, offset_val);
    /* build and rotate the 32-byte band table across offset0/offset1 */
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    /* '&' on comparison results (each 0/1) — equivalent to '&&' here */
    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += (4 * src_stride);

        /* pack the four 4-byte rows into one vector */
        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
        mask = __msa_srli_b(src0_r, 3);          /* band index per sample */
        offset = __msa_vshf_b(mask, offset1, offset0);

        src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
        dst0 = __msa_adds_s_b(src0_r, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        /* store results */
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }

    /* final group of 4 rows (loaded in advance above) */
    ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

    src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
    mask = __msa_srli_b(src0_r, 3);
    offset = __msa_vshf_b(mask, offset1, offset0);

    src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
    dst0 = __msa_adds_s_b(src0_r, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    /* store results */
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}

/* HEVC SAO band filter, 8 pixels wide.
 * Same band-table scheme as the 4-width variant; processes 4 rows
 * (two 8-byte pairs) per iteration with loads software-pipelined one
 * group ahead. */
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r, mask0, mask1;
    v16i8 offset_mask0, offset_mask1, offset_val;
    v16i8 offset0, offset1, dst0, dst1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    /* build and rotate the 32-byte band table across offset0/offset1 */
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    /* '&' on comparison results (each 0/1) — equivalent to '&&' here */
    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += src_stride << 2;

        /* pair up the 8-byte rows: (src1|src0) and (src3|src2) */
        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

        mask0 = __msa_srli_b(src0_r, 3);
        mask1 = __msa_srli_b(src1_r, 3);

        offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
        offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        XORI_B2_128_SB(src0_r, src1_r);

        dst0 = __msa_adds_s_b(src0_r, offset_mask0);
        dst1 = __msa_adds_s_b(src1_r, offset_mask1);

        XORI_B2_128_SB(dst0, dst1);

        /* store results */
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += dst_stride << 2;
    }

    /* final group of 4 rows (loaded in advance above) */
    ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

    mask0 = __msa_srli_b(src0_r, 3);
    mask1 = __msa_srli_b(src1_r, 3);

    offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
    offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

    XORI_B2_128_SB(src0_r, src1_r);

    dst0 = __msa_adds_s_b(src0_r, offset_mask0);
    dst1 = __msa_adds_s_b(src1_r, offset_mask1);

    XORI_B2_128_SB(dst0, dst1);

    /* store results */
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}

/* HEVC SAO band filter for widths that are multiples of 16.
 * Same band-table scheme; the inner loop walks 16-column strips with
 * loads pipelined one strip ahead, 4 rows at a time. */
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
                                                int32_t dst_stride,
                                                uint8_t *src,
                                                int32_t src_stride,
                                                int32_t sao_left_class,
                                                int16_t *sao_offset_val,
                                                int32_t width, int32_t height)
{
    int32_t w_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 out0, out1, out2, out3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
    v16i8 offset0, offset1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    /* build and rotate the 32-byte band table across offset0/offset1 */
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* '&' on comparison results (each 0/1) — equivalent to '&&' here */
    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    while (height > 0) {
        /* load in advance */
        LD_UB4(src, src_stride, src0, src1, src2, src3);
1204 for (w_cnt = 16; w_cnt < width; w_cnt += 16) { 1205 mask0 = __msa_srli_b((v16i8) src0, 3); 1206 mask1 = __msa_srli_b((v16i8) src1, 3); 1207 mask2 = __msa_srli_b((v16i8) src2, 3); 1208 mask3 = __msa_srli_b((v16i8) src3, 3); 1209 1210 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, 1211 tmp0, tmp1); 1212 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, 1213 tmp2, tmp3); 1214 XORI_B4_128_UB(src0, src1, src2, src3); 1215 1216 out0 = __msa_adds_s_b((v16i8) src0, tmp0); 1217 out1 = __msa_adds_s_b((v16i8) src1, tmp1); 1218 out2 = __msa_adds_s_b((v16i8) src2, tmp2); 1219 out3 = __msa_adds_s_b((v16i8) src3, tmp3); 1220 1221 /* load for next iteration */ 1222 LD_UB4(src + w_cnt, src_stride, src0, src1, src2, src3); 1223 1224 XORI_B4_128_SB(out0, out1, out2, out3); 1225 1226 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride); 1227 } 1228 1229 mask0 = __msa_srli_b((v16i8) src0, 3); 1230 mask1 = __msa_srli_b((v16i8) src1, 3); 1231 mask2 = __msa_srli_b((v16i8) src2, 3); 1232 mask3 = __msa_srli_b((v16i8) src3, 3); 1233 1234 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0, 1235 tmp1); 1236 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2, 1237 tmp3); 1238 XORI_B4_128_UB(src0, src1, src2, src3); 1239 1240 out0 = __msa_adds_s_b((v16i8) src0, tmp0); 1241 out1 = __msa_adds_s_b((v16i8) src1, tmp1); 1242 out2 = __msa_adds_s_b((v16i8) src2, tmp2); 1243 out3 = __msa_adds_s_b((v16i8) src3, tmp3); 1244 1245 XORI_B4_128_SB(out0, out1, out2, out3); 1246 1247 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride); 1248 1249 src += src_stride << 2; 1250 dst += dst_stride << 2; 1251 height -= 4; 1252 } 1253} 1254 1255static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst, 1256 int32_t dst_stride, 1257 uint8_t *src, 1258 int32_t src_stride, 1259 int16_t *sao_offset_val, 1260 int32_t height) 1261{ 1262 uint32_t dst_val0, dst_val1; 1263 v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11; 
    /* maps (sign(left) + sign(right) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 sao_offset = LD_SB(sao_offset_val);
    v16i8 src_plus10, offset, src0, dst0;
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src -= 1;   /* point at the left neighbour column */

    /* load in advance */
    LD_UB2(src, src_stride, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += (2 * src_stride);

        /* pack the two rows into one vector; shifted copies give the
         * centre (src0) and right-neighbour (src_plus10) samples */
        src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
                                            (v2i64) src_minus10);

        src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
        src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);

        /* sign(centre - left) as 0xFF/1/0, folded into diff_minus10 */
        cmp_minus10 = ((v16u8) src0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        /* sign(centre - right) likewise */
        cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

        /* load in advance */
        LD_UB2(src, src_stride, src_minus10, src_minus11);

        /* edge_idx lookup, then offset-table lookup, in place */
        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
        dst0 = __msa_adds_s_b(src0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;
    }

    /* final pair of rows (loaded in advance above) */
    src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
                                        (v2i64) src_minus10);

    src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
    src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);

    cmp_minus10 = ((v16u8) src0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
    dst0 = __msa_adds_s_b(src0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
}

/* HEVC SAO edge filter, class 0 (horizontal / 0-degree), 8 pixels wide.
 * Same scheme as the 4-width variant, two rows per iteration with
 * 64-bit stores. */
static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    uint64_t dst_val0, dst_val1;
    /* maps (sign(left) + sign(right) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, diff_minus10, diff_minus11;
    v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src -= 1;   /* point at the left neighbour column */

    /* load in advance */
    LD_UB2(src, src_stride, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += (src_stride << 1);

        /* centre and right-neighbour views of the two rows */
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
        /* pack both rows into single vectors: left, centre, right */
        PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
                    src_minus10, src_plus10);
        src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);

        /* sign(centre - left) as 0xFF/1/0 */
        cmp_minus10 = (src0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < src0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        /* sign(centre - right) */
        cmp_minus10 = (src0 == src_plus10);
        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_plus10 < src0);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

        /* load in advance */
        LD_UB2(src, src_stride, src_minus10, src_minus11);

        /* edge_idx lookup then offset-table lookup, in place */
        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        src0 = __msa_xori_b(src0, 128);
        dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
        dst0 = __msa_xori_b(dst0, 128);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    /* final pair of rows (loaded in advance above) */
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);

    PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
                src_plus10);
    src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);

    cmp_minus10 = ((v16u8) src0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus10 = (src0 == src_plus10);
    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_plus10 < src0);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    src0 = __msa_xori_b(src0, 128);
    dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
    dst0 = __msa_xori_b(dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

/* HEVC SAO edge filter, class 0 (horizontal), widths multiple of 16.
 * Processes 4 rows x 16 columns per inner iteration; the left-neighbour
 * vectors are carried across column strips in src_minus1x. */
static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t v_cnt;
    /* maps (sign(left) + sign(right) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 sao_offset;
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_minus1 = src - 1;
        LD_UB4(src_minus1, src_stride,
               src_minus10, src_minus11, src_minus12, src_minus13);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus1 += 16;
            dst_ptr = dst + v_cnt;
            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);

            /* centre and right-neighbour views built by shifting the
             * next strip into the current left-aligned one */
            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
                       src12, src_minus12, src13, src_minus13, 1,
                       src_zero0, src_zero1, src_zero2, src_zero3);
            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
                       src12, src_minus12, src13, src_minus13, 2,
                       src_plus10, src_plus11, src_plus12, src_plus13);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);

            /* start from sign = -1 (0xFF) wherever samples differ */
            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);

            /* overwrite with +1 wherever the neighbour is smaller */
            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            /* edge_idx lookup then offset-table lookup per row */
            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
                       offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
                       offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
                       offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
                       offset_mask3, offset_mask3, offset_mask3);

            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            /* carry this strip as the next strip's left neighbours,
             * interleaved with the stores */
            src_minus10 = src10;
            ST_UB(dst0, dst_ptr);
            src_minus11 = src11;
            ST_UB(dst1, dst_ptr + dst_stride);
            src_minus12 = src12;
            ST_UB(dst2, dst_ptr + (dst_stride << 1));
            src_minus13 = src13;
            ST_UB(dst3, dst_ptr + (dst_stride * 3));
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

/* HEVC SAO edge filter, class 1 (vertical / 90-degree), 4 pixels wide.
 * Each sample is compared against the rows above and below; the two
 * rows of a pair are interleaved so one hadd_u_h collapses both sign
 * terms into the edge index. */
static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint32_t dst_val0, dst_val1;
    /* maps (sign(above) + sign(below) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 dst0;
    v16i8 sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v16i8 src_zero0, src_zero1;
    v16i8 offset;
    v8i16 offset_mask0, offset_mask1;

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    /* load in advance */
    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += (src_stride << 1);

        /* interleave (above,below) pairs and duplicate the centre rows */
        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        /* horizontal add sums the interleaved sign pair per sample */
        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;
        /* load in advance */
        LD_UB2(src + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
    }

    /* final pair of rows (loaded in advance above) */
    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
               offset, offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
}

/* HEVC SAO edge filter, class 1 (vertical / 90-degree), 8 pixels wide.
 * Same interleave-and-hadd scheme as the 4-width variant, with 64-bit
 * stores per row. */
static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint64_t dst_val0, dst_val1;
    /* maps (sign(above) + sign(below) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 src_zero0, src_zero1, dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v8i16 offset_mask0, offset_mask1;

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    /* load in advance */
    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += (src_stride << 1);

        /* interleave (above,below) pairs and duplicate the centre rows */
        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        /* horizontal add sums the interleaved sign pair per sample */
        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
                   offset, offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    /* final pair of rows (loaded in advance above) */
    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

/* HEVC SAO edge filter, class 1 (vertical), widths multiple of 16.
 * Walks 16-column strips; within a strip, 4 rows per iteration with
 * the above/below rows rolled through src_minus10/src_minus11. */
static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    /* maps (sign(above) + sign(below) + 2) -> SAO edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, dst2, src13, dst3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);

            /* equality of each centre row with its above/below rows.
             * NOTE(review): cmp_plus10 and cmp_minus11 compute the same
             * comparison (vector == is symmetric) — redundant but
             * harmless; likewise cmp_plus11/cmp_minus12 etc. */
            cmp_minus10 = (src_minus11 == src_minus10);
            cmp_plus10 = (src_minus11 == src10);
            cmp_minus11 = (src10 == src_minus11);
            cmp_plus11 = (src10 == src11);
            cmp_minus12 = (src11 == src10);
            cmp_plus12 = (src11 == src12);
            cmp_minus13 = (src12 == src11);
            cmp_plus13 = (src12 == src13);

            /* start from sign = -1 (0xFF) wherever samples differ */
            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            /* less-than comparisons pick out the +1 sign cases */
            cmp_minus10 = (src_minus10 < src_minus11);
            cmp_plus10 = (src10 < src_minus11);
            cmp_minus11 = (src_minus11 < src10);
            cmp_plus11 = (src11 < src10);
            cmp_minus12 = (src10 < src11);
            cmp_plus12 = (src12 < src11);
            cmp_minus13 = (src11 < src12);
            cmp_plus13 = (src13 < src12);

            /* overwrite with +1 wherever the neighbour is smaller */
            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            /* edge_idx lookup then offset-table lookup per row */
            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            /* keep src12 unbiased as next iteration's row above */
            src_minus10 = src12;
            XORI_B4_128_UB(src_minus11, src10, src11, src12);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
            src_minus11 = src13;   /* next iteration's centre row */

            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);

            src += (src_stride << 2);
            dst += (dst_stride << 2);
        }
    }
}

/* HEVC SAO edge filter, class 2 (45-degree diagonal), 4 pixels wide.
 * Neighbours are above-left and below-right, obtained by shifting the
 * rows above/below by one column relative to the centre; the rest of
 * the machinery matches the 90-degree 4-width variant. */
static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    uint32_t dst_val0, dst_val1;
    /* maps (sign(above-left) + sign(below-right) + 2) -> edge category */
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus11, src10, src11;
    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    src_orig = src - 1;   /* step back one column for the left diagonal */

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        /* centre samples (shift 1) and below-right samples (shift 2) */
        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);

        /* interleave (above-left, below-right) and duplicate centres */
        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        /* horizontal add folds the two interleaved neighbour signs together;
         * +2 yields the edge class in [0..4] */
        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        /* class -> edge_idx -> signalled offset, both via byte shuffles */
        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
                   offset, offset, offset);

        /* bias by 128, saturating signed add, un-bias */
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        /* store two 4-byte rows */
        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
    }

    /* epilogue: same computation for the final pair of rows */
    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);

    ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
}

/* HEVC SAO edge-offset filter, 45-degree diagonal class, 8-pixel width.
 * Same scheme as the 4-width variant but stores 8-byte rows. */
static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    /* shift left by one so diagonal neighbours come from byte shifts */
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);

        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);
        /* per-byte sign of (centre - neighbour): 0 / 1 / 0xff(-1) */
        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        /* fold both neighbour signs; +2 gives the edge class in [0..4] */
        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        /* bias by 128, saturating signed add, un-bias */
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    /* epilogue: final pair of rows */
    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
    ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    /* NOTE(review): these two copies and the load below are dead after the
     * epilogue (nothing reads src10/src11 again) — kept for byte-identity */
    src_minus10 = src10;
    src_minus11 = src11;

    /* load in advance */
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

/* HEVC SAO edge-offset filter, 45-degree diagonal class, for widths that are
 * a multiple of 16.  Processes the picture in 4-row bands, 16 columns at a
 * time; diagonal neighbours are synthesised with sldi byte shifts across
 * adjacent 16-byte loads. */
static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13, src_minus14, src_plus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
    v16i8 src_zero3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        /* rows 0..3 starting one pixel left of the block */
        LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
               src_minus14);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            /* upper-left neighbour row and the next 16 columns of rows 0..3 */
            src_minus10 = LD_UB(src_orig - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            /* lower-right neighbour row for the last of the four rows */
            src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
            src_orig += 16;

            /* centre pixels: shift each row pair left by one byte */
            SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
                       src12, src_minus13, src13, src_minus14, 1,
                       src_zero0, src_zero1, src_zero2, src_zero3);
            /* lower-right neighbours: shift by two bytes */
            SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
                       src_plus11);

            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            /* diff_* = ~cmp_*: 0xff where different, 0 where equal */
            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            /* resolve to 0 / 1 / -1 per byte */
            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            /* edge class [0..4] -> offset, via two shuffles each */
            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            /* bias, saturating add, un-bias */
            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            /* slide the window 16 columns right */
            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

/* HEVC SAO edge-offset filter, 135-degree diagonal class, 4-pixel width.
 * Neighbours are upper-right (minus) and lower-left (plus); the final row
 * pair is handled after the loop. */
static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    uint32_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 src_zero0, src_zero1, dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        /* centre (shift 1) and upper-right neighbours (shift 2) */
        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);

        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
    }

    /* epilogue: final pair of rows */
    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);

    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
    dst += dst_stride;
}

/* HEVC SAO edge-offset filter, 135-degree diagonal class, 8-pixel width. */
static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_zero1, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        /* per-byte sign of (centre - neighbour): 0 / 1 / 0xff(-1) */
        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        /* fold both neighbour signs; +2 gives the edge class in [0..4] */
        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        /* bias by 128, saturating signed add, un-bias */
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);

        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    /* epilogue: final pair of rows */
    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);

    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
    dst += dst_stride;
}

/* HEVC SAO edge-offset filter, 135-degree diagonal class, for widths that
 * are a multiple of 16.  Processes 4-row bands, 16 columns at a time;
 * upper-right/lower-left neighbours come from sldi shifts of adjacent
 * 16-byte loads. */
static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *
                                                          sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;

        /* rows 0..3: row N's load doubles as row N+1's lower-left source */
        LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
               src_plus12);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            /* upper-right neighbour row (+2 to undo the -1 and step right) */
            src_minus10 = LD_UB(src_orig + 2 - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            /* lower-left neighbour row for the last of the four rows */
            src_plus13 = LD_UB(src_orig + (src_stride << 2));
            src_orig += 16;

            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);

            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
                                               (v16i8) src_minus11, 2);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);

            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);

            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            /* diff_* = ~cmp_*: 0xff where different, 0 where equal */
            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            /* neighbour < centre tests */
            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            /* resolve each byte to 0 / 1 / 0xff(-1) */
            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            /* edge class in [0..4]; translate to offsets via two shuffles */
            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            /* bias by 128, saturating signed add, un-bias */
            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            /* slide the window 16 columns right */
            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

/* Public entry point: horizontal luma deblocking (8-bit). */
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
}

/* Public entry point: vertical luma deblocking (8-bit). */
void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
}

/* Public entry point: horizontal chroma deblocking (8-bit). */
void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
}

/* Public entry point: vertical chroma deblocking (8-bit). */
void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
}

/* SAO band filter dispatcher (8-bit): handles the widest 16-multiple slice
 * first, then an 8-wide slice, then any 4-wide remainder. */
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                     int16_t *sao_offset_val, int sao_left_class,
                                     int width, int height)
{
    if (width >> 4) {
        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
                                            sao_left_class, sao_offset_val,
                                            width - (width % 16), height);
        dst += width - (width % 16);
        src += width - (width % 16);
        width %= 16;
    }

    if (width >> 3) {
        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
        dst += 8;
        src += 8;
        width %= 8;
    }

    if (width) {
        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
    }
}

/* SAO edge filter dispatcher (8-bit).  eo selects the edge-offset class
 * (0 = horizontal, 1 = vertical, 2 = 45 degree, 3 = 135 degree); each case
 * splits the width into 16-multiple / 8 / 4 slices like the band filter.
 * NOTE(review): eo values outside 0..3 fall through with no filtering —
 * presumably callers only pass 0..3; confirm against the caller. */
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    /* NOTE(review): assumes src sits in the decoder's fixed-stride
     * intermediate SAO buffer — confirm against the caller */
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width % 16),
                                                        height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width % 16),
                                                          height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}