/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *                Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hevcdsp_lsx.h"

void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    ptrdiff_t stride_2x = (stride << 1);
    ptrdiff_t stride_4x = (stride << 2);
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t *p3 = src - stride_4x;
    uint8_t *p2 = src - stride_3x;
    uint8_t *p1 = src - stride_2x;
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + stride_2x;
    uint8_t *q3 = src + stride_3x;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;

    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
    __m128i temp0, temp1;
    __m128i temp2, tc_pos, tc_neg;
    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
    __m128i zero = {0};
    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;

    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;
    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
    cmp3 = __lsx_vseqi_w(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
                  p3_src, p2_src, p1_src, p0_src);
        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
        tc4 = tc[1];
        tc254 = (((tc4 << 2) +
tc4 + 1) >> 1); 96 97 DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1); 98 DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero, 99 p0_src, p3_src, p2_src, p1_src, p0_src); 100 DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0, 101 q0_src, q1_src, q2_src, q3_src); 102 flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 && 103 abs(p0[0] - q0[0]) < tc250; 104 flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 && 105 abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 && 106 (d30 << 1) < beta20); 107 tc_pos = __lsx_vpackev_d(cmp1, cmp0); 108 DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, 109 zero, q3_src, q0_src, q1_src, q2_src, q3_src); 110 111 flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 && 112 abs(p0[4] - q0[4]) < tc254; 113 flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 && 114 abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 && 115 (d34 << 1) < beta20); 116 DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1); 117 cmp2 = __lsx_vpackev_w(cmp1, cmp0); 118 cmp2 = __lsx_vseqi_w(cmp2, 0); 119 120 if (flag0 && flag1) { /* strong only */ 121 /* strong filter */ 122 tc_pos = __lsx_vslli_h(tc_pos, 1); 123 tc_neg = __lsx_vneg_h(tc_pos); 124 125 /* p part */ 126 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, 127 temp0, temp0); 128 temp1 = __lsx_vadd_h(p3_src, p2_src); 129 temp1 = __lsx_vslli_h(temp1, 1); 130 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); 131 temp1 = __lsx_vsrari_h(temp1, 3); 132 temp2 = __lsx_vsub_h(temp1, p2_src); 133 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 134 dst0 = __lsx_vadd_h(temp2, p2_src); 135 136 temp1 = __lsx_vadd_h(temp0, p2_src); 137 temp1 = __lsx_vsrari_h(temp1, 2); 138 temp2 = __lsx_vsub_h(temp1, p1_src); 139 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 140 dst1 = __lsx_vadd_h(temp2, p1_src); 141 142 temp1 = __lsx_vslli_h(temp0, 1); 143 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, 144 temp1, temp1); 145 temp1 = __lsx_vsrari_h(temp1, 3); 146 temp2 = __lsx_vsub_h(temp1, p0_src); 147 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 148 dst2 = __lsx_vadd_h(temp2, p0_src); 149 150 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); 151 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, 152 p1_src, p_is_pcm_vec, dst0, dst1); 153 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); 154 155 /* q part */ 156 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, 157 temp0, temp0); 158 temp1 = __lsx_vadd_h(q3_src, q2_src); 159 temp1 = __lsx_vslli_h(temp1, 1); 160 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); 161 temp1 = __lsx_vsrari_h(temp1, 3); 162 temp2 = __lsx_vsub_h(temp1, q2_src); 163 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 164 dst5 = __lsx_vadd_h(temp2, q2_src); 165 166 temp1 = __lsx_vadd_h(temp0, q2_src); 167 temp1 = __lsx_vsrari_h(temp1, 2); 168 temp2 = __lsx_vsub_h(temp1, q1_src); 169 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 170 dst4 = __lsx_vadd_h(temp2, q1_src); 171 172 temp0 = __lsx_vslli_h(temp0, 1); 173 DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src, 174 temp1, temp1); 175 temp1 = __lsx_vsrari_h(temp1, 3); 176 temp2 = __lsx_vsub_h(temp1, q0_src); 177 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 178 dst3 = __lsx_vadd_h(temp2, q0_src); 179 180 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); 181 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, 182 q1_src, q_is_pcm_vec, dst3, dst4); 183 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); 184 185 /* pack results 
to 8 bit */ 186 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); 187 dst2 = __lsx_vpickev_b(dst5, dst4); 188 189 /* pack src to 8 bit */ 190 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src, 191 dst3, dst4); 192 dst5 = __lsx_vpickev_b(q2_src, q1_src); 193 194 cmp3 = __lsx_vnor_v(cmp3, cmp3); 195 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3, 196 dst0, dst1); 197 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3); 198 199 __lsx_vstelm_d(dst0, p2, 0, 0); 200 __lsx_vstelm_d(dst0, p2 + stride, 0, 1); 201 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); 202 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); 203 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0); 204 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1); 205 /* strong filter ends */ 206 } else if (flag0 == flag1) { /* weak only */ 207 /* weak filter */ 208 tc_neg = __lsx_vneg_h(tc_pos); 209 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, 210 diff0, diff1); 211 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, 212 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); 213 delta0 = __lsx_vsub_h(diff0, diff1); 214 delta0 = __lsx_vsrari_h(delta0, 4); 215 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), 216 __lsx_vslli_h(tc_pos, 1)); 217 abs_delta0 = __lsx_vadda_h(delta0, zero); 218 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); 219 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 220 221 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); 222 temp2 = __lsx_vadd_h(delta0, p0_src); 223 temp2 = __lsx_vclip255_h(temp2); 224 temp0 = __lsx_vbitsel_v(temp2, p0_src, 225 __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec)); 226 temp2 = __lsx_vsub_h(q0_src, delta0); 227 temp2 = __lsx_vclip255_h(temp2); 228 temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec, 229 q_is_pcm_vec)); 230 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec, 231 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec); 232 233 tmp = (beta + (beta >> 1)) >> 3; 234 DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp, 235 cmp0, cmp1); 236 cmp0 = __lsx_vpackev_d(cmp1, cmp0); 237 cmp0 = __lsx_vseqi_d(cmp0, 0); 238 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0); 239 240 DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp, 241 cmp0, cmp1); 242 cmp0 = __lsx_vpackev_d(cmp1, cmp0); 243 cmp0 = __lsx_vseqi_d(cmp0, 0); 244 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0); 245 tc_pos = __lsx_vsrai_h(tc_pos, 1); 246 tc_neg = __lsx_vneg_h(tc_pos); 247 248 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, 249 delta1, delta2); 250 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, 251 delta1, delta2); 252 delta1 = __lsx_vadd_h(delta1, delta0); 253 delta2 = __lsx_vsub_h(delta2, delta0); 254 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); 255 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, 256 tc_neg, tc_pos, delta1, delta2); 257 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, 258 delta1, delta2); 259 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); 260 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, 261 q1_src, q_is_pcm_vec, delta1, delta2); 262 263 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 264 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0, 265 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2, 266 q1_src, abs_delta0, dst1, dst2, dst3, dst4); 267 /* pack results to 8 bit */ 268 DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1); 269 /* pack src to 8 bit */ 270 DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src, 271 dst2, dst3); 
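            /* cmp3 was built from the (d00 + d30) >= beta and
             * (d04 + d34) >= beta decisions; after the vnor below it is
             * all-ones for a half-block where filtering is disabled, so
             * the bitsel keeps the original packed pixels there. */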
272 cmp3 = __lsx_vnor_v(cmp3, cmp3); 273 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3, 274 dst0, dst1); 275 276 p2 += stride; 277 __lsx_vstelm_d(dst0, p2, 0, 0); 278 __lsx_vstelm_d(dst0, p2 + stride, 0, 1); 279 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); 280 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); 281 /* weak filter ends */ 282 } else { /* strong + weak */ 283 /* strong filter */ 284 tc_pos = __lsx_vslli_h(tc_pos, 1); 285 tc_neg = __lsx_vneg_h(tc_pos); 286 287 /* p part */ 288 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, 289 temp0, temp0); 290 temp1 = __lsx_vadd_h(p3_src, p2_src); 291 temp1 = __lsx_vslli_h(temp1, 1); 292 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); 293 temp1 = __lsx_vsrari_h(temp1, 3); 294 temp2 = __lsx_vsub_h(temp1, p2_src); 295 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 296 dst0 = __lsx_vadd_h(temp2, p2_src); 297 298 temp1 = __lsx_vadd_h(temp0, p2_src); 299 temp1 = __lsx_vsrari_h(temp1, 2); 300 temp2 = __lsx_vsub_h(temp1, p1_src); 301 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 302 dst1 = __lsx_vadd_h(temp2, p1_src); 303 304 temp1 = __lsx_vslli_h(temp0, 1); 305 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); 306 temp1 = __lsx_vsrari_h(temp1, 3); 307 temp2 = __lsx_vsub_h(temp1, p0_src); 308 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 309 dst2 = __lsx_vadd_h(temp2, p0_src); 310 311 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); 312 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, 313 p1_src, p_is_pcm_vec, dst0, dst1); 314 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); 315 316 /* q part */ 317 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, 318 temp0, temp0); 319 temp1 = __lsx_vadd_h(q3_src, q2_src); 320 temp1 = __lsx_vslli_h(temp1, 1); 321 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); 322 temp1 = __lsx_vsrari_h(temp1, 3); 323 temp2 = __lsx_vsub_h(temp1, q2_src); 324 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 325 dst5 = __lsx_vadd_h(temp2, q2_src); 326 327 temp1 = __lsx_vadd_h(temp0, q2_src); 328 temp1 = __lsx_vsrari_h(temp1, 2); 329 temp2 = __lsx_vsub_h(temp1, q1_src); 330 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 331 dst4 = __lsx_vadd_h(temp2, q1_src); 332 333 temp1 = __lsx_vslli_h(temp0, 1); 334 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); 335 temp1 = __lsx_vsrari_h(temp1, 3); 336 temp2 = __lsx_vsub_h(temp1, q0_src); 337 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 338 dst3 = __lsx_vadd_h(temp2, q0_src); 339 340 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); 341 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, 342 q1_src, q_is_pcm_vec, dst3, dst4); 343 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); 344 345 /* pack strong results to 8 bit */ 346 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1); 347 dst2 = __lsx_vpickev_b(dst5, dst4); 348 /* strong filter ends */ 349 350 /* weak filter */ 351 tc_pos = __lsx_vsrai_h(tc_pos, 1); 352 tc_neg = __lsx_vneg_h(tc_pos); 353 354 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, 355 diff0, diff1); 356 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, 357 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); 358 delta0 = __lsx_vsub_h(diff0, diff1); 359 delta0 = __lsx_vsrari_h(delta0, 4); 360 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), 361 __lsx_vslli_h(tc_pos, 1)); 362 abs_delta0 = __lsx_vadda_h(delta0, zero); 363 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); 364 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 365 
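            /* delta0 computed above is the HEVC weak-filter term
             * (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4, and abs_delta0
             * flags the positions where |delta0| < 10 * tc; below it is
             * clipped to [-tc, tc] and applied to p0 and q0. */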
366 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); 367 temp2 = __lsx_vadd_h(delta0, p0_src); 368 temp2 = __lsx_vclip255_h(temp2); 369 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); 370 371 temp2 = __lsx_vsub_h(q0_src, delta0); 372 temp2 = __lsx_vclip255_h(temp2); 373 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); 374 375 tmp = (beta + (beta >> 1)) >> 3; 376 DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp, 377 cmp0, cmp1); 378 cmp0 = __lsx_vpackev_d(cmp1, cmp0); 379 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0)); 380 DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp, 381 cmp0, cmp1); 382 cmp0 = __lsx_vpackev_d(cmp1, cmp0); 383 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0)); 384 385 tc_pos = __lsx_vsrai_h(tc_pos, 1); 386 tc_neg = __lsx_vneg_h(tc_pos); 387 388 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, 389 delta1, delta2); 390 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, 391 delta1, delta2); 392 delta1 = __lsx_vadd_h(delta1, delta0); 393 delta2 = __lsx_vsub_h(delta2, delta0); 394 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); 395 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, 396 tc_pos, delta1, delta2); 397 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, 398 delta1, delta2); 399 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); 400 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, 401 q1_src, q_is_pcm_vec, delta1, delta2); 402 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 403 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2, 404 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2, 405 q0_src, abs_delta0, delta1, delta2, temp0, temp2); 406 /* weak filter ends */ 407 408 /* pack weak results to 8 bit */ 409 DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0, 410 dst3, dst4); 411 dst5 = __lsx_vpickev_b(q2_src, delta2); 412 413 /* select between weak or strong */ 414 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2, 415 dst0, dst1); 416 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2); 417 418 /* pack src to 8 bit */ 419 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src, 420 dst3, dst4); 421 dst5 = __lsx_vpickev_b(q2_src, q1_src); 422 423 cmp3 = __lsx_vnor_v(cmp3, cmp3); 424 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3, 425 dst0, dst1); 426 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3); 427 428 __lsx_vstelm_d(dst0, p2, 0, 0); 429 __lsx_vstelm_d(dst0, p2 + stride, 0, 1); 430 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0); 431 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1); 432 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0); 433 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1); 434 } 435 } 436} 437 438void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride, 439 int32_t beta, int32_t *tc, 440 uint8_t *p_is_pcm, uint8_t *q_is_pcm) 441{ 442 ptrdiff_t stride_2x = (stride << 1); 443 ptrdiff_t stride_4x = (stride << 2); 444 ptrdiff_t stride_3x = stride_2x + stride; 445 uint8_t *p3 = src; 446 uint8_t *p2 = src + stride_3x; 447 uint8_t *p1 = src + stride_4x; 448 uint8_t *p0 = src + stride_4x + stride_3x; 449 uint8_t flag0, flag1; 450 int32_t dp00, dq00, dp30, dq30, d00, d30; 451 int32_t d0030, d0434; 452 int32_t dp04, dq04, dp34, dq34, d04, d34; 453 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250; 454 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp; 455 456 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 457 __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, 
q_is_pcm_vec; 458 __m128i cmp3; 459 __m128i temp0, temp1; 460 __m128i temp2; 461 __m128i tc_pos, tc_neg; 462 __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0; 463 __m128i zero = {0}; 464 __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src; 465 466 dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]); 467 dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]); 468 dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]); 469 dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]); 470 d00 = dp00 + dq00; 471 d30 = dp30 + dq30; 472 p_is_pcm0 = p_is_pcm[0]; 473 q_is_pcm0 = q_is_pcm[0]; 474 475 dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]); 476 dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]); 477 dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]); 478 dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]); 479 d04 = dp04 + dq04; 480 d34 = dp34 + dq34; 481 p_is_pcm4 = p_is_pcm[1]; 482 q_is_pcm4 = q_is_pcm[1]; 483 484 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1); 485 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 486 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); 487 488 d0030 = (d00 + d30) >= beta; 489 d0434 = (d04 + d34) >= beta; 490 491 DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1); 492 cmp3 = __lsx_vpackev_d(cmp1, cmp0); 493 cmp3 = __lsx_vseqi_d(cmp3, 0); 494 495 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) && 496 (!d0030 || !d0434)) { 497 src -= 4; 498 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, 499 src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src); 500 src += stride_4x; 501 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, 502 src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src); 503 src -= stride_4x; 504 505 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1); 506 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 507 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); 508 509 tc0 = tc[0]; 510 beta30 = beta >> 3; 511 beta20 = beta >> 2; 512 tc250 = (((tc0 << 2) + tc0 + 1) >> 1); 513 tc4 = tc[1]; 514 tc254 = (((tc4 << 2) + tc4 + 1) >> 1); 515 DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1); 516 tc_pos = __lsx_vpackev_d(cmp1, cmp0); 517 LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, 518 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src, 519 q0_src, q1_src, q2_src, q3_src); 520 521 flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 && 522 abs(p3[-1] - p3[0]) < tc250; 523 flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 && 524 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 && 525 (d30 << 1) < beta20); 526 cmp0 = __lsx_vreplgr2vr_d(flag0); 527 DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero, 528 p0_src, p3_src, p2_src, p1_src, p0_src); 529 530 flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 && 531 abs(p1[-1] - p1[0]) < tc254; 532 flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 && 533 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 && 534 (d34 << 1) < beta20); 535 DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero, 536 q3_src, q0_src, q1_src, q2_src, q3_src); 537 538 cmp1 = __lsx_vreplgr2vr_d(flag1); 539 cmp2 = __lsx_vpackev_d(cmp1, cmp0); 540 cmp2 = __lsx_vseqi_d(cmp2, 0); 541 542 if (flag0 && flag1) { /* strong only */ 543 /* strong filter */ 544 tc_neg = __lsx_vneg_h(tc_pos); 545 /* p part */ 546 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, 547 temp0, temp0); 548 temp1 = __lsx_vadd_h(p3_src, p2_src); 549 temp1 = __lsx_vslli_h(temp1, 1); 550 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); 551 temp1 
= __lsx_vsrari_h(temp1, 3); 552 temp2 = __lsx_vsub_h(temp1, p2_src); 553 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 554 dst0 = __lsx_vadd_h(temp2, p2_src); 555 556 temp1 = __lsx_vadd_h(temp0, p2_src); 557 temp1 = __lsx_vsrari_h(temp1, 2); 558 temp2 = __lsx_vsub_h(temp1, p1_src); 559 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 560 dst1 = __lsx_vadd_h(temp2, p1_src); 561 562 temp1 = __lsx_vslli_h(temp0, 1); 563 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); 564 temp1 = __lsx_vsrari_h(temp1, 3); 565 temp2 = __lsx_vsub_h(temp1, p0_src); 566 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 567 dst2 = __lsx_vadd_h(temp2, p0_src); 568 569 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); 570 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src, 571 p_is_pcm_vec, dst0, dst1); 572 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); 573 574 /* q part */ 575 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, 576 temp0, temp0); 577 temp1 = __lsx_vadd_h(q3_src, q2_src); 578 temp1 = __lsx_vslli_h(temp1, 1); 579 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); 580 temp1 = __lsx_vsrari_h(temp1, 3); 581 temp2 = __lsx_vsub_h(temp1, q2_src); 582 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 583 dst5 = __lsx_vadd_h(temp2, q2_src); 584 585 temp1 = __lsx_vadd_h(temp0, q2_src); 586 temp1 = __lsx_vsrari_h(temp1, 2); 587 temp2 = __lsx_vsub_h(temp1, q1_src); 588 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 589 dst4 = __lsx_vadd_h(temp2, q1_src); 590 591 temp1 = __lsx_vslli_h(temp0, 1); 592 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); 593 temp1 = __lsx_vsrari_h(temp1, 3); 594 temp2 = __lsx_vsub_h(temp1, q0_src); 595 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 596 dst3 = __lsx_vadd_h(temp2, q0_src); 597 598 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); 599 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src, 600 q_is_pcm_vec, dst3, dst4); 601 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); 602 /* strong filter ends */ 603 } else if (flag0 == flag1) { /* weak only */ 604 /* weak filter */ 605 tc_pos = __lsx_vsrai_h(tc_pos, 1); 606 tc_neg = __lsx_vneg_h(tc_pos); 607 608 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, 609 diff0, diff1); 610 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, 611 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); 612 delta0 = __lsx_vsub_h(diff0, diff1); 613 delta0 = __lsx_vsrari_h(delta0, 4); 614 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), 615 __lsx_vslli_h(tc_pos, 1)); 616 abs_delta0 = __lsx_vadda_h(delta0, zero); 617 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); 618 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 619 620 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); 621 temp2 = __lsx_vadd_h(delta0, p0_src); 622 temp2 = __lsx_vclip255_h(temp2); 623 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); 624 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); 625 626 temp2 = __lsx_vsub_h(q0_src, delta0); 627 temp2 = __lsx_vclip255_h(temp2); 628 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); 629 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); 630 631 tmp = ((beta + (beta >> 1)) >> 3); 632 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp), 633 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1); 634 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 635 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); 636 637 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp), 638 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, 
cmp1); 639 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 640 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); 641 tc_pos = __lsx_vsrai_h(tc_pos, 1); 642 tc_neg = __lsx_vneg_h(tc_pos); 643 644 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, 645 delta1, delta2); 646 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, 647 delta1, delta2); 648 delta1 = __lsx_vadd_h(delta1, delta0); 649 delta2 = __lsx_vsub_h(delta2, delta0); 650 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); 651 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, 652 tc_pos, delta1, delta2); 653 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, 654 delta1, delta2); 655 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); 656 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, 657 q1_src, q_is_pcm_vec, delta1, delta2); 658 659 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 660 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0, 661 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2, 662 q1_src, abs_delta0, dst0, dst1, dst2, dst3); 663 /* weak filter ends */ 664 665 cmp3 = __lsx_vnor_v(cmp3, cmp3); 666 DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src, 667 cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3, 668 dst0, dst1, dst2, dst3); 669 DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1); 670 671 /* transpose */ 672 dst4 = __lsx_vilvl_b(dst1, dst0); 673 dst5 = __lsx_vilvh_b(dst1, dst0); 674 dst0 = __lsx_vilvl_h(dst5, dst4); 675 dst1 = __lsx_vilvh_h(dst5, dst4); 676 677 src += 2; 678 __lsx_vstelm_w(dst0, src, 0, 0); 679 __lsx_vstelm_w(dst0, src + stride, 0, 1); 680 __lsx_vstelm_w(dst0, src + stride_2x, 0, 2); 681 __lsx_vstelm_w(dst0, src + stride_3x, 0, 3); 682 src += stride_4x; 683 __lsx_vstelm_w(dst1, src, 0, 0); 684 __lsx_vstelm_w(dst1, src + stride, 0, 1); 685 __lsx_vstelm_w(dst1, src + stride_2x, 0, 2); 686 __lsx_vstelm_w(dst1, src + stride_3x, 0, 3); 687 return; 688 } else { /* strong + weak */ 689 /* strong filter */ 690 tc_neg = __lsx_vneg_h(tc_pos); 691 692 /* p part */ 693 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src, 694 temp0, temp0); 695 696 temp1 = __lsx_vadd_h(p3_src, p2_src); 697 temp1 = __lsx_vslli_h(temp1, 1); 698 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1); 699 temp1 = __lsx_vsrari_h(temp1, 3); 700 temp2 = __lsx_vsub_h(temp1, p2_src); 701 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 702 dst0 = __lsx_vadd_h(temp2, p2_src); 703 704 temp1 = __lsx_vadd_h(temp0, p2_src); 705 temp1 = __lsx_vsrari_h(temp1, 2); 706 temp2 = __lsx_vsub_h(temp1, p1_src); 707 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 708 dst1 = __lsx_vadd_h(temp2, p1_src); 709 710 temp1 = __lsx_vslli_h(temp0, 1); 711 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1); 712 temp1 = __lsx_vsrari_h(temp1, 3); 713 temp2 = __lsx_vsub_h(temp1, p0_src); 714 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 715 dst2 = __lsx_vadd_h(temp2, p0_src); 716 717 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec); 718 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src, 719 p_is_pcm_vec, dst0, dst1); 720 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec); 721 722 /* q part */ 723 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0); 724 temp1 = __lsx_vadd_h(q3_src, q2_src); 725 temp1 = __lsx_vslli_h(temp1, 1); 726 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1); 727 temp1 = __lsx_vsrari_h(temp1, 3); 728 temp2 = __lsx_vsub_h(temp1, q2_src); 729 temp2 = 
__lsx_vclip_h(temp2, tc_neg, tc_pos); 730 dst5 = __lsx_vadd_h(temp2, q2_src); 731 732 temp1 = __lsx_vadd_h(temp0, q2_src); 733 temp1 = __lsx_vsrari_h(temp1, 2); 734 temp2 = __lsx_vsub_h(temp1, q1_src); 735 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 736 dst4 = __lsx_vadd_h(temp2, q1_src); 737 738 temp1 = __lsx_vslli_h(temp0, 1); 739 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1); 740 temp1 = __lsx_vsrari_h(temp1, 3); 741 temp2 = __lsx_vsub_h(temp1, q0_src); 742 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos); 743 dst3 = __lsx_vadd_h(temp2, q0_src); 744 745 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec); 746 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src, 747 q_is_pcm_vec, dst3, dst4); 748 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec); 749 /* strong filter ends */ 750 751 /* weak filter */ 752 tc_pos = __lsx_vsrai_h(tc_pos, 1); 753 tc_neg = __lsx_vneg_h(tc_pos); 754 755 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src, 756 diff0, diff1); 757 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0, 758 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1); 759 delta0 = __lsx_vsub_h(diff0, diff1); 760 delta0 = __lsx_vsrari_h(delta0, 4); 761 762 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3), 763 __lsx_vslli_h(tc_pos, 1)); 764 abs_delta0 = __lsx_vadda_h(delta0, zero); 765 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0); 766 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 767 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos); 768 temp2 = __lsx_vadd_h(delta0, p0_src); 769 temp2 = __lsx_vclip255_h(temp2); 770 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec); 771 temp2 = __lsx_vsub_h(q0_src, delta0); 772 temp2 = __lsx_vclip255_h(temp2); 773 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec); 774 775 tmp = (beta + (beta >> 1)) >> 3; 776 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp), 777 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1); 778 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 779 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); 780 781 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp), 782 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1); 783 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 784 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); 785 tc_pos = __lsx_vsrai_h(tc_pos, 1); 786 tc_neg = __lsx_vneg_h(tc_pos); 787 788 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src, 789 delta1, delta2); 790 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src, 791 delta1, delta2); 792 delta1 = __lsx_vadd_h(delta1, delta0); 793 delta2 = __lsx_vsub_h(delta2, delta0); 794 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2); 795 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg, 796 tc_pos, delta1, delta2); 797 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2, 798 delta1, delta2); 799 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2); 800 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2, 801 q1_src, q_is_pcm_vec, delta1, delta2); 802 803 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0); 804 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2, 805 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2, 806 q0_src, abs_delta0, delta1, delta2, temp0, temp2); 807 /* weak filter ends*/ 808 809 /* select between weak or strong */ 810 DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1, 811 cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2, 812 dst0, dst1, dst2, dst3); 813 DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2, 814 dst4, dst5); 
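            /* dst0..dst5 now hold, per 8-pixel half, either the strong- or
             * the weak-filter result as chosen by cmp2; the common tail
             * below still falls back to the unfiltered pixels via cmp3. */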
        }

        cmp3 = __lsx_vnor_v(cmp3, cmp3);
        DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
                  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
        DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
                  dst4, dst5);

        /* pack results to 8 bit */
        DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
                  dst5, dst0, dst1, dst2, dst3);

        /* transpose */
        DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
        DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);

        src += 1;
        __lsx_vstelm_w(dst0, src, 0, 0);
        __lsx_vstelm_h(dst2, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 1);
        __lsx_vstelm_h(dst2, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst0, src, 0, 2);
        __lsx_vstelm_h(dst2, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst0, src, 0, 3);
        __lsx_vstelm_h(dst2, src, 4, 6);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 0);
        __lsx_vstelm_h(dst3, src, 4, 0);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 1);
        __lsx_vstelm_h(dst3, src, 4, 2);
        src += stride;

        __lsx_vstelm_w(dst1, src, 0, 2);
        __lsx_vstelm_h(dst3, src, 4, 4);
        src += stride;
        __lsx_vstelm_w(dst1, src, 0, 3);
        __lsx_vstelm_h(dst3, src, 4, 6);
    }
}

void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                        int32_t *tc, uint8_t *p_is_pcm,
                                        uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    __m128i p1, p0, q0, q1;
    __m128i tc_pos, tc_neg;
    __m128i zero = {0};
    __m128i temp0, temp1, delta;

    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
        tc_neg = __lsx_vneg_h(tc_pos);
        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);

        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);

        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
                  p1, p0, q0, q1);
        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
                  p1, p0, q0, q1);
        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
        temp0 = __lsx_vslli_h(temp0, 2);
        temp0 = __lsx_vadd_h(temp0, temp1);
        delta = __lsx_vsrari_h(temp0, 3);
        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
        temp0 = __lsx_vadd_h(p0, delta);
        temp0 = __lsx_vclip255_h(temp0);
        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);

        temp1 = __lsx_vsub_h(q0, delta);
        temp1 = __lsx_vclip255_h(temp1);
        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);

        tc_pos = __lsx_vslei_d(tc_pos, 0);
        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
                  temp0, temp1);
        temp0 = __lsx_vpickev_b(temp1, temp0);
        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
    }
}

void
ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride, 918 int32_t *tc, uint8_t *p_is_pcm, 919 uint8_t *q_is_pcm) 920{ 921 ptrdiff_t stride_2x = (stride << 1); 922 ptrdiff_t stride_4x = (stride << 2); 923 ptrdiff_t stride_3x = stride_2x + stride; 924 __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec; 925 __m128i src0, src1, src2, src3, src4, src5, src6, src7; 926 __m128i p1, p0, q0, q1; 927 __m128i tc_pos, tc_neg; 928 __m128i zero = {0}; 929 __m128i temp0, temp1, delta; 930 931 if (!(tc[0] <= 0) || !(tc[1] <= 0)) { 932 DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1); 933 tc_pos = __lsx_vpackev_d(cmp1, cmp0); 934 tc_neg = __lsx_vneg_h(tc_pos); 935 936 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1); 937 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 938 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0); 939 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1); 940 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0); 941 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0); 942 943 src -= 2; 944 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, 945 src + stride_3x, 0, src0, src1, src2, src3); 946 src += stride_4x; 947 DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0, 948 src + stride_3x, 0, src4, src5, src6, src7); 949 src -= stride_4x; 950 LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7, 951 p1, p0, q0, q1); 952 DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1, 953 p1, p0, q0, q1); 954 955 DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1); 956 temp0 = __lsx_vslli_h(temp0, 2); 957 temp0 = __lsx_vadd_h(temp0, temp1); 958 delta = __lsx_vsrari_h(temp0, 3); 959 delta = __lsx_vclip_h(delta, tc_neg, tc_pos); 960 961 temp0 = __lsx_vadd_h(p0, delta); 962 temp1 = __lsx_vsub_h(q0, delta); 963 DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1); 964 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec, 965 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec); 966 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0, 967 q_is_pcm_vec, temp0, temp1); 968 969 tc_pos = __lsx_vslei_d(tc_pos, 0); 970 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos, 971 temp0, temp1); 972 temp0 = __lsx_vpackev_b(temp1, temp0); 973 974 src += 1; 975 __lsx_vstelm_h(temp0, src, 0, 0); 976 __lsx_vstelm_h(temp0, src + stride, 0, 1); 977 __lsx_vstelm_h(temp0, src + stride_2x, 0, 2); 978 __lsx_vstelm_h(temp0, src + stride_3x, 0, 3); 979 src += stride_4x; 980 __lsx_vstelm_h(temp0, src, 0, 4); 981 __lsx_vstelm_h(temp0, src + stride, 0, 5); 982 __lsx_vstelm_h(temp0, src + stride_2x, 0, 6); 983 __lsx_vstelm_h(temp0, src + stride_3x, 0, 7); 984 src -= stride_4x; 985 } 986} 987 988static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst, 989 int32_t dst_stride, 990 uint8_t *src, 991 int32_t src_stride, 992 int16_t *sao_offset_val, 993 int32_t height) 994{ 995 const int32_t src_stride_2x = (src_stride << 1); 996 const int32_t dst_stride_2x = (dst_stride << 1); 997 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; 998 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; 999 __m128i edge_idx = {0x403000201, 0x0}; 1000 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11; 1001 __m128i sao_offset = __lsx_vld(sao_offset_val, 0); 1002 __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0; 1003 __m128i const1 = __lsx_vldi(1); 1004 __m128i zero = {0}; 1005 1006 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); 1007 src -= 1; 1008 1009 /* load in advance */ 1010 
DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11); 1011 1012 for (height -= 2; height; height -= 2) { 1013 src += src_stride_2x; 1014 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10); 1015 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1); 1016 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2); 1017 1018 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, 1019 cmp_minus10, cmp_minus11); 1020 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1021 cmp_minus11, diff_minus10, diff_minus11); 1022 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, 1023 cmp_minus10, cmp_minus11); 1024 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1025 cmp_minus11, cmp_minus10, cmp_minus11); 1026 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, 1027 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); 1028 1029 offset = __lsx_vadd_b(diff_minus10, diff_minus11); 1030 offset = __lsx_vaddi_bu(offset, 2); 1031 1032 /* load in advance */ 1033 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, 1034 src_minus10, src_minus11); 1035 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, 1036 sao_offset, sao_offset, offset, offset, offset); 1037 src0 = __lsx_vxori_b(src0, 128); 1038 dst0 = __lsx_vsadd_b(src0, offset); 1039 dst0 = __lsx_vxori_b(dst0, 128); 1040 1041 __lsx_vstelm_w(dst0, dst, 0, 0); 1042 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); 1043 dst += dst_stride_2x; 1044 } 1045 1046 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10); 1047 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1); 1048 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2); 1049 1050 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10, 1051 cmp_minus11); 1052 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1053 diff_minus10, diff_minus11); 1054 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10, 1055 cmp_minus11); 1056 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1057 cmp_minus10, cmp_minus11); 1058 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, 1059 const1, cmp_minus11, diff_minus10, diff_minus11); 1060 1061 offset = __lsx_vadd_b(diff_minus10, diff_minus11); 1062 offset = __lsx_vaddi_bu(offset, 2); 1063 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset, 1064 offset, offset, offset); 1065 src0 = __lsx_vxori_b(src0, 128); 1066 dst0 = __lsx_vsadd_b(src0, offset); 1067 dst0 = __lsx_vxori_b(dst0, 128); 1068 1069 __lsx_vstelm_w(dst0, dst, 0, 0); 1070 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); 1071} 1072 1073static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst, 1074 int32_t dst_stride, 1075 uint8_t *src, 1076 int32_t src_stride, 1077 int16_t *sao_offset_val, 1078 int32_t height) 1079{ 1080 const int32_t src_stride_2x = (src_stride << 1); 1081 const int32_t dst_stride_2x = (dst_stride << 1); 1082 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; 1083 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; 1084 __m128i edge_idx = {0x403000201, 0x0}; 1085 __m128i const1 = __lsx_vldi(1); 1086 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11; 1087 __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11; 1088 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); 1089 __m128i zeros = {0}; 1090 1091 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); 1092 src -= 1; 1093 1094 /* load in advance */ 1095 DUP2_ARG2(__lsx_vld, src, 0, src + 
src_stride, 0, src_minus10, src_minus11); 1096 1097 for (height -= 2; height; height -= 2) { 1098 src += src_stride_2x; 1099 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, 1100 src_minus11, shuf1, src0, src1); 1101 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, 1102 src_minus11, shuf2, src_plus10, src_plus11); 1103 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11, 1104 src_plus10, src_minus10, src_plus10); 1105 src0 = __lsx_vpickev_d(src1, src0); 1106 1107 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, 1108 cmp_minus10, cmp_minus11); 1109 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1110 cmp_minus11, diff_minus10, diff_minus11); 1111 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, 1112 cmp_minus10, cmp_minus11); 1113 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1114 cmp_minus11, cmp_minus10, cmp_minus11); 1115 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, 1116 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); 1117 1118 offset = __lsx_vadd_b(diff_minus10, diff_minus11); 1119 offset = __lsx_vaddi_bu(offset, 2); 1120 1121 /* load in advance */ 1122 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, 1123 src_minus10, src_minus11); 1124 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1125 sao_offset, offset, offset, offset); 1126 src0 = __lsx_vxori_b(src0, 128); 1127 dst0 = __lsx_vsadd_b(src0, offset); 1128 dst0 = __lsx_vxori_b(dst0, 128); 1129 1130 __lsx_vstelm_d(dst0, dst, 0, 0); 1131 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 1132 dst += dst_stride_2x; 1133 } 1134 1135 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11, 1136 shuf1, src0, src1); 1137 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11, 1138 shuf2, src_plus10, src_plus11); 1139 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11, 1140 src_plus10, src_minus10, src_plus10); 1141 src0 = __lsx_vpickev_d(src1, src0); 1142 1143 DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10, 1144 cmp_minus11); 1145 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1146 diff_minus10, diff_minus11); 1147 DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10, 1148 cmp_minus11); 1149 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1150 cmp_minus10, cmp_minus11); 1151 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, 1152 const1, cmp_minus11, diff_minus10, diff_minus11); 1153 1154 offset = __lsx_vadd_b(diff_minus10, diff_minus11); 1155 offset = __lsx_vaddi_bu(offset, 2); 1156 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1157 sao_offset, offset, offset, offset); 1158 src0 = __lsx_vxori_b(src0, 128); 1159 dst0 = __lsx_vsadd_b(src0, offset); 1160 dst0 = __lsx_vxori_b(dst0, 128); 1161 1162 __lsx_vstelm_d(dst0, dst, 0, 0); 1163 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 1164} 1165 1166static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst, 1167 int32_t dst_stride, 1168 uint8_t *src, 1169 int32_t src_stride, 1170 int16_t *sao_offset_val, 1171 int32_t width, 1172 int32_t height) 1173{ 1174 uint8_t *dst_ptr, *src_minus1; 1175 int32_t v_cnt; 1176 const int32_t src_stride_2x = (src_stride << 1); 1177 const int32_t dst_stride_2x = (dst_stride << 1); 1178 const int32_t src_stride_4x = (src_stride << 2); 1179 const int32_t dst_stride_4x = (dst_stride << 2); 1180 const int32_t src_stride_3x = src_stride_2x + 
src_stride; 1181 const int32_t dst_stride_3x = dst_stride_2x + dst_stride; 1182 1183 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09}; 1184 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A}; 1185 __m128i edge_idx = {0x403000201, 0x0}; 1186 __m128i const1 = __lsx_vldi(1); 1187 __m128i sao_offset; 1188 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11; 1189 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12; 1190 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13; 1191 __m128i diff_plus13; 1192 __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3; 1193 __m128i src_minus10, src_minus11, src_minus12, src_minus13; 1194 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3; 1195 __m128i src_zero0, src_zero1, src_zero2, src_zero3; 1196 __m128i src_plus10, src_plus11, src_plus12, src_plus13; 1197 1198 sao_offset = __lsx_vld(sao_offset_val, 0); 1199 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); 1200 1201 for (; height; height -= 4) { 1202 src_minus1 = src - 1; 1203 src_minus10 = __lsx_vld(src_minus1, 0); 1204 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1, 1205 src_stride_2x, src_minus11, src_minus12); 1206 src_minus13 = __lsx_vldx(src_minus1, src_stride_3x); 1207 1208 for (v_cnt = 0; v_cnt < width; v_cnt += 16) { 1209 src_minus1 += 16; 1210 dst_ptr = dst + v_cnt; 1211 src10 = __lsx_vld(src_minus1, 0); 1212 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1, 1213 src_stride_2x, src11, src12); 1214 src13 = __lsx_vldx(src_minus1, src_stride_3x); 1215 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11, 1216 src_minus11, shuf1, src12, src_minus12, shuf1, src13, 1217 src_minus13, shuf1, src_zero0, src_zero1, 1218 src_zero2, src_zero3); 1219 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11, 1220 src_minus11, shuf2, src12, src_minus12, shuf2, src13, 1221 src_minus13, shuf2, src_plus10, src_plus11, 1222 src_plus12, src_plus13); 1223 DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0, 1224 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11, 1225 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11); 1226 DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2, 1227 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13, 1228 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13); 1229 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, 1230 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, 1231 cmp_plus11, diff_minus10, diff_plus10, diff_minus11, 1232 diff_plus11); 1233 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, 1234 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, 1235 cmp_plus13, diff_minus12, diff_plus12, diff_minus13, 1236 diff_plus13); 1237 DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0, 1238 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11, 1239 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11); 1240 DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2, 1241 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13, 1242 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13); 1243 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10, 1244 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11, 1245 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11, 1246 cmp_plus11); 1247 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12, 1248 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13, 1249 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13, 1250 cmp_plus13); 1251 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, 
cmp_minus10, 1252 diff_plus10, const1, cmp_plus10, diff_minus11, const1, 1253 cmp_minus11, diff_plus11, const1, cmp_plus11, 1254 diff_minus10, diff_plus10, diff_minus11, diff_plus11); 1255 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12, 1256 diff_plus12, const1, cmp_plus12, diff_minus13, const1, 1257 cmp_minus13, diff_plus13, const1, cmp_plus13, 1258 diff_minus12, diff_plus12, diff_minus13, diff_plus13); 1259 1260 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11, 1261 diff_plus11, diff_minus12, diff_plus12, diff_minus13, 1262 diff_plus13, offset_mask0, offset_mask1, offset_mask2, 1263 offset_mask3); 1264 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, 1265 offset_mask2, 2, offset_mask3, 2, offset_mask0, 1266 offset_mask1, offset_mask2, offset_mask3); 1267 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0, 1268 sao_offset, sao_offset, offset_mask0, offset_mask0, 1269 offset_mask0); 1270 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1, 1271 sao_offset, sao_offset, offset_mask1, offset_mask1, 1272 offset_mask1); 1273 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2, 1274 sao_offset, sao_offset, offset_mask2, offset_mask2, 1275 offset_mask2); 1276 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3, 1277 sao_offset, sao_offset, offset_mask3, offset_mask3, 1278 offset_mask3); 1279 1280 DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, 1281 src_zero2, 128, src_zero3, 128, src_zero0, src_zero1, 1282 src_zero2, src_zero3); 1283 DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1, 1284 offset_mask1, src_zero2, offset_mask2, src_zero3, 1285 offset_mask3, dst0, dst1, dst2, dst3); 1286 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 1287 128, dst0, dst1, dst2, dst3); 1288 1289 src_minus10 = src10; 1290 src_minus11 = src11; 1291 src_minus12 = src12; 1292 src_minus13 = src13; 1293 1294 __lsx_vst(dst0, dst_ptr, 0); 1295 __lsx_vst(dst1, dst_ptr + dst_stride, 0); 1296 __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0); 1297 __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0); 1298 } 1299 src += src_stride_4x; 1300 dst += dst_stride_4x; 1301 } 1302} 1303 1304static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst, 1305 int32_t dst_stride, 1306 uint8_t *src, 1307 int32_t src_stride, 1308 int16_t *sao_offset_val, 1309 int32_t height) 1310{ 1311 const int32_t src_stride_2x = (src_stride << 1); 1312 const int32_t dst_stride_2x = (dst_stride << 1); 1313 __m128i edge_idx = {0x403000201, 0x0}; 1314 __m128i const1 = __lsx_vldi(1); 1315 __m128i dst0; 1316 __m128i sao_offset = __lsx_vld(sao_offset_val, 0); 1317 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; 1318 __m128i src_minus10, src_minus11, src10, src11; 1319 __m128i src_zero0, src_zero1; 1320 __m128i offset; 1321 __m128i offset_mask0, offset_mask1; 1322 1323 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); 1324 1325 /* load in advance */ 1326 DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0, 1327 src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11); 1328 1329 for (height -= 2; height; height -= 2) { 1330 src += src_stride_2x; 1331 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, 1332 src11, src_minus11, src10, src10, src_minus10, src_zero0, 1333 src_minus11, src_zero1); 1334 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, 1335 cmp_minus10, cmp_minus11); 1336 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1337 cmp_minus11, diff_minus10, diff_minus11); 
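        /* The vseq/vsle/vnor/vbitsel sequence computes sign(center - neighbour)
         * per byte: 0 where equal, 1 where center > neighbour, 0xff (-1)
         * otherwise; the two signs plus 2 give the SAO edge class 0..4. */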
1338 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, 1339 src_minus11, cmp_minus10, cmp_minus11); 1340 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1341 cmp_minus11, cmp_minus10, cmp_minus11); 1342 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, 1343 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); 1344 1345 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, 1346 diff_minus11, offset_mask0, offset_mask1); 1347 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, 1348 offset_mask0, offset_mask1); 1349 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, 1350 src_zero0, offset, dst0); 1351 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1352 sao_offset, offset, offset, offset); 1353 1354 dst0 = __lsx_vxori_b(dst0, 128); 1355 dst0 = __lsx_vsadd_b(dst0, offset); 1356 dst0 = __lsx_vxori_b(dst0, 128); 1357 src_minus10 = src10; 1358 src_minus11 = src11; 1359 1360 /* load in advance */ 1361 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 1362 src10, src11); 1363 1364 __lsx_vstelm_w(dst0, dst, 0, 0); 1365 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); 1366 dst += dst_stride_2x; 1367 } 1368 1369 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, 1370 src11, src_minus11, src10, src10, src_minus10, src_zero0, 1371 src_minus11, src_zero1); 1372 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, 1373 cmp_minus10, cmp_minus11); 1374 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1375 diff_minus10, diff_minus11); 1376 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, 1377 cmp_minus10, cmp_minus11); 1378 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1379 cmp_minus10, cmp_minus11); 1380 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, 1381 const1, cmp_minus11, diff_minus10, diff_minus11); 1382 1383 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, 1384 diff_minus11, offset_mask0, offset_mask1); 1385 DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2, 1386 offset_mask0, offset_mask1); 1387 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, 1388 src_zero0, offset, dst0); 1389 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1390 sao_offset, offset, offset, offset); 1391 dst0 = __lsx_vxori_b(dst0, 128); 1392 dst0 = __lsx_vsadd_b(dst0, offset); 1393 dst0 = __lsx_vxori_b(dst0, 128); 1394 1395 __lsx_vstelm_w(dst0, dst, 0, 0); 1396 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2); 1397} 1398 1399static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst, 1400 int32_t dst_stride, 1401 uint8_t *src, 1402 int32_t src_stride, 1403 int16_t *sao_offset_val, 1404 int32_t height) 1405{ 1406 const int32_t src_stride_2x = (src_stride << 1); 1407 const int32_t dst_stride_2x = (dst_stride << 1); 1408 __m128i edge_idx = {0x403000201, 0x0}; 1409 __m128i const1 = __lsx_vldi(1); 1410 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0); 1411 __m128i src_zero0, src_zero1, dst0; 1412 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; 1413 __m128i src_minus10, src_minus11, src10, src11; 1414 __m128i offset_mask0, offset_mask1; 1415 1416 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset); 1417 1418 /* load in advance */ 1419 DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11); 1420 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11); 
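    /* Two output rows are produced per iteration; the final pair is handled
     * after the loop with the already loaded src10/src11. */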
1421 1422 for (height -= 2; height; height -= 2) { 1423 src += src_stride_2x; 1424 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, 1425 src11, src_minus11, src10, src10, src_minus10, src_zero0, 1426 src_minus11, src_zero1); 1427 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, 1428 cmp_minus10, cmp_minus11); 1429 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1430 cmp_minus11, diff_minus10, diff_minus11); 1431 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, 1432 src_minus11, cmp_minus10, cmp_minus11); 1433 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, 1434 cmp_minus11, cmp_minus10, cmp_minus11); 1435 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, 1436 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11); 1437 1438 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, 1439 diff_minus11, offset_mask0, offset_mask1); 1440 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, 1441 offset_mask0, offset_mask1); 1442 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, 1443 src_zero0, offset, dst0); 1444 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1445 sao_offset, offset, offset, offset); 1446 1447 dst0 = __lsx_vxori_b(dst0, 128); 1448 dst0 = __lsx_vsadd_b(dst0, offset); 1449 dst0 = __lsx_vxori_b(dst0, 128); 1450 src_minus10 = src10; 1451 src_minus11 = src11; 1452 1453 /* load in advance */ 1454 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, 1455 src10, src11); 1456 1457 __lsx_vstelm_d(dst0, dst, 0, 0); 1458 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 1459 dst += dst_stride_2x; 1460 } 1461 1462 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11, 1463 src11, src_minus11, src10, src10, src_minus10, src_zero0, 1464 src_minus11, src_zero1); 1465 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11, 1466 cmp_minus10, cmp_minus11); 1467 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1468 diff_minus10, diff_minus11); 1469 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11, 1470 cmp_minus10, cmp_minus11); 1471 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11, 1472 cmp_minus10, cmp_minus11); 1473 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11, 1474 const1, cmp_minus11, diff_minus10, diff_minus11); 1475 1476 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11, 1477 diff_minus11, offset_mask0, offset_mask1); 1478 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, 1479 offset_mask0, offset_mask1); 1480 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1, 1481 src_zero0, offset, dst0); 1482 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, 1483 sao_offset, offset, offset, offset); 1484 dst0 = __lsx_vxori_b(dst0, 128); 1485 dst0 = __lsx_vsadd_b(dst0, offset); 1486 dst0 = __lsx_vxori_b(dst0, 128); 1487 1488 __lsx_vstelm_d(dst0, dst, 0, 0); 1489 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1); 1490} 1491 1492static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst, 1493 int32_t dst_stride, 1494 uint8_t *src, 1495 int32_t src_stride, 1496 int16_t * 1497 sao_offset_val, 1498 int32_t width, 1499 int32_t height) 1500{ 1501 uint8_t *src_orig = src; 1502 uint8_t *dst_orig = dst; 1503 int32_t h_cnt, v_cnt; 1504 const int32_t src_stride_2x = (src_stride << 1); 1505 const int32_t dst_stride_2x = (dst_stride << 1); 1506 const int32_t 
static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, dst2, src13, dst3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
                  src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
                      src, src_stride_3x, src, src_stride_4x,
                      src10, src11, src12, src13);
            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
                      src10, src10, src_minus11, src10, src11, cmp_minus10,
                      cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0,
                      offset_mask0, offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            src_minus10 = src12;
            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
                      src12, 128, src_minus11, src10, src11, src12);
            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
                      offset_mask1, src11, offset_mask2, src12,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            __lsx_vst(dst0, dst, 0);
            __lsx_vstx(dst1, dst, dst_stride);
            __lsx_vstx(dst2, dst, dst_stride_2x);
            __lsx_vstx(dst3, dst, dst_stride_3x);
            src += src_stride_4x;
            dst += dst_stride_4x;
        }
    }
}

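/* Diagonal (45 degree) SAO edge offset for 4-pixel-wide blocks.  The
 * neighbours of pixel (x, y) are (x - 1, y - 1) and (x + 1, y + 1); the rows
 * are loaded at src - 1 and realigned with shuf1/shuf2 so that centre and
 * neighbour pixels sit in the same byte lanes.  Two rows are filtered per
 * iteration and stored as 4-byte elements. */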
static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus11, src10, src11;
    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus0, src_plus1);

        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
                  src_zero1, src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus0, src_plus1);

    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
}

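/* 45 degree SAO edge offset for 8-pixel-wide blocks: same classification as
 * the 4-wide version above, but two full 8-pixel rows are interleaved into
 * one vector and stored as 8-byte elements. */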
static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
              src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
                  src_plus10, src_plus11);

        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
                  src_minus11, src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
              src_plus10, src_plus11);
    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);

    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    /* load in advance */
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

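/* 45 degree SAO edge offset for widths that are a multiple of 16.  Four rows
 * of 16 pixels are classified per iteration; shuf1/shuf2 rebuild the centre
 * and lower-right neighbour vectors from the previous, current and following
 * rows before the per-pixel offsets are applied with saturating adds. */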
static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    __m128i diff_plus13, src_minus14, src_plus13;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
    __m128i src_zero3, sao_offset, src_plus12;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_minus12, src_minus13);
        src_minus14 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 0);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
                      src_stride_2x, src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);
            src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_minus12, shuf1, src12, src_minus13, shuf1,
                      src13, src_minus14, shuf1, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
                      src_minus13, shuf2, src_plus10, src_plus11);
            src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10,
                      cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12,
                      cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
                      128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }
        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}

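/* 135 degree SAO edge offset for 4-pixel-wide blocks: the neighbours are the
 * upper-right (x + 1, y - 1) and lower-left (x - 1, y + 1) pixels; apart from
 * the neighbour selection the processing matches the 45 degree case. */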
static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i src_zero0, src_zero1, dst0;
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_w(dst0, dst, 0, 0);
        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
              cmp_minus11, diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_w(dst0, dst, 0, 0);
    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
    dst += dst_stride_2x;
}

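/* 135 degree SAO edge offset for 8-pixel-wide blocks; two 8-pixel rows are
 * processed per iteration and stored as 8-byte elements. */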
static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    __m128i src_minus10, src10, src_minus11, src11;
    __m128i src_zero0, src_zero1, dst0;
    __m128i offset_mask0, offset_mask1;
    __m128i zeros = {0};

    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
              src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += src_stride_2x;

        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
                  shuf1, src_zero0, src_zero1);
        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
                  shuf2, src_minus10, src_minus11);

        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
                  src_minus10, src_minus11);
        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
                  src_zero0, src_zero1);
        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
                  cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, diff_minus10, diff_minus11);
        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
                  src_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
                  cmp_minus11, cmp_minus10, cmp_minus11);
        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);

        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
                  diff_minus11, offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
                  offset_mask0, offset_mask1);
        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
                  src_zero0, offset, dst0);
        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
                  sao_offset, offset, offset, offset);
        dst0 = __lsx_vxori_b(dst0, 128);
        dst0 = __lsx_vsadd_b(dst0, offset);
        dst0 = __lsx_vxori_b(dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src10, src11);

        __lsx_vstelm_d(dst0, dst, 0, 0);
        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
        dst += dst_stride_2x;
    }

    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
              src_zero0, src_zero1);
    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
              shuf2, src_minus10, src_minus11);

    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
              src_minus10, src_minus11);
    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
              src_zero0, src_zero1);
    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              diff_minus10, diff_minus11);
    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
              cmp_minus10, cmp_minus11);
    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
              const1, cmp_minus11, diff_minus10, diff_minus11);

    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
              diff_minus11, offset_mask0, offset_mask1);
    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
              offset_mask1);
    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
              src_zero0, offset, dst0);
    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
              sao_offset, offset, offset, offset);
    dst0 = __lsx_vxori_b(dst0, 128);
    dst0 = __lsx_vsadd_b(dst0, offset);
    dst0 = __lsx_vxori_b(dst0, 128);

    __lsx_vstelm_d(dst0, dst, 0, 0);
    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
}

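/* 135 degree SAO edge offset for widths that are a multiple of 16, working on
 * 16x4 pixel blocks per inner-loop iteration. */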
static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t v_cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
    __m128i edge_idx = {0x403000201, 0x0};
    __m128i const1 = __lsx_vldi(1);
    __m128i dst0, dst1, dst2, dst3;
    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = __lsx_vld(sao_offset_val, 0);
    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;

        src_minus11 = __lsx_vld(src_orig, 0);
        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                  src_plus10, src_plus11);
        src_plus12 = __lsx_vldx(src_orig, src_stride_3x);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = __lsx_vld(src_orig - src_stride, 2);
            src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
            src_orig += 16;
            src10 = __lsx_vld(src_orig, 0);
            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
                      src11, src12);
            src13 = __lsx_vldx(src_orig, src_stride_3x);

            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
                      src_plus10, shuf1, src12, src_plus11, shuf1, src13,
                      src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
                      src_zero3);
            src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
            DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
                      src_plus11, shuf2, src_minus12, src_minus13);

            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1,
                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3,
                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13);
            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
                      cmp_plus11);
            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
                      cmp_plus13);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
                      cmp_minus11, diff_plus11, const1, cmp_plus11,
                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
                      cmp_minus13, diff_plus13, const1, cmp_plus13,
                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);

            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
                      offset_mask3);
            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
                      offset_mask1, offset_mask2, offset_mask3);

            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
                      sao_offset, sao_offset, offset_mask0, offset_mask0,
                      offset_mask0);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
                      sao_offset, sao_offset, offset_mask1, offset_mask1,
                      offset_mask1);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
                      sao_offset, sao_offset, offset_mask2, offset_mask2,
                      offset_mask2);
            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
                      sao_offset, sao_offset, offset_mask3, offset_mask3,
                      offset_mask3);

            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
                      src_zero2, src_zero3);
            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
                      offset_mask1, src_zero2, offset_mask2, src_zero3,
                      offset_mask3, dst0, dst1, dst2, dst3);
            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
                      128, dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            __lsx_vst(dst0, dst_orig, 0);
            __lsx_vstx(dst1, dst_orig, dst_stride);
            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
            dst_orig += 16;
        }

        src += src_stride_4x;
        dst += dst_stride_4x;
    }
}

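/* Exported 8-bit SAO edge filter.  Per pixel this is roughly equivalent to
 * the scalar sketch below, where pos0/pos1 stand for the per-class neighbour
 * displacements (illustrative names, not identifiers used in this file):
 *
 *     a = sign(src[x] - src[x + pos0]);
 *     b = sign(src[x] - src[x + pos1]);
 *     dst[x] = av_clip_uint8(src[x] + sao_offset_val[edge_idx[2 + a + b]]);
 *
 * with edge_idx[] = { 1, 2, 0, 3, 4 } and the neighbour pair chosen by the
 * edge-offset class 'eo' (horizontal, vertical, 45 degree, 135 degree).  The
 * source is the intermediate SAO buffer, whose stride is fixed at
 * 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE.  The width is split into a
 * multiple-of-16 part, an 8-pixel part and a 4-pixel remainder, each handled
 * by one of the helpers above. */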
void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width & 0x0F),
                                                        height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width & 0x0F),
                                                         height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width & 0x0F),
                                                          height);
            dst += width & 0xFFFFFFF0;
            src += width & 0xFFFFFFF0;
            width &= 0x0F;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width &= 0x07;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}