/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Jin Bo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "libavutil/common.h"
#include "vp9dsp_loongarch.h"

#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
                 _in3, _in4, _in5, _in6, _in7) \
{ \
    _in0 = __lsx_vld(_src, 0); \
    _in1 = __lsx_vldx(_src, _stride); \
    _in2 = __lsx_vldx(_src, _stride2); \
    _in3 = __lsx_vldx(_src, _stride3); \
    _src += _stride4; \
    _in4 = __lsx_vld(_src, 0); \
    _in5 = __lsx_vldx(_src, _stride); \
    _in6 = __lsx_vldx(_src, _stride2); \
    _in7 = __lsx_vldx(_src, _stride3); \
}

#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, \
                 _dst, _stride, _stride2, _stride3, _stride4) \
{ \
    __lsx_vst(_dst0, _dst, 0); \
    __lsx_vstx(_dst1, _dst, _stride); \
    __lsx_vstx(_dst2, _dst, _stride2); \
    __lsx_vstx(_dst3, _dst, _stride3); \
    _dst += _stride4; \
    __lsx_vst(_dst4, _dst, 0); \
    __lsx_vstx(_dst5, _dst, _stride); \
    __lsx_vstx(_dst6, _dst, _stride2); \
    __lsx_vstx(_dst7, _dst, _stride3); \
}

#define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
                           p1_dst, p0_dst, q0_dst, q1_dst) \
{ \
    __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2; \
    const __m128i cnst3b = __lsx_vldi(3); \
    const __m128i cnst4b = __lsx_vldi(4); \
    \
    p1_tmp = __lsx_vxori_b(p1_src, 0x80); \
    p0_tmp = __lsx_vxori_b(p0_src, 0x80); \
    q0_tmp = __lsx_vxori_b(q0_src, 0x80); \
    q1_tmp = __lsx_vxori_b(q1_src, 0x80); \
    \
    filt = __lsx_vssub_b(p1_tmp, q1_tmp); \
    \
    filt = filt & hev_src; \
    \
    q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = filt & mask_src; \
    \
    filt1 = __lsx_vsadd_b(filt, cnst4b); \
    filt1 = __lsx_vsrai_b(filt1, 3); \
    \
    filt2 = __lsx_vsadd_b(filt, cnst3b); \
    filt2 = __lsx_vsrai_b(filt2, 3); \
    \
    q0_tmp = __lsx_vssub_b(q0_tmp, filt1); \
    q0_dst = __lsx_vxori_b(q0_tmp, 0x80); \
    p0_tmp = __lsx_vsadd_b(p0_tmp, filt2); \
    p0_dst = __lsx_vxori_b(p0_tmp, 0x80); \
    \
    filt = __lsx_vsrari_b(filt1, 1); \
    hev_src = __lsx_vxori_b(hev_src, 0xff); \
    filt = filt & hev_src; \
    \
    q1_tmp = __lsx_vssub_b(q1_tmp, filt); \
    q1_dst = __lsx_vxori_b(q1_tmp, 0x80); \
    p1_tmp = __lsx_vsadd_b(p1_tmp, filt); \
    p1_dst = __lsx_vxori_b(p1_tmp, 0x80); \
}
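/* VP9_FLAT4 turns the incoming flat value (max(|p1 - p0|, |q1 - q0|) left
 * there by LPF_MASK_HEV) into the "flat" mask that selects the wider
 * filter8 path: a lane is selected only when that value and |p2 - p0|,
 * |q2 - q0|, |p3 - p0|, |q3 - q0| are all <= 1. Note that the result is
 * also ANDed with the variable `mask` from the enclosing scope, so
 * LPF_MASK_HEV must have been evaluated first. */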
#define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst) \
{ \
    __m128i f_tmp = __lsx_vldi(1); \
    __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
    \
    p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src); \
    q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src); \
    p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src); \
    q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src); \
    \
    p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0); \
    flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst); \
    p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0); \
    flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst); \
    \
    flat_dst = __lsx_vslt_bu(f_tmp, flat_dst); \
    flat_dst = __lsx_vxori_b(flat_dst, 0xff); \
    flat_dst = flat_dst & mask; \
}

#define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
                  q5_src, q6_src, q7_src, flat_src, flat2_dst) \
{ \
    __m128i f_tmp = __lsx_vldi(1); \
    __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
    __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
    \
    p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src); \
    q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src); \
    p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src); \
    q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src); \
    p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src); \
    q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src); \
    p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src); \
    q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src); \
    \
    p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst); \
    p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst); \
    p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst); \
    \
    flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst); \
    flat2_dst = __lsx_vxori_b(flat2_dst, 0xff); \
    flat2_dst = flat2_dst & flat_src; \
}

#define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src, \
                    q0_src, q1_src, q2_src, q3_src, \
                    p2_filt8_dst, p1_filt8_dst, p0_filt8_dst, \
                    q0_filt8_dst, q1_filt8_dst, q2_filt8_dst) \
{ \
    __m128i tmp0, tmp1, tmp2; \
    \
    tmp2 = __lsx_vadd_h(p2_src, p1_src); \
    tmp2 = __lsx_vadd_h(tmp2, p0_src); \
    tmp0 = __lsx_vslli_h(p3_src, 1); \
    \
    tmp0 = __lsx_vadd_h(tmp0, tmp2); \
    tmp0 = __lsx_vadd_h(tmp0, q0_src); \
    tmp1 = __lsx_vadd_h(tmp0, p3_src); \
    tmp1 = __lsx_vadd_h(tmp1, p2_src); \
    p2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vadd_h(tmp0, p1_src); \
    tmp1 = __lsx_vadd_h(tmp1, q1_src); \
    p1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vadd_h(q2_src, q1_src); \
    tmp1 = __lsx_vadd_h(tmp1, q0_src); \
    tmp2 = __lsx_vadd_h(tmp2, tmp1); \
    tmp0 = __lsx_vadd_h(tmp2, p0_src); \
    tmp0 = __lsx_vadd_h(tmp0, p3_src); \
    p0_filt8_dst = __lsx_vsrari_h(tmp0, 3); \
    \
    tmp0 = __lsx_vadd_h(q2_src, q3_src); \
    tmp0 = __lsx_vadd_h(tmp0, p0_src); \
    tmp0 = __lsx_vadd_h(tmp0, tmp1); \
    tmp1 = __lsx_vadd_h(q3_src, q3_src); \
    tmp1 = __lsx_vadd_h(tmp1, tmp0); \
    q2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp0 = __lsx_vadd_h(tmp2, q3_src); \
    tmp1 = __lsx_vadd_h(tmp0, q0_src); \
    q0_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vsub_h(tmp0, p2_src); \
    tmp0 = __lsx_vadd_h(q1_src, q3_src); \
    tmp1 = __lsx_vadd_h(tmp0, tmp1); \
    q1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
}
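/* LPF_MASK_HEV computes the two per-lane decisions used by every filter
 * below: hev_dst ("high edge variance") is set where max(|p1 - p0|,
 * |q1 - q0|) exceeds thresh, and mask_dst is set where the edge passes the
 * standard VP9 filter test (2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit and
 * every neighbouring one-pixel difference <= limit). flat_dst is left
 * holding max(|p1 - p0|, |q1 - q0|) for reuse by VP9_FLAT4. */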
#define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, \
                     q2_src, q3_src, limit_src, b_limit_src, thresh_src, \
                     hev_dst, mask_dst, flat_dst) \
{ \
    __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
    __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
    \
    /* absolute subtraction of pixel values */ \
    p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src); \
    p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src); \
    p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src); \
    q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src); \
    q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src); \
    q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src); \
    p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src); \
    p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src); \
    \
    /* calculation of hev */ \
    flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp); \
    hev_dst = __lsx_vslt_bu(thresh_src, flat_dst); \
    \
    /* calculation of mask */ \
    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp); \
    p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1); \
    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp); \
    \
    mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp); \
    mask_dst = __lsx_vmax_bu(flat_dst, mask_dst); \
    p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp); \
    mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst); \
    q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp); \
    mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst); \
    \
    mask_dst = __lsx_vslt_bu(limit_src, mask_dst); \
    mask_dst = __lsx_vxori_b(mask_dst, 0xff); \
}

void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i mask, hev, flat, thresh, b_limit, limit;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
              dst, -stride, p3, p2, p1, p0);
    q0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
    q3 = __lsx_vldx(dst, stride3);

    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit   = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
    __lsx_vstelm_d(p0_out, dst - stride,  0, 0);
    __lsx_vstelm_d(q0_out, dst,           0, 0);
    __lsx_vstelm_d(q1_out, dst + stride,  0, 0);
}
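/* The *_44_16, *_88_16, *_84_16 and *_48_16 variants below filter two
 * adjacent 8-pixel edges in one pass, so thresh_ptr, b_limit_ptr and
 * limit_ptr each carry two packed per-edge byte values: bits 0-7 for the
 * first edge and bits 8-15 for the second, which are splatted and then
 * combined with __lsx_vilvl_d into a single 128-bit vector. */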
void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i mask, hev, flat, thresh0, b_limit0;
    __m128i limit0, thresh1, b_limit1, limit1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;

    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
              dst, -stride, p3, p2, p1, p0);
    q0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
    q3 = __lsx_vldx(dst, stride3);

    thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
    thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
    thresh0 = __lsx_vilvl_d(thresh1, thresh0);

    b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
    b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
    b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

    limit0 = __lsx_vreplgr2vr_b(limit_ptr);
    limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
    limit0 = __lsx_vilvl_d(limit1, limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    __lsx_vst(p1, dst - stride2, 0);
    __lsx_vst(p0, dst - stride,  0);
    __lsx_vst(q0, dst,           0);
    __lsx_vst(q1, dst + stride,  0);
}

void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i mask, hev, flat, thresh, b_limit, limit;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p2_filter8, p1_filter8, p0_filter8;
    __m128i q0_filter8, q1_filter8, q2_filter8;
    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
    __m128i zero = __lsx_vldi(0);

    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
              dst, -stride, p3, p2, p1, p0);
    q0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
    q3 = __lsx_vldx(dst, stride3);

    thresh  = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit   = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = __lsx_vilvl_d(zero, flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__lsx_bz_v(flat)) {
        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
        __lsx_vstelm_d(p0_out, dst - stride,  0, 0);
        __lsx_vstelm_d(q0_out, dst,           0, 0);
        __lsx_vstelm_d(q1_out, dst + stride,  0, 0);
    } else {
        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
                  p3_l, p2_l, p1_l, p0_l);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
                  q0_l, q1_l, q2_l, q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
                  zero, p0_filter8, zero, q0_filter8, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8);
        DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
                  q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);

        __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
        __lsx_vstelm_d(p0_out, dst - stride,  0, 0);
        __lsx_vstelm_d(q0_out, dst, 0, 0);
        __lsx_vstelm_d(q1_out, dst + stride,  0, 0);
        __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
    }
}

void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
375 int32_t b_limit_ptr, 376 int32_t limit_ptr, 377 int32_t thresh_ptr) 378{ 379 ptrdiff_t stride2 = stride << 1; 380 ptrdiff_t stride3 = stride2 + stride; 381 ptrdiff_t stride4 = stride2 << 1; 382 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 383 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 384 __m128i flat, mask, hev, tmp, thresh, b_limit, limit; 385 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 386 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 387 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 388 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 389 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 390 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 391 __m128i zero = __lsx_vldi(0); 392 393 /* load vector elements */ 394 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, 395 dst, -stride, p3, p2, p1, p0); 396 q0 = __lsx_vld(dst, 0); 397 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 398 q3 = __lsx_vldx(dst, stride3); 399 400 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 401 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 402 thresh = __lsx_vilvl_d(tmp, thresh); 403 404 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 405 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 406 b_limit = __lsx_vilvl_d(tmp, b_limit); 407 408 limit = __lsx_vreplgr2vr_b(limit_ptr); 409 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8); 410 limit = __lsx_vilvl_d(tmp, limit); 411 412 /* mask and hev */ 413 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 414 hev, mask, flat); 415 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 416 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 417 q1_out); 418 419 /* if flat is zero for all pixels, then no need to calculate other filter */ 420 if (__lsx_bz_v(flat)) { 421 __lsx_vst(p1_out, dst - stride2, 0); 422 __lsx_vst(p0_out, dst - stride, 0); 423 __lsx_vst(q0_out, dst, 0); 424 __lsx_vst(q1_out, dst + stride, 0); 425 } else { 426 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 427 p3_l, p2_l, p1_l, p0_l); 428 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 429 q0_l, q1_l, q2_l, q3_l); 430 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 431 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 432 433 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 434 p3_h, p2_h, p1_h, p0_h); 435 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 436 q0_h, q1_h, q2_h, q3_h); 437 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 438 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 439 440 /* convert 16 bit output data into 8 bit */ 441 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, 442 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, 443 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 444 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, 445 q2_filt8_l, q1_filt8_l, q2_filt8_l); 446 447 /* store pixel values */ 448 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 449 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 450 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 451 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 452 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 453 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 454 455 456 __lsx_vstx(p2_out, dst, -stride3); 457 __lsx_vstx(p1_out, dst, -stride2); 458 __lsx_vstx(p0_out, dst, -stride); 459 __lsx_vst(q0_out, dst, 0); 460 __lsx_vstx(q1_out, dst, stride); 461 __lsx_vstx(q2_out, dst, stride2); 462 } 463} 464 465void 
ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride, 466 int32_t b_limit_ptr, 467 int32_t limit_ptr, 468 int32_t thresh_ptr) 469{ 470 ptrdiff_t stride2 = stride << 1; 471 ptrdiff_t stride3 = stride2 + stride; 472 ptrdiff_t stride4 = stride2 << 1; 473 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 474 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 475 __m128i flat, mask, hev, tmp, thresh, b_limit, limit; 476 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 477 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 478 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 479 __m128i zero = __lsx_vldi(0); 480 481 /* load vector elements */ 482 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, 483 dst, -stride, p3, p2, p1, p0); 484 q0 = __lsx_vld(dst, 0); 485 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 486 q3 = __lsx_vldx(dst, stride3); 487 488 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 489 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 490 thresh = __lsx_vilvl_d(tmp, thresh); 491 492 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 493 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 494 b_limit = __lsx_vilvl_d(tmp, b_limit); 495 496 limit = __lsx_vreplgr2vr_b(limit_ptr); 497 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8); 498 limit = __lsx_vilvl_d(tmp, limit); 499 500 /* mask and hev */ 501 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 502 hev, mask, flat); 503 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 504 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 505 q1_out); 506 507 flat = __lsx_vilvl_d(zero, flat); 508 509 /* if flat is zero for all pixels, then no need to calculate other filter */ 510 if (__lsx_bz_v(flat)) { 511 __lsx_vstx(p1_out, dst, -stride2); 512 __lsx_vstx(p0_out, dst, -stride); 513 __lsx_vst(q0_out, dst, 0); 514 __lsx_vstx(q1_out, dst, stride); 515 } else { 516 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 517 p3_l, p2_l, p1_l, p0_l); 518 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 519 q0_l, q1_l, q2_l, q3_l); 520 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 521 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 522 523 /* convert 16 bit output data into 8 bit */ 524 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, 525 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, 526 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 527 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, 528 q2_filt8_l, q1_filt8_l, q2_filt8_l); 529 530 /* store pixel values */ 531 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 532 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 533 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 534 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 535 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 536 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 537 538 __lsx_vstx(p2_out, dst, -stride3); 539 __lsx_vstx(p1_out, dst, -stride2); 540 __lsx_vstx(p0_out, dst, -stride); 541 __lsx_vst(q0_out, dst, 0); 542 __lsx_vstx(q1_out, dst, stride); 543 __lsx_vstx(q2_out, dst, stride2); 544 } 545} 546 547void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride, 548 int32_t b_limit_ptr, 549 int32_t limit_ptr, 550 int32_t thresh_ptr) 551{ 552 ptrdiff_t stride2 = stride << 1; 553 ptrdiff_t stride3 = stride2 + stride; 554 ptrdiff_t stride4 = stride2 << 1; 555 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 556 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 557 __m128i flat, mask, hev, tmp, thresh, b_limit, limit; 
558 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 559 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 560 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 561 __m128i zero = { 0 }; 562 563 /* load vector elements */ 564 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, 565 dst, -stride, p3, p2, p1, p0); 566 q0 = __lsx_vld(dst, 0); 567 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 568 q3 = __lsx_vldx(dst, stride3); 569 570 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 571 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 572 thresh = __lsx_vilvl_d(tmp, thresh); 573 574 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 575 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 576 b_limit = __lsx_vilvl_d(tmp, b_limit); 577 578 limit = __lsx_vreplgr2vr_b(limit_ptr); 579 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8); 580 limit = __lsx_vilvl_d(tmp, limit); 581 582 /* mask and hev */ 583 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 584 hev, mask, flat); 585 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 586 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 587 q1_out); 588 589 flat = __lsx_vilvh_d(flat, zero); 590 591 /* if flat is zero for all pixels, then no need to calculate other filter */ 592 if (__lsx_bz_v(flat)) { 593 __lsx_vstx(p1_out, dst, -stride2); 594 __lsx_vstx(p0_out, dst, -stride); 595 __lsx_vst(q0_out, dst, 0); 596 __lsx_vstx(q1_out, dst, stride); 597 } else { 598 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 599 p3_h, p2_h, p1_h, p0_h); 600 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 601 q0_h, q1_h, q2_h, q3_h); 602 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 603 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 604 605 /* convert 16 bit output data into 8 bit */ 606 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h, 607 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h, 608 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h); 609 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h, 610 q2_filt8_h, q1_filt8_h, q2_filt8_h); 611 612 /* store pixel values */ 613 p2_out = __lsx_vbitsel_v(p2, p2_filt8_h, flat); 614 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat); 615 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat); 616 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat); 617 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat); 618 q2_out = __lsx_vbitsel_v(q2, q2_filt8_h, flat); 619 620 __lsx_vstx(p2_out, dst, -stride3); 621 __lsx_vstx(p1_out, dst, -stride2); 622 __lsx_vstx(p0_out, dst, -stride); 623 __lsx_vst(q0_out, dst, 0); 624 __lsx_vstx(q1_out, dst, stride); 625 __lsx_vstx(q2_out, dst, stride2); 626 } 627} 628 629static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride, 630 uint8_t *filter48, 631 int32_t b_limit_ptr, 632 int32_t limit_ptr, 633 int32_t thresh_ptr) 634{ 635 ptrdiff_t stride2 = stride << 1; 636 ptrdiff_t stride3 = stride2 + stride; 637 ptrdiff_t stride4 = stride2 << 1; 638 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 639 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 640 __m128i flat, mask, hev, thresh, b_limit, limit; 641 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 642 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 643 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 644 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 645 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 646 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 647 __m128i zero = __lsx_vldi(0); 648 649 /* load vector elements */ 650 
DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, 651 dst, -stride, p3, p2, p1, p0); 652 q0 = __lsx_vld(dst, 0); 653 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 654 q3 = __lsx_vldx(dst, stride3); 655 656 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 657 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 658 limit = __lsx_vreplgr2vr_b(limit_ptr); 659 660 /* mask and hev */ 661 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 662 hev, mask, flat); 663 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 664 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 665 q1_out); 666 667 /* if flat is zero for all pixels, then no need to calculate other filter */ 668 if (__lsx_bz_v(flat)) { 669 __lsx_vstx(p1_out, dst, -stride2); 670 __lsx_vstx(p0_out, dst, -stride); 671 __lsx_vst(q0_out, dst, 0); 672 __lsx_vstx(q1_out, dst, stride); 673 return 1; 674 } else { 675 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 676 p3_l, p2_l, p1_l, p0_l); 677 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 678 q0_l, q1_l, q2_l, q3_l); 679 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 680 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 681 682 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 683 p3_h, p2_h, p1_h, p0_h); 684 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 685 q0_h, q1_h, q2_h, q3_h); 686 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 687 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 688 689 /* convert 16 bit output data into 8 bit */ 690 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, 691 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, 692 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 693 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, 694 q2_filt8_l, q1_filt8_l, q2_filt8_l); 695 696 /* store pixel values */ 697 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 698 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 699 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 700 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 701 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 702 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 703 704 __lsx_vst(p2_out, filter48, 0); 705 __lsx_vst(p1_out, filter48, 16); 706 __lsx_vst(p0_out, filter48, 32); 707 __lsx_vst(q0_out, filter48, 48); 708 __lsx_vst(q1_out, filter48, 64); 709 __lsx_vst(q2_out, filter48, 80); 710 __lsx_vst(flat, filter48, 96); 711 712 return 0; 713 } 714} 715 716static void vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride, 717 uint8_t *filter48) 718{ 719 ptrdiff_t stride2 = stride << 1; 720 ptrdiff_t stride3 = stride2 + stride; 721 ptrdiff_t stride4 = stride2 << 1; 722 uint8_t *dst_tmp = dst - stride4; 723 uint8_t *dst_tmp1 = dst + stride4; 724 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 725 __m128i flat, flat2, filter8; 726 __m128i zero = __lsx_vldi(0); 727 __m128i out_h, out_l; 728 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 729 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 730 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 731 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 732 v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; 733 v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; 734 v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; 735 v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; 736 v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; 737 738 flat = __lsx_vld(filter48, 96); 739 740 DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, 
-stride3, dst_tmp, 741 -stride2, dst_tmp, -stride, p7, p6, p5, p4); 742 p3 = __lsx_vld(dst_tmp, 0); 743 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); 744 p0 = __lsx_vldx(dst_tmp, stride3); 745 746 q0 = __lsx_vld(dst, 0); 747 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 748 q3 = __lsx_vldx(dst, stride3); 749 750 q4 = __lsx_vld(dst_tmp1, 0); 751 DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); 752 q7 = __lsx_vldx(dst_tmp1, stride3); 753 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 754 755 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 756 if (__lsx_bz_v(flat2)) { 757 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 758 48, p2, p1, p0, q0); 759 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); 760 761 __lsx_vstx(p2, dst, -stride3); 762 __lsx_vstx(p1, dst, -stride2); 763 __lsx_vstx(p0, dst, -stride); 764 __lsx_vst(q0, dst, 0); 765 __lsx_vstx(q1, dst, stride); 766 __lsx_vstx(q2, dst, stride2); 767 } else { 768 dst = dst_tmp - stride3; 769 770 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); 771 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); 772 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); 773 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); 774 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); 775 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); 776 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); 777 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); 778 779 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); 780 781 tmp0_l = p7_l_in << 3; 782 tmp0_l -= p7_l_in; 783 tmp0_l += p6_l_in; 784 tmp0_l += q0_l_in; 785 tmp1_l = p6_l_in + p5_l_in; 786 tmp1_l += p4_l_in; 787 tmp1_l += p3_l_in; 788 tmp1_l += p2_l_in; 789 tmp1_l += p1_l_in; 790 tmp1_l += p0_l_in; 791 tmp1_l += tmp0_l; 792 793 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 794 795 p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); 796 p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); 797 p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); 798 p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); 799 800 p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); 801 p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); 802 p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); 803 p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); 804 q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); 805 806 tmp0_h = p7_h_in << 3; 807 tmp0_h -= p7_h_in; 808 tmp0_h += p6_h_in; 809 tmp0_h += q0_h_in; 810 tmp1_h = p6_h_in + p5_h_in; 811 tmp1_h += p4_h_in; 812 tmp1_h += p3_h_in; 813 tmp1_h += p2_h_in; 814 tmp1_h += p1_h_in; 815 tmp1_h += p0_h_in; 816 tmp1_h += tmp0_h; 817 818 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 819 820 out_l = __lsx_vpickev_b(out_h, out_l); 821 p6 = __lsx_vbitsel_v(p6, out_l, flat2); 822 __lsx_vst(p6, dst, 0); 823 dst += stride; 824 825 /* p5 */ 826 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); 827 tmp0_l = p5_l_in - p6_l_in; 828 tmp0_l += q1_l_in; 829 tmp0_l -= p7_l_in; 830 tmp1_l += tmp0_l; 831 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 832 833 q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); 834 tmp0_h = p5_h_in - p6_h_in; 835 tmp0_h += q1_h_in; 836 tmp0_h -= p7_h_in; 837 tmp1_h += tmp0_h; 838 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 839 840 out_l = __lsx_vpickev_b(out_h, out_l); 841 p5 = __lsx_vbitsel_v(p5, out_l, flat2); 842 __lsx_vst(p5, dst, 0); 843 dst += stride; 844 845 /* p4 */ 846 q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); 847 tmp0_l = p4_l_in - p5_l_in; 848 tmp0_l += q2_l_in; 849 tmp0_l -= p7_l_in; 850 tmp1_l += tmp0_l; 851 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 852 853 q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); 854 tmp0_h = p4_h_in - p5_h_in; 855 tmp0_h += 
q2_h_in; 856 tmp0_h -= p7_h_in; 857 tmp1_h += tmp0_h; 858 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 859 860 out_l = __lsx_vpickev_b(out_h, out_l); 861 p4 = __lsx_vbitsel_v(p4, out_l, flat2); 862 __lsx_vst(p4, dst, 0); 863 dst += stride; 864 865 /* p3 */ 866 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); 867 tmp0_l = p3_l_in - p4_l_in; 868 tmp0_l += q3_l_in; 869 tmp0_l -= p7_l_in; 870 tmp1_l += tmp0_l; 871 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 872 873 q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); 874 tmp0_h = p3_h_in - p4_h_in; 875 tmp0_h += q3_h_in; 876 tmp0_h -= p7_h_in; 877 tmp1_h += tmp0_h; 878 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 879 880 out_l = __lsx_vpickev_b(out_h, out_l); 881 p3 = __lsx_vbitsel_v(p3, out_l, flat2); 882 __lsx_vst(p3, dst, 0); 883 dst += stride; 884 885 /* p2 */ 886 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); 887 filter8 = __lsx_vld(filter48, 0); 888 tmp0_l = p2_l_in - p3_l_in; 889 tmp0_l += q4_l_in; 890 tmp0_l -= p7_l_in; 891 tmp1_l += tmp0_l; 892 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 893 894 q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); 895 tmp0_h = p2_h_in - p3_h_in; 896 tmp0_h += q4_h_in; 897 tmp0_h -= p7_h_in; 898 tmp1_h += tmp0_h; 899 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 900 901 out_l = __lsx_vpickev_b(out_h, out_l); 902 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 903 __lsx_vst(filter8, dst, 0); 904 dst += stride; 905 906 /* p1 */ 907 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); 908 filter8 = __lsx_vld(filter48, 16); 909 tmp0_l = p1_l_in - p2_l_in; 910 tmp0_l += q5_l_in; 911 tmp0_l -= p7_l_in; 912 tmp1_l += tmp0_l; 913 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 914 915 q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); 916 tmp0_h = p1_h_in - p2_h_in; 917 tmp0_h += q5_h_in; 918 tmp0_h -= p7_h_in; 919 tmp1_h += tmp0_h; 920 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 921 922 out_l = __lsx_vpickev_b(out_h, out_l); 923 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 924 __lsx_vst(filter8, dst, 0); 925 dst += stride; 926 927 /* p0 */ 928 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); 929 filter8 = __lsx_vld(filter48, 32); 930 tmp0_l = p0_l_in - p1_l_in; 931 tmp0_l += q6_l_in; 932 tmp0_l -= p7_l_in; 933 tmp1_l += tmp0_l; 934 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 935 936 q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); 937 tmp0_h = p0_h_in - p1_h_in; 938 tmp0_h += q6_h_in; 939 tmp0_h -= p7_h_in; 940 tmp1_h += tmp0_h; 941 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 942 943 out_l = __lsx_vpickev_b(out_h, out_l); 944 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 945 __lsx_vst(filter8, dst, 0); 946 dst += stride; 947 948 /* q0 */ 949 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); 950 filter8 = __lsx_vld(filter48, 48); 951 tmp0_l = q7_l_in - p0_l_in; 952 tmp0_l += q0_l_in; 953 tmp0_l -= p7_l_in; 954 tmp1_l += tmp0_l; 955 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 956 957 q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); 958 tmp0_h = q7_h_in - p0_h_in; 959 tmp0_h += q0_h_in; 960 tmp0_h -= p7_h_in; 961 tmp1_h += tmp0_h; 962 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 963 964 out_l = __lsx_vpickev_b(out_h, out_l); 965 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 966 __lsx_vst(filter8, dst, 0); 967 dst += stride; 968 969 /* q1 */ 970 filter8 = __lsx_vld(filter48, 64); 971 tmp0_l = q7_l_in - q0_l_in; 972 tmp0_l += q1_l_in; 973 tmp0_l -= p6_l_in; 974 tmp1_l += tmp0_l; 975 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 976 977 tmp0_h = q7_h_in - q0_h_in; 978 tmp0_h += q1_h_in; 979 tmp0_h -= p6_h_in; 980 tmp1_h += tmp0_h; 981 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 982 983 out_l 
= __lsx_vpickev_b(out_h, out_l); 984 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 985 __lsx_vst(filter8, dst, 0); 986 dst += stride; 987 988 /* q2 */ 989 filter8 = __lsx_vld(filter48, 80); 990 tmp0_l = q7_l_in - q1_l_in; 991 tmp0_l += q2_l_in; 992 tmp0_l -= p5_l_in; 993 tmp1_l += tmp0_l; 994 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 995 996 tmp0_h = q7_h_in - q1_h_in; 997 tmp0_h += q2_h_in; 998 tmp0_h -= p5_h_in; 999 tmp1_h += tmp0_h; 1000 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 1001 1002 out_l = __lsx_vpickev_b(out_h, out_l); 1003 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 1004 __lsx_vst(filter8, dst, 0); 1005 dst += stride; 1006 1007 /* q3 */ 1008 tmp0_l = q7_l_in - q2_l_in; 1009 tmp0_l += q3_l_in; 1010 tmp0_l -= p4_l_in; 1011 tmp1_l += tmp0_l; 1012 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 1013 1014 tmp0_h = q7_h_in - q2_h_in; 1015 tmp0_h += q3_h_in; 1016 tmp0_h -= p4_h_in; 1017 tmp1_h += tmp0_h; 1018 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 1019 1020 out_l = __lsx_vpickev_b(out_h, out_l); 1021 q3 = __lsx_vbitsel_v(q3, out_l, flat2); 1022 __lsx_vst(q3, dst, 0); 1023 dst += stride; 1024 1025 /* q4 */ 1026 tmp0_l = q7_l_in - q3_l_in; 1027 tmp0_l += q4_l_in; 1028 tmp0_l -= p3_l_in; 1029 tmp1_l += tmp0_l; 1030 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 1031 1032 tmp0_h = q7_h_in - q3_h_in; 1033 tmp0_h += q4_h_in; 1034 tmp0_h -= p3_h_in; 1035 tmp1_h += tmp0_h; 1036 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 1037 1038 out_l = __lsx_vpickev_b(out_h, out_l); 1039 q4 = __lsx_vbitsel_v(q4, out_l, flat2); 1040 __lsx_vst(q4, dst, 0); 1041 dst += stride; 1042 1043 /* q5 */ 1044 tmp0_l = q7_l_in - q4_l_in; 1045 tmp0_l += q5_l_in; 1046 tmp0_l -= p2_l_in; 1047 tmp1_l += tmp0_l; 1048 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 1049 1050 tmp0_h = q7_h_in - q4_h_in; 1051 tmp0_h += q5_h_in; 1052 tmp0_h -= p2_h_in; 1053 tmp1_h += tmp0_h; 1054 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 1055 1056 out_l = __lsx_vpickev_b(out_h, out_l); 1057 q5 = __lsx_vbitsel_v(q5, out_l, flat2); 1058 __lsx_vst(q5, dst, 0); 1059 dst += stride; 1060 1061 /* q6 */ 1062 tmp0_l = q7_l_in - q5_l_in; 1063 tmp0_l += q6_l_in; 1064 tmp0_l -= p1_l_in; 1065 tmp1_l += tmp0_l; 1066 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 1067 1068 tmp0_h = q7_h_in - q5_h_in; 1069 tmp0_h += q6_h_in; 1070 tmp0_h -= p1_h_in; 1071 tmp1_h += tmp0_h; 1072 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 1073 1074 out_l = __lsx_vpickev_b(out_h, out_l); 1075 q6 = __lsx_vbitsel_v(q6, out_l, flat2); 1076 __lsx_vst(q6, dst, 0); 1077 } 1078} 1079 1080void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride, 1081 int32_t b_limit_ptr, 1082 int32_t limit_ptr, 1083 int32_t thresh_ptr) 1084{ 1085 uint8_t filter48[16 * 8] __attribute__ ((aligned(16))); 1086 uint8_t early_exit = 0; 1087 1088 early_exit = vp9_hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], 1089 b_limit_ptr, limit_ptr, thresh_ptr); 1090 1091 if (0 == early_exit) { 1092 vp9_hz_lpf_t16_16w(dst, stride, filter48); 1093 } 1094} 1095 1096void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride, 1097 int32_t b_limit_ptr, 1098 int32_t limit_ptr, 1099 int32_t thresh_ptr) 1100{ 1101 ptrdiff_t stride2 = stride << 1; 1102 ptrdiff_t stride3 = stride2 + stride; 1103 ptrdiff_t stride4 = stride2 << 1; 1104 uint8_t *dst_tmp = dst - stride4; 1105 uint8_t *dst_tmp1 = dst + stride4; 1106 __m128i zero = __lsx_vldi(0); 1107 __m128i flat2, mask, hev, flat, thresh, b_limit, limit; 1108 __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; 1109 __m128i p2_out, p1_out, p0_out, 
q0_out, q1_out, q2_out; 1110 __m128i p0_filter16, p1_filter16; 1111 __m128i p2_filter8, p1_filter8, p0_filter8; 1112 __m128i q0_filter8, q1_filter8, q2_filter8; 1113 __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l; 1114 __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; 1115 __m128i tmp0, tmp1, tmp2; 1116 1117 /* load vector elements */ 1118 DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, 1119 dst, -stride, p3, p2, p1, p0); 1120 q0 = __lsx_vld(dst, 0); 1121 DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); 1122 q3 = __lsx_vldx(dst, stride3); 1123 1124 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1125 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1126 limit = __lsx_vreplgr2vr_b(limit_ptr); 1127 1128 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1129 hev, mask, flat); 1130 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1131 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1132 q1_out); 1133 1134 flat = __lsx_vilvl_d(zero, flat); 1135 1136 /* if flat is zero for all pixels, then no need to calculate other filter */ 1137 if (__lsx_bz_v(flat)) { 1138 __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); 1139 __lsx_vstelm_d(p0_out, dst - stride, 0, 0); 1140 __lsx_vstelm_d(q0_out, dst , 0, 0); 1141 __lsx_vstelm_d(q1_out, dst + stride, 0, 0); 1142 } else { 1143 /* convert 8 bit input data into 16 bit */ 1144 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 1145 p3_l, p2_l, p1_l, p0_l); 1146 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 1147 q0_l, q1_l, q2_l, q3_l); 1148 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, 1149 p2_filter8, p1_filter8, p0_filter8, q0_filter8, 1150 q1_filter8, q2_filter8); 1151 1152 /* convert 16 bit output data into 8 bit */ 1153 DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, 1154 zero, p0_filter8, zero, q0_filter8, p2_filter8, 1155 p1_filter8, p0_filter8, q0_filter8); 1156 DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, 1157 q1_filter8, q2_filter8); 1158 1159 /* store pixel values */ 1160 p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat); 1161 p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat); 1162 p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat); 1163 q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat); 1164 q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat); 1165 q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat); 1166 1167 /* load 16 vector elements */ 1168 DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0, 1169 dst_tmp - stride2, 0, dst_tmp - stride, 0, p7, p6, p5, p4); 1170 DUP4_ARG2(__lsx_vld, dst_tmp1, 0, dst_tmp1 + stride, 0, 1171 dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7); 1172 1173 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 1174 1175 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 1176 if (__lsx_bz_v(flat2)) { 1177 dst -= stride3; 1178 __lsx_vstelm_d(p2_out, dst, 0, 0); 1179 dst += stride; 1180 __lsx_vstelm_d(p1_out, dst, 0, 0); 1181 dst += stride; 1182 __lsx_vstelm_d(p0_out, dst, 0, 0); 1183 dst += stride; 1184 __lsx_vstelm_d(q0_out, dst, 0, 0); 1185 dst += stride; 1186 __lsx_vstelm_d(q1_out, dst, 0, 0); 1187 dst += stride; 1188 __lsx_vstelm_d(q2_out, dst, 0, 0); 1189 } else { 1190 /* LSB(right) 8 pixel operation */ 1191 DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, 1192 p7_l, p6_l, p5_l, p4_l); 1193 DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, 1194 q4_l, q5_l, q6_l, q7_l); 1195 1196 tmp0 = 
__lsx_vslli_h(p7_l, 3); 1197 tmp0 = __lsx_vsub_h(tmp0, p7_l); 1198 tmp0 = __lsx_vadd_h(tmp0, p6_l); 1199 tmp0 = __lsx_vadd_h(tmp0, q0_l); 1200 1201 dst = dst_tmp - stride3; 1202 1203 /* calculation of p6 and p5 */ 1204 tmp1 = __lsx_vadd_h(p6_l, p5_l); 1205 tmp1 = __lsx_vadd_h(tmp1, p4_l); 1206 tmp1 = __lsx_vadd_h(tmp1, p3_l); 1207 tmp1 = __lsx_vadd_h(tmp1, p2_l); 1208 tmp1 = __lsx_vadd_h(tmp1, p1_l); 1209 tmp1 = __lsx_vadd_h(tmp1, p0_l); 1210 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1211 1212 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1213 tmp0 = __lsx_vsub_h(p5_l, p6_l); 1214 tmp0 = __lsx_vadd_h(tmp0, q1_l); 1215 tmp0 = __lsx_vsub_h(tmp0, p7_l); 1216 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1217 1218 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1219 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1220 p1_filter16, p0_filter16, p1_filter16); 1221 p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2); 1222 p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2); 1223 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1224 dst += stride; 1225 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1226 dst += stride; 1227 1228 /* calculation of p4 and p3 */ 1229 tmp0 = __lsx_vsub_h(p4_l, p5_l); 1230 tmp0 = __lsx_vadd_h(tmp0, q2_l); 1231 tmp0 = __lsx_vsub_h(tmp0, p7_l); 1232 tmp2 = __lsx_vsub_h(p3_l, p4_l); 1233 tmp2 = __lsx_vadd_h(tmp2, q3_l); 1234 tmp2 = __lsx_vsub_h(tmp2, p7_l); 1235 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1236 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1237 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1238 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1239 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1240 p1_filter16, p0_filter16, p1_filter16); 1241 p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2); 1242 p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2); 1243 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1244 dst += stride; 1245 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1246 dst += stride; 1247 1248 /* calculation of p2 and p1 */ 1249 tmp0 = __lsx_vsub_h(p2_l, p3_l); 1250 tmp0 = __lsx_vadd_h(tmp0, q4_l); 1251 tmp0 = __lsx_vsub_h(tmp0, p7_l); 1252 tmp2 = __lsx_vsub_h(p1_l, p2_l); 1253 tmp2 = __lsx_vadd_h(tmp2, q5_l); 1254 tmp2 = __lsx_vsub_h(tmp2, p7_l); 1255 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1256 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1257 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1258 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1259 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1260 p1_filter16, p0_filter16, p1_filter16); 1261 p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2); 1262 p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2); 1263 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1264 dst += stride; 1265 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1266 dst += stride; 1267 1268 /* calculation of p0 and q0 */ 1269 tmp0 = __lsx_vsub_h(p0_l, p1_l); 1270 tmp0 = __lsx_vadd_h(tmp0, q6_l); 1271 tmp0 = __lsx_vsub_h(tmp0, p7_l); 1272 tmp2 = __lsx_vsub_h(q7_l, p0_l); 1273 tmp2 = __lsx_vadd_h(tmp2, q0_l); 1274 tmp2 = __lsx_vsub_h(tmp2, p7_l); 1275 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1276 p0_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4); 1277 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1278 p1_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4); 1279 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1280 p1_filter16, p0_filter16, p1_filter16); 1281 p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2); 1282 p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2); 1283 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1284 dst += stride; 1285 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1286 dst += stride; 1287 1288 /* calculation of q1 and q2 */ 1289 tmp0 = __lsx_vsub_h(q7_l, q0_l); 1290 
tmp0 = __lsx_vadd_h(tmp0, q1_l); 1291 tmp0 = __lsx_vsub_h(tmp0, p6_l); 1292 tmp2 = __lsx_vsub_h(q7_l, q1_l); 1293 tmp2 = __lsx_vadd_h(tmp2, q2_l); 1294 tmp2 = __lsx_vsub_h(tmp2, p5_l); 1295 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1296 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1297 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1298 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1299 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1300 p1_filter16, p0_filter16, p1_filter16); 1301 p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2); 1302 p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2); 1303 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1304 dst += stride; 1305 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1306 dst += stride; 1307 1308 /* calculation of q3 and q4 */ 1309 tmp0 = __lsx_vsub_h(q7_l, q2_l); 1310 tmp0 = __lsx_vadd_h(tmp0, q3_l); 1311 tmp0 = __lsx_vsub_h(tmp0, p4_l); 1312 tmp2 = __lsx_vsub_h(q7_l, q3_l); 1313 tmp2 = __lsx_vadd_h(tmp2, q4_l); 1314 tmp2 = __lsx_vsub_h(tmp2, p3_l); 1315 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1316 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1317 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1318 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1319 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1320 p1_filter16, p0_filter16, p1_filter16); 1321 p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2); 1322 p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2); 1323 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1324 dst += stride; 1325 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1326 dst += stride; 1327 1328 /* calculation of q5 and q6 */ 1329 tmp0 = __lsx_vsub_h(q7_l, q4_l); 1330 tmp0 = __lsx_vadd_h(tmp0, q5_l); 1331 tmp0 = __lsx_vsub_h(tmp0, p2_l); 1332 tmp2 = __lsx_vsub_h(q7_l, q5_l); 1333 tmp2 = __lsx_vadd_h(tmp2, q6_l); 1334 tmp2 = __lsx_vsub_h(tmp2, p1_l); 1335 tmp1 = __lsx_vadd_h(tmp1, tmp0); 1336 p0_filter16 = __lsx_vsrari_h(tmp1, 4); 1337 tmp1 = __lsx_vadd_h(tmp1, tmp2); 1338 p1_filter16 = __lsx_vsrari_h(tmp1, 4); 1339 DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, 1340 p1_filter16, p0_filter16, p1_filter16); 1341 p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2); 1342 p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2); 1343 __lsx_vstelm_d(p0_filter16, dst, 0, 0); 1344 dst += stride; 1345 __lsx_vstelm_d(p1_filter16, dst, 0, 0); 1346 } 1347 } 1348} 1349 1350void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride, 1351 int32_t b_limit_ptr, 1352 int32_t limit_ptr, 1353 int32_t thresh_ptr) 1354{ 1355 ptrdiff_t stride2 = stride << 1; 1356 ptrdiff_t stride3 = stride2 + stride; 1357 ptrdiff_t stride4 = stride2 << 1; 1358 uint8_t *dst_tmp1 = dst - 4; 1359 uint8_t *dst_tmp2 = dst_tmp1 + stride4; 1360 __m128i mask, hev, flat, limit, thresh, b_limit; 1361 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1362 __m128i vec0, vec1, vec2, vec3; 1363 1364 p3 = __lsx_vld(dst_tmp1, 0); 1365 DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, p2, p1); 1366 p0 = __lsx_vldx(dst_tmp1, stride3); 1367 q0 = __lsx_vld(dst_tmp2, 0); 1368 DUP2_ARG2(__lsx_vldx, dst_tmp2, stride, dst_tmp2, stride2, q1, q2); 1369 q3 = __lsx_vldx(dst_tmp2, stride3); 1370 1371 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1372 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1373 limit = __lsx_vreplgr2vr_b(limit_ptr); 1374 1375 LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, 1376 p3, p2, p1, p0, q0, q1, q2, q3); 1377 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1378 hev, mask, flat); 1379 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); 1380 DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); 1381 vec2 = 
__lsx_vilvl_h(vec1, vec0); 1382 vec3 = __lsx_vilvh_h(vec1, vec0); 1383 1384 dst -= 2; 1385 __lsx_vstelm_w(vec2, dst, 0, 0); 1386 __lsx_vstelm_w(vec2, dst + stride, 0, 1); 1387 __lsx_vstelm_w(vec2, dst + stride2, 0, 2); 1388 __lsx_vstelm_w(vec2, dst + stride3, 0, 3); 1389 dst += stride4; 1390 __lsx_vstelm_w(vec3, dst, 0, 0); 1391 __lsx_vstelm_w(vec3, dst + stride, 0, 1); 1392 __lsx_vstelm_w(vec3, dst + stride2, 0, 2); 1393 __lsx_vstelm_w(vec3, dst + stride3, 0, 3); 1394} 1395 1396void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride, 1397 int32_t b_limit_ptr, 1398 int32_t limit_ptr, 1399 int32_t thresh_ptr) 1400{ 1401 ptrdiff_t stride2 = stride << 1; 1402 ptrdiff_t stride3 = stride2 + stride; 1403 ptrdiff_t stride4 = stride2 << 1; 1404 uint8_t *dst_tmp = dst - 4; 1405 __m128i mask, hev, flat; 1406 __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; 1407 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1408 __m128i row0, row1, row2, row3, row4, row5, row6, row7; 1409 __m128i row8, row9, row10, row11, row12, row13, row14, row15; 1410 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 1411 1412 row0 = __lsx_vld(dst_tmp, 0); 1413 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row1, row2); 1414 row3 = __lsx_vldx(dst_tmp, stride3); 1415 dst_tmp += stride4; 1416 row4 = __lsx_vld(dst_tmp, 0); 1417 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); 1418 row7 = __lsx_vldx(dst_tmp, stride3); 1419 dst_tmp += stride4; 1420 row8 = __lsx_vld(dst_tmp, 0); 1421 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10); 1422 row11 = __lsx_vldx(dst_tmp, stride3); 1423 dst_tmp += stride4; 1424 row12 = __lsx_vld(dst_tmp, 0); 1425 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); 1426 row15 = __lsx_vldx(dst_tmp, stride3); 1427 1428 LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, 1429 row8, row9, row10, row11, row12, row13, row14, row15, 1430 p3, p2, p1, p0, q0, q1, q2, q3); 1431 1432 thresh0 = __lsx_vreplgr2vr_b(thresh_ptr); 1433 thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 1434 thresh0 = __lsx_vilvl_d(thresh1, thresh0); 1435 1436 b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr); 1437 b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 1438 b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); 1439 1440 limit0 = __lsx_vreplgr2vr_b(limit_ptr); 1441 limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8); 1442 limit0 = __lsx_vilvl_d(limit1, limit0); 1443 1444 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, 1445 hev, mask, flat); 1446 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); 1447 DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); 1448 tmp2 = __lsx_vilvl_h(tmp1, tmp0); 1449 tmp3 = __lsx_vilvh_h(tmp1, tmp0); 1450 DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); 1451 tmp4 = __lsx_vilvl_h(tmp1, tmp0); 1452 tmp5 = __lsx_vilvh_h(tmp1, tmp0); 1453 1454 dst -= 2; 1455 __lsx_vstelm_w(tmp2, dst, 0, 0); 1456 __lsx_vstelm_w(tmp2, dst + stride, 0, 1); 1457 __lsx_vstelm_w(tmp2, dst + stride2, 0, 2); 1458 __lsx_vstelm_w(tmp2, dst + stride3, 0, 3); 1459 dst += stride4; 1460 __lsx_vstelm_w(tmp3, dst, 0, 0); 1461 __lsx_vstelm_w(tmp3, dst + stride, 0, 1); 1462 __lsx_vstelm_w(tmp3, dst + stride2, 0, 2); 1463 __lsx_vstelm_w(tmp3, dst + stride3, 0, 3); 1464 dst += stride4; 1465 __lsx_vstelm_w(tmp4, dst, 0, 0); 1466 __lsx_vstelm_w(tmp4, dst + stride, 0, 1); 1467 __lsx_vstelm_w(tmp4, dst + stride2, 0, 2); 1468 __lsx_vstelm_w(tmp4, dst + stride3, 0, 3); 1469 dst += stride4; 1470 __lsx_vstelm_w(tmp5, dst, 0, 0); 1471 
__lsx_vstelm_w(tmp5, dst + stride, 0, 1); 1472 __lsx_vstelm_w(tmp5, dst + stride2, 0, 2); 1473 __lsx_vstelm_w(tmp5, dst + stride3, 0, 3); 1474} 1475 1476void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride, 1477 int32_t b_limit_ptr, 1478 int32_t limit_ptr, 1479 int32_t thresh_ptr) 1480{ 1481 ptrdiff_t stride2 = stride << 1; 1482 ptrdiff_t stride3 = stride2 + stride; 1483 ptrdiff_t stride4 = stride2 << 1; 1484 uint8_t *dst_tmp = dst - 4; 1485 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1486 __m128i p1_out, p0_out, q0_out, q1_out; 1487 __m128i flat, mask, hev, thresh, b_limit, limit; 1488 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1489 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 1490 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 1491 __m128i vec0, vec1, vec2, vec3, vec4; 1492 __m128i zero = __lsx_vldi(0); 1493 1494 /* load vector elements */ 1495 p3 = __lsx_vld(dst_tmp, 0); 1496 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); 1497 p0 = __lsx_vldx(dst_tmp, stride3); 1498 dst_tmp += stride4; 1499 q0 = __lsx_vld(dst_tmp, 0); 1500 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); 1501 q3 = __lsx_vldx(dst_tmp, stride3); 1502 1503 LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, 1504 p3, p2, p1, p0, q0, q1, q2, q3); 1505 1506 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1507 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1508 limit = __lsx_vreplgr2vr_b(limit_ptr); 1509 1510 /* mask and hev */ 1511 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1512 hev, mask, flat); 1513 /* flat4 */ 1514 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1515 /* filter4 */ 1516 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1517 q1_out); 1518 1519 flat = __lsx_vilvl_d(zero, flat); 1520 1521 /* if flat is zero for all pixels, then no need to calculate other filter */ 1522 if (__lsx_bz_v(flat)) { 1523 /* Store 4 pixels p1-_q1 */ 1524 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1525 vec2 = __lsx_vilvl_h(vec1, vec0); 1526 vec3 = __lsx_vilvh_h(vec1, vec0); 1527 1528 dst -= 2; 1529 __lsx_vstelm_w(vec2, dst, 0, 0); 1530 __lsx_vstelm_w(vec2, dst + stride, 0, 1); 1531 __lsx_vstelm_w(vec2, dst + stride2, 0, 2); 1532 __lsx_vstelm_w(vec2, dst + stride3, 0, 3); 1533 dst += stride4; 1534 __lsx_vstelm_w(vec3, dst, 0, 0); 1535 __lsx_vstelm_w(vec3, dst + stride, 0, 1); 1536 __lsx_vstelm_w(vec3, dst + stride2, 0, 2); 1537 __lsx_vstelm_w(vec3, dst + stride3, 0, 3); 1538 } else { 1539 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 1540 p3_l, p2_l, p1_l, p0_l); 1541 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 1542 q0_l, q1_l, q2_l, q3_l); 1543 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1544 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1545 /* convert 16 bit output data into 8 bit */ 1546 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, 1547 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, 1548 q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, 1549 q0_filt8_l); 1550 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, 1551 q2_filt8_l, q1_filt8_l, q2_filt8_l); 1552 1553 /* store pixel values */ 1554 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 1555 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 1556 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 1557 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 1558 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 1559 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 1560 1561 /* Store 6 pixels 
p2-_q2 */ 1562 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 1563 vec2 = __lsx_vilvl_h(vec1, vec0); 1564 vec3 = __lsx_vilvh_h(vec1, vec0); 1565 vec4 = __lsx_vilvl_b(q2, q1); 1566 1567 dst -= 3; 1568 __lsx_vstelm_w(vec2, dst, 0, 0); 1569 __lsx_vstelm_h(vec4, dst, 4, 0); 1570 dst += stride; 1571 __lsx_vstelm_w(vec2, dst, 0, 1); 1572 __lsx_vstelm_h(vec4, dst, 4, 1); 1573 dst += stride; 1574 __lsx_vstelm_w(vec2, dst, 0, 2); 1575 __lsx_vstelm_h(vec4, dst, 4, 2); 1576 dst += stride; 1577 __lsx_vstelm_w(vec2, dst, 0, 3); 1578 __lsx_vstelm_h(vec4, dst, 4, 3); 1579 dst += stride; 1580 __lsx_vstelm_w(vec3, dst, 0, 0); 1581 __lsx_vstelm_h(vec4, dst, 4, 4); 1582 dst += stride; 1583 __lsx_vstelm_w(vec3, dst, 0, 1); 1584 __lsx_vstelm_h(vec4, dst, 4, 5); 1585 dst += stride; 1586 __lsx_vstelm_w(vec3, dst, 0, 2); 1587 __lsx_vstelm_h(vec4, dst, 4, 6); 1588 dst += stride; 1589 __lsx_vstelm_w(vec3, dst, 0, 3); 1590 __lsx_vstelm_h(vec4, dst, 4, 7); 1591 } 1592} 1593 1594void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride, 1595 int32_t b_limit_ptr, 1596 int32_t limit_ptr, 1597 int32_t thresh_ptr) 1598{ 1599 ptrdiff_t stride2 = stride << 1; 1600 ptrdiff_t stride3 = stride2 + stride; 1601 ptrdiff_t stride4 = stride2 << 1; 1602 uint8_t *dst_tmp = dst - 4; 1603 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1604 __m128i p1_out, p0_out, q0_out, q1_out; 1605 __m128i flat, mask, hev, thresh, b_limit, limit; 1606 __m128i row4, row5, row6, row7, row12, row13, row14, row15; 1607 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1608 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 1609 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 1610 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 1611 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 1612 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 1613 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1614 __m128i zero = __lsx_vldi(0); 1615 1616 p0 = __lsx_vld(dst_tmp, 0); 1617 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); 1618 p3 = __lsx_vldx(dst_tmp, stride3); 1619 dst_tmp += stride4; 1620 row4 = __lsx_vld(dst_tmp, 0); 1621 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); 1622 row7 = __lsx_vldx(dst_tmp, stride3); 1623 dst_tmp += stride4; 1624 q3 = __lsx_vld(dst_tmp, 0); 1625 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); 1626 q0 = __lsx_vldx(dst_tmp, stride3); 1627 dst_tmp += stride4; 1628 row12 = __lsx_vld(dst_tmp, 0); 1629 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); 1630 row15 = __lsx_vldx(dst_tmp, stride3); 1631 1632 /* transpose 16x8 matrix into 8x16 */ 1633 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, 1634 q3, q2, q1, q0, row12, row13, row14, row15, 1635 p3, p2, p1, p0, q0, q1, q2, q3); 1636 1637 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1638 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 1639 thresh = __lsx_vilvl_d(vec0, thresh); 1640 1641 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1642 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 1643 b_limit = __lsx_vilvl_d(vec0, b_limit); 1644 1645 limit = __lsx_vreplgr2vr_b(limit_ptr); 1646 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8); 1647 limit = __lsx_vilvl_d(vec0, limit); 1648 1649 /* mask and hev */ 1650 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1651 hev, mask, flat); 1652 /* flat4 */ 1653 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1654 /* filter4 */ 1655 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1656 q1_out); 1657 1658 /* if flat is zero for all pixels, then no need 
to calculate other filter */ 1659 if (__lsx_bz_v(flat)) { 1660 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1661 vec2 = __lsx_vilvl_h(vec1, vec0); 1662 vec3 = __lsx_vilvh_h(vec1, vec0); 1663 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1664 vec4 = __lsx_vilvl_h(vec1, vec0); 1665 vec5 = __lsx_vilvh_h(vec1, vec0); 1666 1667 dst -= 2; 1668 __lsx_vstelm_w(vec2, dst, 0, 0); 1669 __lsx_vstelm_w(vec2, dst + stride, 0, 1); 1670 __lsx_vstelm_w(vec2, dst + stride2, 0, 2); 1671 __lsx_vstelm_w(vec2, dst + stride3, 0, 3); 1672 dst += stride4; 1673 __lsx_vstelm_w(vec3, dst, 0, 0); 1674 __lsx_vstelm_w(vec3, dst + stride, 0, 1); 1675 __lsx_vstelm_w(vec3, dst + stride2, 0, 2); 1676 __lsx_vstelm_w(vec3, dst + stride3, 0, 3); 1677 dst += stride4; 1678 __lsx_vstelm_w(vec4, dst, 0, 0); 1679 __lsx_vstelm_w(vec4, dst + stride, 0, 1); 1680 __lsx_vstelm_w(vec4, dst + stride2, 0, 2); 1681 __lsx_vstelm_w(vec4, dst + stride3, 0, 3); 1682 dst += stride4; 1683 __lsx_vstelm_w(vec5, dst, 0, 0); 1684 __lsx_vstelm_w(vec5, dst + stride, 0, 1); 1685 __lsx_vstelm_w(vec5, dst + stride2, 0, 2); 1686 __lsx_vstelm_w(vec5, dst + stride3, 0, 3); 1687 } else { 1688 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 1689 p3_l, p2_l, p1_l, p0_l); 1690 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 1691 q0_l, q1_l, q2_l, q3_l); 1692 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1693 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1694 1695 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 1696 p3_h, p2_h, p1_h, p0_h); 1697 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 1698 q0_h, q1_h, q2_h, q3_h); 1699 1700 /* filter8 */ 1701 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 1702 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 1703 1704 /* convert 16 bit output data into 8 bit */ 1705 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, 1706 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, 1707 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l); 1708 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, 1709 q2_filt8_l, q1_filt8_l, q2_filt8_l); 1710 1711 /* store pixel values */ 1712 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 1713 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 1714 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 1715 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 1716 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 1717 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 1718 1719 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 1720 vec3 = __lsx_vilvl_h(vec1, vec0); 1721 vec4 = __lsx_vilvh_h(vec1, vec0); 1722 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); 1723 vec6 = __lsx_vilvl_h(vec1, vec0); 1724 vec7 = __lsx_vilvh_h(vec1, vec0); 1725 vec2 = __lsx_vilvl_b(q2, q1); 1726 vec5 = __lsx_vilvh_b(q2, q1); 1727 1728 dst -= 3; 1729 __lsx_vstelm_w(vec3, dst, 0, 0); 1730 __lsx_vstelm_h(vec2, dst, 4, 0); 1731 dst += stride; 1732 __lsx_vstelm_w(vec3, dst, 0, 1); 1733 __lsx_vstelm_h(vec2, dst, 4, 1); 1734 dst += stride; 1735 __lsx_vstelm_w(vec3, dst, 0, 2); 1736 __lsx_vstelm_h(vec2, dst, 4, 2); 1737 dst += stride; 1738 __lsx_vstelm_w(vec3, dst, 0, 3); 1739 __lsx_vstelm_h(vec2, dst, 4, 3); 1740 dst += stride; 1741 __lsx_vstelm_w(vec4, dst, 0, 0); 1742 __lsx_vstelm_h(vec2, dst, 4, 4); 1743 dst += stride; 1744 __lsx_vstelm_w(vec4, dst, 0, 1); 1745 __lsx_vstelm_h(vec2, dst, 4, 5); 1746 dst += stride; 1747 
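        /* Each of the 16 rows receives its six selected pixels around the
         * vertical edge: a 4-byte word (p2 p1 p0 q0) stored at column
         * offset 0 and a 2-byte halfword (q1 q2) at column offset 4.
         * Rows 6..15 below follow the same pattern. */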
__lsx_vstelm_w(vec4, dst, 0, 2); 1748 __lsx_vstelm_h(vec2, dst, 4, 6); 1749 dst += stride; 1750 __lsx_vstelm_w(vec4, dst, 0, 3); 1751 __lsx_vstelm_h(vec2, dst, 4, 7); 1752 dst += stride; 1753 __lsx_vstelm_w(vec6, dst, 0, 0); 1754 __lsx_vstelm_h(vec5, dst, 4, 0); 1755 dst += stride; 1756 __lsx_vstelm_w(vec6, dst, 0, 1); 1757 __lsx_vstelm_h(vec5, dst, 4, 1); 1758 dst += stride; 1759 __lsx_vstelm_w(vec6, dst, 0, 2); 1760 __lsx_vstelm_h(vec5, dst, 4, 2); 1761 dst += stride; 1762 __lsx_vstelm_w(vec6, dst, 0, 3); 1763 __lsx_vstelm_h(vec5, dst, 4, 3); 1764 dst += stride; 1765 __lsx_vstelm_w(vec7, dst, 0, 0); 1766 __lsx_vstelm_h(vec5, dst, 4, 4); 1767 dst += stride; 1768 __lsx_vstelm_w(vec7, dst, 0, 1); 1769 __lsx_vstelm_h(vec5, dst, 4, 5); 1770 dst += stride; 1771 __lsx_vstelm_w(vec7, dst, 0, 2); 1772 __lsx_vstelm_h(vec5, dst, 4, 6); 1773 dst += stride; 1774 __lsx_vstelm_w(vec7, dst, 0, 3); 1775 __lsx_vstelm_h(vec5, dst, 4, 7); 1776 } 1777} 1778 1779void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride, 1780 int32_t b_limit_ptr, 1781 int32_t limit_ptr, 1782 int32_t thresh_ptr) 1783{ 1784 ptrdiff_t stride2 = stride << 1; 1785 ptrdiff_t stride3 = stride2 + stride; 1786 ptrdiff_t stride4 = stride2 << 1; 1787 uint8_t *dst_tmp = dst - 4; 1788 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1789 __m128i p1_out, p0_out, q0_out, q1_out; 1790 __m128i flat, mask, hev, thresh, b_limit, limit; 1791 __m128i row4, row5, row6, row7, row12, row13, row14, row15; 1792 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 1793 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 1794 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 1795 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1796 __m128i zero = __lsx_vldi(0); 1797 1798 p0 = __lsx_vld(dst_tmp, 0); 1799 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); 1800 p3 = __lsx_vldx(dst_tmp, stride3); 1801 dst_tmp += stride4; 1802 row4 = __lsx_vld(dst_tmp, 0); 1803 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); 1804 row7 = __lsx_vldx(dst_tmp, stride3); 1805 dst_tmp += stride4; 1806 q3 = __lsx_vld(dst_tmp, 0); 1807 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); 1808 q0 = __lsx_vldx(dst_tmp, stride3); 1809 dst_tmp += stride4; 1810 row12 = __lsx_vld(dst_tmp, 0); 1811 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); 1812 row15 = __lsx_vldx(dst_tmp, stride3); 1813 1814 /* transpose 16x8 matrix into 8x16 */ 1815 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, 1816 q3, q2, q1, q0, row12, row13, row14, row15, 1817 p3, p2, p1, p0, q0, q1, q2, q3); 1818 1819 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1820 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 1821 thresh = __lsx_vilvl_d(vec0, thresh); 1822 1823 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1824 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 1825 b_limit = __lsx_vilvl_d(vec0, b_limit); 1826 1827 limit = __lsx_vreplgr2vr_b(limit_ptr); 1828 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8); 1829 limit = __lsx_vilvl_d(vec0, limit); 1830 1831 /* mask and hev */ 1832 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1833 hev, mask, flat); 1834 /* flat4 */ 1835 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1836 /* filter4 */ 1837 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1838 q1_out); 1839 1840 flat = __lsx_vilvl_d(zero, flat); 1841 1842 /* if flat is zero for all pixels, then no need to calculate other filter */ 1843 if (__lsx_bz_v(flat)) { 1844 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, 
vec1); 1845 vec2 = __lsx_vilvl_h(vec1, vec0); 1846 vec3 = __lsx_vilvh_h(vec1, vec0); 1847 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1848 vec4 = __lsx_vilvl_h(vec1, vec0); 1849 vec5 = __lsx_vilvh_h(vec1, vec0); 1850 1851 dst -= 2; 1852 __lsx_vstelm_w(vec2, dst, 0, 0); 1853 __lsx_vstelm_w(vec2, dst + stride, 0, 1); 1854 __lsx_vstelm_w(vec2, dst + stride2, 0, 2); 1855 __lsx_vstelm_w(vec2, dst + stride3, 0, 3); 1856 dst += stride4; 1857 __lsx_vstelm_w(vec3, dst, 0, 0); 1858 __lsx_vstelm_w(vec3, dst + stride, 0, 1); 1859 __lsx_vstelm_w(vec3, dst + stride2, 0, 2); 1860 __lsx_vstelm_w(vec3, dst + stride3, 0, 3); 1861 dst += stride4; 1862 __lsx_vstelm_w(vec4, dst, 0, 0); 1863 __lsx_vstelm_w(vec4, dst + stride, 0, 1); 1864 __lsx_vstelm_w(vec4, dst + stride2, 0, 2); 1865 __lsx_vstelm_w(vec4, dst + stride3, 0, 3); 1866 dst += stride4; 1867 __lsx_vstelm_w(vec5, dst, 0, 0); 1868 __lsx_vstelm_w(vec5, dst + stride, 0, 1); 1869 __lsx_vstelm_w(vec5, dst + stride2, 0, 2); 1870 __lsx_vstelm_w(vec5, dst + stride3, 0, 3); 1871 } else { 1872 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 1873 p3_l, p2_l, p1_l, p0_l); 1874 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 1875 q0_l, q1_l, q2_l, q3_l); 1876 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 1877 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 1878 1879 /* convert 16 bit output data into 8 bit */ 1880 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, 1881 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l, 1882 p1_filt8_l, p0_filt8_l, q0_filt8_l); 1883 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, 1884 q1_filt8_l, q2_filt8_l); 1885 1886 /* store pixel values */ 1887 p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 1888 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 1889 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 1890 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 1891 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 1892 q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 1893 1894 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 1895 vec3 = __lsx_vilvl_h(vec1, vec0); 1896 vec4 = __lsx_vilvh_h(vec1, vec0); 1897 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); 1898 vec6 = __lsx_vilvl_h(vec1, vec0); 1899 vec7 = __lsx_vilvh_h(vec1, vec0); 1900 vec2 = __lsx_vilvl_b(q2, q1); 1901 vec5 = __lsx_vilvh_b(q2, q1); 1902 1903 dst -= 3; 1904 __lsx_vstelm_w(vec3, dst, 0, 0); 1905 __lsx_vstelm_h(vec2, dst, 4, 0); 1906 dst += stride; 1907 __lsx_vstelm_w(vec3, dst, 0, 1); 1908 __lsx_vstelm_h(vec2, dst, 4, 1); 1909 dst += stride; 1910 __lsx_vstelm_w(vec3, dst, 0, 2); 1911 __lsx_vstelm_h(vec2, dst, 4, 2); 1912 dst += stride; 1913 __lsx_vstelm_w(vec3, dst, 0, 3); 1914 __lsx_vstelm_h(vec2, dst, 4, 3); 1915 dst += stride; 1916 __lsx_vstelm_w(vec4, dst, 0, 0); 1917 __lsx_vstelm_h(vec2, dst, 4, 4); 1918 dst += stride; 1919 __lsx_vstelm_w(vec4, dst, 0, 1); 1920 __lsx_vstelm_h(vec2, dst, 4, 5); 1921 dst += stride; 1922 __lsx_vstelm_w(vec4, dst, 0, 2); 1923 __lsx_vstelm_h(vec2, dst, 4, 6); 1924 dst += stride; 1925 __lsx_vstelm_w(vec4, dst, 0, 3); 1926 __lsx_vstelm_h(vec2, dst, 4, 7); 1927 dst += stride; 1928 __lsx_vstelm_w(vec6, dst, 0, 0); 1929 __lsx_vstelm_h(vec5, dst, 4, 0); 1930 dst += stride; 1931 __lsx_vstelm_w(vec6, dst, 0, 1); 1932 __lsx_vstelm_h(vec5, dst, 4, 1); 1933 dst += stride; 1934 __lsx_vstelm_w(vec6, dst, 0, 2); 1935 __lsx_vstelm_h(vec5, dst, 4, 2); 1936 dst += stride; 1937 __lsx_vstelm_w(vec6, dst, 
0, 3); 1938 __lsx_vstelm_h(vec5, dst, 4, 3); 1939 dst += stride; 1940 __lsx_vstelm_w(vec7, dst, 0, 0); 1941 __lsx_vstelm_h(vec5, dst, 4, 4); 1942 dst += stride; 1943 __lsx_vstelm_w(vec7, dst, 0, 1); 1944 __lsx_vstelm_h(vec5, dst, 4, 5); 1945 dst += stride; 1946 __lsx_vstelm_w(vec7, dst, 0, 2); 1947 __lsx_vstelm_h(vec5, dst, 4, 6); 1948 dst += stride; 1949 __lsx_vstelm_w(vec7, dst, 0, 3); 1950 __lsx_vstelm_h(vec5, dst, 4, 7); 1951 } 1952} 1953 1954void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride, 1955 int32_t b_limit_ptr, 1956 int32_t limit_ptr, 1957 int32_t thresh_ptr) 1958{ 1959 ptrdiff_t stride2 = stride << 1; 1960 ptrdiff_t stride3 = stride2 + stride; 1961 ptrdiff_t stride4 = stride2 << 1; 1962 uint8_t *dst_tmp = dst - 4; 1963 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 1964 __m128i p1_out, p0_out, q0_out, q1_out; 1965 __m128i flat, mask, hev, thresh, b_limit, limit; 1966 __m128i row4, row5, row6, row7, row12, row13, row14, row15; 1967 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 1968 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 1969 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 1970 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1971 __m128i zero = __lsx_vldi(0); 1972 1973 p0 = __lsx_vld(dst_tmp, 0); 1974 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); 1975 p3 = __lsx_vldx(dst_tmp, stride3); 1976 dst_tmp += stride4; 1977 row4 = __lsx_vld(dst_tmp, 0); 1978 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); 1979 row7 = __lsx_vldx(dst_tmp, stride3); 1980 dst_tmp += stride4; 1981 q3 = __lsx_vld(dst_tmp, 0); 1982 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); 1983 q0 = __lsx_vldx(dst_tmp, stride3); 1984 dst_tmp += stride4; 1985 row12 = __lsx_vld(dst_tmp, 0); 1986 DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); 1987 row15 = __lsx_vldx(dst_tmp, stride3); 1988 1989 /* transpose 16x8 matrix into 8x16 */ 1990 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, 1991 q3, q2, q1, q0, row12, row13, row14, row15, 1992 p3, p2, p1, p0, q0, q1, q2, q3); 1993 1994 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 1995 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8); 1996 thresh = __lsx_vilvl_d(vec0, thresh); 1997 1998 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 1999 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8); 2000 b_limit = __lsx_vilvl_d(vec0, b_limit); 2001 2002 limit = __lsx_vreplgr2vr_b(limit_ptr); 2003 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8); 2004 limit = __lsx_vilvl_d(vec0, limit); 2005 2006 /* mask and hev */ 2007 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 2008 hev, mask, flat); 2009 /* flat4 */ 2010 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 2011 /* filter4 */ 2012 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 2013 q1_out); 2014 2015 flat = __lsx_vilvh_d(flat, zero); 2016 2017 /* if flat is zero for all pixels, then no need to calculate other filter */ 2018 if (__lsx_bz_v(flat)) { 2019 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2020 vec2 = __lsx_vilvl_h(vec1, vec0); 2021 vec3 = __lsx_vilvh_h(vec1, vec0); 2022 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2023 vec4 = __lsx_vilvl_h(vec1, vec0); 2024 vec5 = __lsx_vilvh_h(vec1, vec0); 2025 2026 dst -= 2; 2027 __lsx_vstelm_w(vec2, dst, 0, 0); 2028 __lsx_vstelm_w(vec2, dst + stride, 0, 1); 2029 __lsx_vstelm_w(vec2, dst + stride2, 0, 2); 2030 __lsx_vstelm_w(vec2, dst + stride3, 0, 3); 2031 dst += stride4; 2032 __lsx_vstelm_w(vec3, dst, 0, 0); 2033 
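        /* 4-tap-only path: with flat zero for every row only p1 p0 q0 q1
         * change, so each of the 16 rows is updated with a single 4-byte
         * word store at dst - 2. */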
__lsx_vstelm_w(vec3, dst + stride, 0, 1); 2034 __lsx_vstelm_w(vec3, dst + stride2, 0, 2); 2035 __lsx_vstelm_w(vec3, dst + stride3, 0, 3); 2036 dst += stride4; 2037 __lsx_vstelm_w(vec4, dst, 0, 0); 2038 __lsx_vstelm_w(vec4, dst + stride, 0, 1); 2039 __lsx_vstelm_w(vec4, dst + stride2, 0, 2); 2040 __lsx_vstelm_w(vec4, dst + stride3, 0, 3); 2041 dst += stride4; 2042 __lsx_vstelm_w(vec5, dst, 0, 0); 2043 __lsx_vstelm_w(vec5, dst + stride, 0, 1); 2044 __lsx_vstelm_w(vec5, dst + stride2, 0, 2); 2045 __lsx_vstelm_w(vec5, dst + stride3, 0, 3); 2046 } else { 2047 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 2048 p3_h, p2_h, p1_h, p0_h); 2049 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 2050 q0_h, q1_h, q2_h, q3_h); 2051 2052 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 2053 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 2054 2055 /* convert 16 bit output data into 8 bit */ 2056 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h, 2057 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h, 2058 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h); 2059 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h, 2060 q2_filt8_h, q1_filt8_h, q2_filt8_h); 2061 2062 /* store pixel values */ 2063 p2 = __lsx_vbitsel_v(p2, p2_filt8_h, flat); 2064 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat); 2065 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat); 2066 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat); 2067 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat); 2068 q2 = __lsx_vbitsel_v(q2, q2_filt8_h, flat); 2069 2070 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 2071 vec3 = __lsx_vilvl_h(vec1, vec0); 2072 vec4 = __lsx_vilvh_h(vec1, vec0); 2073 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); 2074 vec6 = __lsx_vilvl_h(vec1, vec0); 2075 vec7 = __lsx_vilvh_h(vec1, vec0); 2076 vec2 = __lsx_vilvl_b(q2, q1); 2077 vec5 = __lsx_vilvh_b(q2, q1); 2078 2079 dst -= 3; 2080 __lsx_vstelm_w(vec3, dst, 0, 0); 2081 __lsx_vstelm_h(vec2, dst, 4, 0); 2082 dst += stride; 2083 __lsx_vstelm_w(vec3, dst, 0, 1); 2084 __lsx_vstelm_h(vec2, dst, 4, 1); 2085 dst += stride; 2086 __lsx_vstelm_w(vec3, dst, 0, 2); 2087 __lsx_vstelm_h(vec2, dst, 4, 2); 2088 dst += stride; 2089 __lsx_vstelm_w(vec3, dst, 0, 3); 2090 __lsx_vstelm_h(vec2, dst, 4, 3); 2091 dst += stride; 2092 __lsx_vstelm_w(vec4, dst, 0, 0); 2093 __lsx_vstelm_h(vec2, dst, 4, 4); 2094 dst += stride; 2095 __lsx_vstelm_w(vec4, dst, 0, 1); 2096 __lsx_vstelm_h(vec2, dst, 4, 5); 2097 dst += stride; 2098 __lsx_vstelm_w(vec4, dst, 0, 2); 2099 __lsx_vstelm_h(vec2, dst, 4, 6); 2100 dst += stride; 2101 __lsx_vstelm_w(vec4, dst, 0, 3); 2102 __lsx_vstelm_h(vec2, dst, 4, 7); 2103 dst += stride; 2104 __lsx_vstelm_w(vec6, dst, 0, 0); 2105 __lsx_vstelm_h(vec5, dst, 4, 0); 2106 dst += stride; 2107 __lsx_vstelm_w(vec6, dst, 0, 1); 2108 __lsx_vstelm_h(vec5, dst, 4, 1); 2109 dst += stride; 2110 __lsx_vstelm_w(vec6, dst, 0, 2); 2111 __lsx_vstelm_h(vec5, dst, 4, 2); 2112 dst += stride; 2113 __lsx_vstelm_w(vec6, dst, 0, 3); 2114 __lsx_vstelm_h(vec5, dst, 4, 3); 2115 dst += stride; 2116 __lsx_vstelm_w(vec7, dst, 0, 0); 2117 __lsx_vstelm_h(vec5, dst, 4, 4); 2118 dst += stride; 2119 __lsx_vstelm_w(vec7, dst, 0, 1); 2120 __lsx_vstelm_h(vec5, dst, 4, 5); 2121 dst += stride; 2122 __lsx_vstelm_w(vec7, dst, 0, 2); 2123 __lsx_vstelm_h(vec5, dst, 4, 6); 2124 dst += stride; 2125 __lsx_vstelm_w(vec7, dst, 0, 3); 2126 __lsx_vstelm_h(vec5, dst, 4, 7); 2127 } 2128} 2129 2130static void vp9_transpose_16x8_to_8x16(uint8_t 
*input, ptrdiff_t in_pitch, 2131 uint8_t *output) 2132{ 2133 __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; 2134 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2135 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2136 ptrdiff_t in_pitch2 = in_pitch << 1; 2137 ptrdiff_t in_pitch3 = in_pitch2 + in_pitch; 2138 ptrdiff_t in_pitch4 = in_pitch2 << 1; 2139 2140 LSX_LD_8(input, in_pitch, in_pitch2, in_pitch3, in_pitch4, 2141 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); 2142 /* 8x8 transpose */ 2143 LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, 2144 p0_org, p7, p6, p5, p4, p3, p2, p1, p0); 2145 /* 8x8 transpose */ 2146 DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org, 2147 p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3); 2148 DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); 2149 DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); 2150 DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4); 2151 DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6); 2152 DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7); 2153 2154 __lsx_vst(p7, output, 0); 2155 __lsx_vst(p6, output, 16); 2156 __lsx_vst(p5, output, 32); 2157 __lsx_vst(p4, output, 48); 2158 __lsx_vst(p3, output, 64); 2159 __lsx_vst(p2, output, 80); 2160 __lsx_vst(p1, output, 96); 2161 __lsx_vst(p0, output, 112); 2162 __lsx_vst(q0, output, 128); 2163 __lsx_vst(q1, output, 144); 2164 __lsx_vst(q2, output, 160); 2165 __lsx_vst(q3, output, 176); 2166 __lsx_vst(q4, output, 192); 2167 __lsx_vst(q5, output, 208); 2168 __lsx_vst(q6, output, 224); 2169 __lsx_vst(q7, output, 240); 2170} 2171 2172static void vp9_transpose_8x16_to_16x8(uint8_t *input, uint8_t *output, 2173 ptrdiff_t out_pitch) 2174{ 2175 __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; 2176 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2177 ptrdiff_t out_pitch2 = out_pitch << 1; 2178 ptrdiff_t out_pitch3 = out_pitch2 + out_pitch; 2179 ptrdiff_t out_pitch4 = out_pitch2 << 1; 2180 2181 DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, 2182 p7, p6, p5, p4); 2183 DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, 2184 p3, p2, p1, p0); 2185 DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, 2186 q0, q1, q2, q3); 2187 DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, 2188 q4, q5, q6, q7); 2189 LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, 2190 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); 2191 LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, 2192 output, out_pitch, out_pitch2, out_pitch3, out_pitch4); 2193} 2194 2195static void vp9_transpose_16x16(uint8_t *input, int32_t in_stride, 2196 uint8_t *output, int32_t out_stride) 2197{ 2198 __m128i row0, row1, row2, row3, row4, row5, row6, row7; 2199 __m128i row8, row9, row10, row11, row12, row13, row14, row15; 2200 __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; 2201 __m128i tmp2, tmp3; 2202 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2203 int32_t in_stride2 = in_stride << 1; 2204 int32_t in_stride3 = in_stride2 + in_stride; 2205 int32_t in_stride4 = in_stride2 << 1; 2206 int32_t out_stride2 = out_stride << 1; 2207 int32_t out_stride3 = out_stride2 + out_stride; 2208 int32_t out_stride4 = out_stride2 << 1; 2209 2210 LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, 2211 row0, row1, row2, row3, row4, row5, row6, 
row7); 2212 input += in_stride4; 2213 LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, 2214 row8, row9, row10, row11, row12, row13, row14, row15); 2215 2216 LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, 2217 row8, row9, row10, row11, row12, row13, row14, row15, 2218 p7, p6, p5, p4, p3, p2, p1, p0); 2219 2220 /* transpose 16x8 matrix into 8x16 */ 2221 /* total 8 intermediate register and 32 instructions */ 2222 q7 = __lsx_vpackod_d(row8, row0); 2223 q6 = __lsx_vpackod_d(row9, row1); 2224 q5 = __lsx_vpackod_d(row10, row2); 2225 q4 = __lsx_vpackod_d(row11, row3); 2226 q3 = __lsx_vpackod_d(row12, row4); 2227 q2 = __lsx_vpackod_d(row13, row5); 2228 q1 = __lsx_vpackod_d(row14, row6); 2229 q0 = __lsx_vpackod_d(row15, row7); 2230 2231 DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); 2232 DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); 2233 2234 DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); 2235 DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); 2236 2237 DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); 2238 q0 = __lsx_vpackev_w(tmp3, tmp2); 2239 q4 = __lsx_vpackod_w(tmp3, tmp2); 2240 2241 tmp2 = __lsx_vpackod_h(tmp1, tmp0); 2242 tmp3 = __lsx_vpackod_h(q7, q5); 2243 q2 = __lsx_vpackev_w(tmp3, tmp2); 2244 q6 = __lsx_vpackod_w(tmp3, tmp2); 2245 2246 DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); 2247 q1 = __lsx_vpackev_w(tmp3, tmp2); 2248 q5 = __lsx_vpackod_w(tmp3, tmp2); 2249 2250 tmp2 = __lsx_vpackod_h(tmp5, tmp4); 2251 tmp3 = __lsx_vpackod_h(tmp7, tmp6); 2252 q3 = __lsx_vpackev_w(tmp3, tmp2); 2253 q7 = __lsx_vpackod_w(tmp3, tmp2); 2254 2255 LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, 2256 out_stride2, out_stride3, out_stride4); 2257 output += out_stride4; 2258 LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, 2259 out_stride2, out_stride3, out_stride4); 2260} 2261 2262static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, 2263 uint8_t *src_org, int32_t pitch_org, 2264 int32_t b_limit_ptr, 2265 int32_t limit_ptr, 2266 int32_t thresh_ptr) 2267{ 2268 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 2269 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 2270 __m128i flat, mask, hev, thresh, b_limit, limit; 2271 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 2272 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 2273 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 2274 __m128i vec0, vec1, vec2, vec3; 2275 __m128i zero = __lsx_vldi(0); 2276 2277 /* load vector elements */ 2278 DUP4_ARG2(__lsx_vld, src, -64, src, -48, src, -32, src, -16, 2279 p3, p2, p1, p0); 2280 DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3); 2281 2282 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 2283 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 2284 limit = __lsx_vreplgr2vr_b(limit_ptr); 2285 2286 /* mask and hev */ 2287 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 2288 hev, mask, flat); 2289 /* flat4 */ 2290 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 2291 /* filter4 */ 2292 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 2293 q1_out); 2294 2295 flat = __lsx_vilvl_d(zero, flat); 2296 2297 /* if flat is zero for all pixels, then no need to calculate other filter */ 2298 if (__lsx_bz_v(flat)) { 2299 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2300 vec2 = __lsx_vilvl_h(vec1, vec0); 2301 vec3 = __lsx_vilvh_h(vec1, vec0); 2302 2303 src_org -= 2; 2304 __lsx_vstelm_w(vec2, src_org, 0, 0); 2305 src_org += pitch_org; 2306 
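        /* Early exit: only the 4-tap filter applied, so the four updated
         * columns are written straight back to the original (untransposed)
         * picture at src_org and 1 is returned below; the caller then skips
         * both the wide filter and the reverse transpose. */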
__lsx_vstelm_w(vec2, src_org, 0, 1); 2307 src_org += pitch_org; 2308 __lsx_vstelm_w(vec2, src_org, 0, 2); 2309 src_org += pitch_org; 2310 __lsx_vstelm_w(vec2, src_org, 0, 3); 2311 src_org += pitch_org; 2312 __lsx_vstelm_w(vec3, src_org, 0, 0); 2313 src_org += pitch_org; 2314 __lsx_vstelm_w(vec3, src_org, 0, 1); 2315 src_org += pitch_org; 2316 __lsx_vstelm_w(vec3, src_org, 0, 2); 2317 src_org += pitch_org; 2318 __lsx_vstelm_w(vec3, src_org, 0, 3); 2319 return 1; 2320 } else { 2321 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 2322 p3_l, p2_l, p1_l, p0_l); 2323 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 2324 q0_l, q1_l, q2_l, q3_l); 2325 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 2326 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 2327 2328 /* convert 16 bit output data into 8 bit */ 2329 p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l); 2330 p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l); 2331 p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l); 2332 q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l); 2333 q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l); 2334 q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l); 2335 2336 /* store pixel values */ 2337 p2_out = __lsx_vbitsel_v(p2, p2_l, flat); 2338 p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat); 2339 p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat); 2340 q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat); 2341 q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat); 2342 q2_out = __lsx_vbitsel_v(q2, q2_l, flat); 2343 2344 __lsx_vst(p2_out, filter48, 0); 2345 __lsx_vst(p1_out, filter48, 16); 2346 __lsx_vst(p0_out, filter48, 32); 2347 __lsx_vst(q0_out, filter48, 48); 2348 __lsx_vst(q1_out, filter48, 64); 2349 __lsx_vst(q2_out, filter48, 80); 2350 __lsx_vst(flat, filter48, 96); 2351 2352 return 0; 2353 } 2354} 2355 2356static int32_t vp9_vt_lpf_t16_8w(uint8_t *dst, uint8_t *dst_org, 2357 ptrdiff_t stride, 2358 uint8_t *filter48) 2359{ 2360 __m128i zero = __lsx_vldi(0); 2361 __m128i filter8, flat, flat2; 2362 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2363 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 2364 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 2365 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 2366 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 2367 v8u16 tmp0_l, tmp1_l; 2368 __m128i out_l; 2369 uint8_t *dst_tmp = dst - 128; 2370 2371 /* load vector elements */ 2372 DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, 2373 dst_tmp, 48, p7, p6, p5, p4); 2374 DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, 2375 dst_tmp, 112, p3, p2, p1, p0); 2376 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); 2377 DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); 2378 2379 flat = __lsx_vld(filter48, 96); 2380 2381 2382 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 2383 2384 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 2385 if (__lsx_bz_v(flat2)) { 2386 __m128i vec0, vec1, vec2, vec3, vec4; 2387 2388 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, 2389 filter48, 48, p2, p1, p0, q0); 2390 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); 2391 2392 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 2393 vec3 = __lsx_vilvl_h(vec1, vec0); 2394 vec4 = __lsx_vilvh_h(vec1, vec0); 2395 vec2 = __lsx_vilvl_b(q2, q1); 2396 2397 dst_org -= 3; 2398 __lsx_vstelm_w(vec3, dst_org, 0, 0); 2399 __lsx_vstelm_h(vec2, dst_org, 4, 0); 2400 dst_org += stride; 2401 
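        /* flat2 is zero for every row: the 8-tap results saved in filter48
         * (p2..q2) are already final, so they are scattered back to the
         * original picture as 4 + 2 bytes per row for the eight rows and
         * 1 is returned below to skip the reverse transpose. */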
__lsx_vstelm_w(vec3, dst_org, 0, 1); 2402 __lsx_vstelm_h(vec2, dst_org, 4, 1); 2403 dst_org += stride; 2404 __lsx_vstelm_w(vec3, dst_org, 0, 2); 2405 __lsx_vstelm_h(vec2, dst_org, 4, 2); 2406 dst_org += stride; 2407 __lsx_vstelm_w(vec3, dst_org, 0, 3); 2408 __lsx_vstelm_h(vec2, dst_org, 4, 3); 2409 dst_org += stride; 2410 __lsx_vstelm_w(vec4, dst_org, 0, 0); 2411 __lsx_vstelm_h(vec2, dst_org, 4, 4); 2412 dst_org += stride; 2413 __lsx_vstelm_w(vec4, dst_org, 0, 1); 2414 __lsx_vstelm_h(vec2, dst_org, 4, 5); 2415 dst_org += stride; 2416 __lsx_vstelm_w(vec4, dst_org, 0, 2); 2417 __lsx_vstelm_h(vec2, dst_org, 4, 6); 2418 dst_org += stride; 2419 __lsx_vstelm_w(vec4, dst_org, 0, 3); 2420 __lsx_vstelm_h(vec2, dst_org, 4, 7); 2421 return 1; 2422 } else { 2423 dst -= 7 * 16; 2424 2425 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); 2426 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); 2427 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); 2428 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); 2429 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); 2430 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); 2431 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); 2432 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); 2433 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); 2434 2435 tmp0_l = p7_l_in << 3; 2436 tmp0_l -= p7_l_in; 2437 tmp0_l += p6_l_in; 2438 tmp0_l += q0_l_in; 2439 tmp1_l = p6_l_in + p5_l_in; 2440 tmp1_l += p4_l_in; 2441 tmp1_l += p3_l_in; 2442 tmp1_l += p2_l_in; 2443 tmp1_l += p1_l_in; 2444 tmp1_l += p0_l_in; 2445 tmp1_l += tmp0_l; 2446 2447 out_l =__lsx_vsrari_h((__m128i)tmp1_l, 4); 2448 out_l =__lsx_vpickev_b(out_l, out_l); 2449 p6 = __lsx_vbitsel_v(p6, out_l, flat2); 2450 __lsx_vstelm_d(p6, dst, 0, 0); 2451 dst += 16; 2452 2453 /* p5 */ 2454 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); 2455 tmp0_l = p5_l_in - p6_l_in; 2456 tmp0_l += q1_l_in; 2457 tmp0_l -= p7_l_in; 2458 tmp1_l += tmp0_l; 2459 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2460 out_l = __lsx_vpickev_b(out_l, out_l); 2461 p5 = __lsx_vbitsel_v(p5, out_l, flat2); 2462 __lsx_vstelm_d(p5, dst, 0, 0); 2463 dst += 16; 2464 2465 /* p4 */ 2466 q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2); 2467 tmp0_l = p4_l_in - p5_l_in; 2468 tmp0_l += q2_l_in; 2469 tmp0_l -= p7_l_in; 2470 tmp1_l += tmp0_l; 2471 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2472 out_l = __lsx_vpickev_b(out_l, out_l); 2473 p4 = __lsx_vbitsel_v(p4, out_l, flat2); 2474 __lsx_vstelm_d(p4, dst, 0, 0); 2475 dst += 16; 2476 2477 /* p3 */ 2478 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); 2479 tmp0_l = p3_l_in - p4_l_in; 2480 tmp0_l += q3_l_in; 2481 tmp0_l -= p7_l_in; 2482 tmp1_l += tmp0_l; 2483 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2484 out_l = __lsx_vpickev_b(out_l, out_l); 2485 p3 = __lsx_vbitsel_v(p3, out_l, flat2); 2486 __lsx_vstelm_d(p3, dst, 0, 0); 2487 dst += 16; 2488 2489 /* p2 */ 2490 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); 2491 filter8 = __lsx_vld(filter48, 0); 2492 tmp0_l = p2_l_in - p3_l_in; 2493 tmp0_l += q4_l_in; 2494 tmp0_l -= p7_l_in; 2495 tmp1_l += tmp0_l; 2496 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2497 out_l = __lsx_vpickev_b(out_l, out_l); 2498 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2499 __lsx_vstelm_d(filter8, dst, 0, 0); 2500 dst += 16; 2501 2502 /* p1 */ 2503 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); 2504 filter8 = __lsx_vld(filter48, 16); 2505 tmp0_l = p1_l_in - p2_l_in; 2506 tmp0_l += q5_l_in; 2507 tmp0_l -= p7_l_in; 2508 tmp1_l += tmp0_l; 2509 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2510 out_l = __lsx_vpickev_b(out_l, out_l); 2511 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2512 
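        /* All wide-filter outputs reuse one running 16-tap window sum kept
         * in tmp1_l: each step adds the sample entering the window and
         * subtracts the one leaving (p7 is replicated past the left edge,
         * hence the repeated "- p7_l_in"), then rounds with
         * __lsx_vsrari_h(sum, 4), i.e. (sum + 8) >> 4. */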
__lsx_vstelm_d(filter8, dst, 0, 0); 2513 dst += 16; 2514 2515 /* p0 */ 2516 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); 2517 filter8 = __lsx_vld(filter48, 32); 2518 tmp0_l = p0_l_in - p1_l_in; 2519 tmp0_l += q6_l_in; 2520 tmp0_l -= p7_l_in; 2521 tmp1_l += tmp0_l; 2522 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2523 out_l = __lsx_vpickev_b(out_l, out_l); 2524 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2525 __lsx_vstelm_d(filter8, dst, 0, 0); 2526 dst += 16; 2527 2528 /* q0 */ 2529 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); 2530 filter8 = __lsx_vld(filter48, 48); 2531 tmp0_l = q7_l_in - p0_l_in; 2532 tmp0_l += q0_l_in; 2533 tmp0_l -= p7_l_in; 2534 tmp1_l += tmp0_l; 2535 out_l = __lsx_vsrari_h((v8i16) tmp1_l, 4); 2536 out_l = __lsx_vpickev_b(out_l, out_l); 2537 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2538 __lsx_vstelm_d(filter8, dst, 0, 0); 2539 dst += 16; 2540 2541 /* q1 */ 2542 filter8 = __lsx_vld(filter48, 64); 2543 tmp0_l = q7_l_in - q0_l_in; 2544 tmp0_l += q1_l_in; 2545 tmp0_l -= p6_l_in; 2546 tmp1_l += tmp0_l; 2547 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2548 out_l = __lsx_vpickev_b(out_l, out_l); 2549 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2550 __lsx_vstelm_d(filter8, dst, 0, 0); 2551 dst += 16; 2552 2553 /* q2 */ 2554 filter8 = __lsx_vld(filter48, 80); 2555 tmp0_l = q7_l_in - q1_l_in; 2556 tmp0_l += q2_l_in; 2557 tmp0_l -= p5_l_in; 2558 tmp1_l += tmp0_l; 2559 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2560 out_l = __lsx_vpickev_b(out_l, out_l); 2561 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2562 __lsx_vstelm_d(filter8, dst, 0, 0); 2563 dst += 16; 2564 2565 /* q3 */ 2566 tmp0_l = q7_l_in - q2_l_in; 2567 tmp0_l += q3_l_in; 2568 tmp0_l -= p4_l_in; 2569 tmp1_l += tmp0_l; 2570 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2571 out_l = __lsx_vpickev_b(out_l, out_l); 2572 q3 = __lsx_vbitsel_v(q3, out_l, flat2); 2573 __lsx_vstelm_d(q3, dst, 0, 0); 2574 dst += 16; 2575 2576 /* q4 */ 2577 tmp0_l = q7_l_in - q3_l_in; 2578 tmp0_l += q4_l_in; 2579 tmp0_l -= p3_l_in; 2580 tmp1_l += tmp0_l; 2581 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2582 out_l = __lsx_vpickev_b(out_l, out_l); 2583 q4 = __lsx_vbitsel_v(q4, out_l, flat2); 2584 __lsx_vstelm_d(q4, dst, 0, 0); 2585 dst += 16; 2586 2587 /* q5 */ 2588 tmp0_l = q7_l_in - q4_l_in; 2589 tmp0_l += q5_l_in; 2590 tmp0_l -= p2_l_in; 2591 tmp1_l += tmp0_l; 2592 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2593 out_l = __lsx_vpickev_b(out_l, out_l); 2594 q5 = __lsx_vbitsel_v(q5, out_l, flat2); 2595 __lsx_vstelm_d(q5, dst, 0, 0); 2596 dst += 16; 2597 2598 /* q6 */ 2599 tmp0_l = q7_l_in - q5_l_in; 2600 tmp0_l += q6_l_in; 2601 tmp0_l -= p1_l_in; 2602 tmp1_l += tmp0_l; 2603 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2604 out_l = __lsx_vpickev_b(out_l, out_l); 2605 q6 = __lsx_vbitsel_v(q6, out_l, flat2); 2606 __lsx_vstelm_d(q6, dst, 0, 0); 2607 2608 return 0; 2609 } 2610} 2611 2612void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride, 2613 int32_t b_limit_ptr, 2614 int32_t limit_ptr, 2615 int32_t thresh_ptr) 2616{ 2617 uint8_t early_exit = 0; 2618 uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16))); 2619 uint8_t *filter48 = &transposed_input[16 * 16]; 2620 2621 vp9_transpose_16x8_to_8x16(dst - 8, stride, transposed_input); 2622 2623 early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), 2624 &filter48[0], dst, stride, 2625 b_limit_ptr, limit_ptr, thresh_ptr); 2626 2627 if (0 == early_exit) { 2628 early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), dst, stride, 2629 &filter48[0]); 2630 2631 
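    /* Working-buffer layout: transposed_input holds the 8-row strip around
     * the edge column-transposed, each of the 16 columns dst-8 .. dst+7
     * becoming one 16-byte row (p7 .. q7); filter48 = transposed_input +
     * 16 * 16 stores the 8-tap stage results p2, p1, p0, q0, q1, q2 and the
     * flat mask at 16-byte offsets.  Each stage returns 1 when it has
     * already written its final pixels directly to dst, so the reverse
     * transpose below runs only when the wide filter modified the buffer. */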
if (0 == early_exit) { 2632 vp9_transpose_8x16_to_16x8(transposed_input, dst - 8, stride); 2633 } 2634 } 2635} 2636 2637static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, 2638 uint8_t *dst_org, ptrdiff_t stride, 2639 int32_t b_limit_ptr, 2640 int32_t limit_ptr, 2641 int32_t thresh_ptr) 2642{ 2643 ptrdiff_t stride2 = stride << 1; 2644 ptrdiff_t stride3 = stride2 + stride; 2645 ptrdiff_t stride4 = stride2 << 1; 2646 __m128i p3, p2, p1, p0, q3, q2, q1, q0; 2647 __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 2648 __m128i flat, mask, hev, thresh, b_limit, limit; 2649 __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 2650 __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; 2651 __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; 2652 __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; 2653 __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; 2654 __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; 2655 __m128i vec0, vec1, vec2, vec3, vec4, vec5; 2656 __m128i zero = __lsx_vldi(0); 2657 2658 /* load vector elements */ 2659 DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, 2660 p3, p2, p1, p0); 2661 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); 2662 2663 thresh = __lsx_vreplgr2vr_b(thresh_ptr); 2664 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr); 2665 limit = __lsx_vreplgr2vr_b(limit_ptr); 2666 2667 /* mask and hev */ 2668 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 2669 hev, mask, flat); 2670 /* flat4 */ 2671 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 2672 /* filter4 */ 2673 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 2674 q1_out); 2675 2676 /* if flat is zero for all pixels, then no need to calculate other filter */ 2677 if (__lsx_bz_v(flat)) { 2678 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2679 vec2 = __lsx_vilvl_h(vec1, vec0); 2680 vec3 = __lsx_vilvh_h(vec1, vec0); 2681 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2682 vec4 = __lsx_vilvl_h(vec1, vec0); 2683 vec5 = __lsx_vilvh_h(vec1, vec0); 2684 2685 dst_org -= 2; 2686 __lsx_vstelm_w(vec2, dst_org, 0, 0); 2687 __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); 2688 __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); 2689 __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); 2690 dst_org += stride4; 2691 __lsx_vstelm_w(vec3, dst_org, 0, 0); 2692 __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); 2693 __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); 2694 __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); 2695 dst_org += stride4; 2696 __lsx_vstelm_w(vec4, dst_org, 0, 0); 2697 __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); 2698 __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); 2699 __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); 2700 dst_org += stride4; 2701 __lsx_vstelm_w(vec5, dst_org, 0, 0); 2702 __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); 2703 __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); 2704 __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); 2705 2706 return 1; 2707 } else { 2708 DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, 2709 p3_l, p2_l, p1_l, p0_l); 2710 DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, 2711 q0_l, q1_l, q2_l, q3_l); 2712 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 2713 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 2714 DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0, 2715 p3_h, p2_h, p1_h, p0_h); 2716 DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3, 2717 q0_h, q1_h, q2_h, q3_h); 2718 VP9_FILTER8(p3_h, p2_h, 
p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, 2719 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); 2720 2721 /* convert 16 bit output data into 8 bit */ 2722 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, 2723 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, 2724 q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l, 2725 q0_filt8_l); 2726 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, 2727 q2_filt8_l, q1_filt8_l, q2_filt8_l); 2728 2729 /* store pixel values */ 2730 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); 2731 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); 2732 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); 2733 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); 2734 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); 2735 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); 2736 2737 __lsx_vst(p2_out, filter48, 0); 2738 __lsx_vst(p1_out, filter48, 16); 2739 __lsx_vst(p0_out, filter48, 32); 2740 __lsx_vst(q0_out, filter48, 48); 2741 __lsx_vst(q1_out, filter48, 64); 2742 __lsx_vst(q2_out, filter48, 80); 2743 __lsx_vst(flat, filter48, 96); 2744 2745 return 0; 2746 } 2747} 2748 2749static int32_t vp9_vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, 2750 ptrdiff_t stride, 2751 uint8_t *filter48) 2752{ 2753 __m128i zero = __lsx_vldi(0); 2754 __m128i flat, flat2, filter8; 2755 __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2756 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 2757 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 2758 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 2759 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 2760 v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; 2761 v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; 2762 v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; 2763 v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; 2764 v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; 2765 __m128i out_l, out_h; 2766 uint8_t *dst_tmp = dst - 128; 2767 2768 flat = __lsx_vld(filter48, 96); 2769 2770 DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, 2771 dst_tmp, 48, p7, p6, p5, p4); 2772 DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, 2773 dst_tmp, 112, p3, p2, p1, p0); 2774 DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); 2775 DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); 2776 2777 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 2778 2779 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 2780 if (__lsx_bz_v(flat2)) { 2781 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2782 2783 DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, 2784 filter48, 48, p2, p1, p0, q0); 2785 DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); 2786 2787 DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); 2788 vec3 = __lsx_vilvl_h(vec1, vec0); 2789 vec4 = __lsx_vilvh_h(vec1, vec0); 2790 DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); 2791 vec6 = __lsx_vilvl_h(vec1, vec0); 2792 vec7 = __lsx_vilvh_h(vec1, vec0); 2793 vec2 = __lsx_vilvl_b(q2, q1); 2794 vec5 = __lsx_vilvh_b(q2, q1); 2795 2796 dst_org -= 3; 2797 __lsx_vstelm_w(vec3, dst_org, 0, 0); 2798 __lsx_vstelm_h(vec2, dst_org, 4, 0); 2799 dst_org += stride; 2800 __lsx_vstelm_w(vec3, dst_org, 0, 1); 2801 __lsx_vstelm_h(vec2, dst_org, 4, 1); 2802 dst_org += stride; 2803 __lsx_vstelm_w(vec3, dst_org, 0, 2); 2804 __lsx_vstelm_h(vec2, dst_org, 4, 2); 2805 dst_org += stride; 2806 __lsx_vstelm_w(vec3, dst_org, 0, 3); 2807 __lsx_vstelm_h(vec2, dst_org, 4, 3); 2808 dst_org += stride; 2809 
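        /* 16-row counterpart of the early exit in vp9_vt_lpf_t16_8w(): the
         * saved 8-tap results are written back as 4 + 2 bytes per row for
         * all 16 rows before returning 1. */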
__lsx_vstelm_w(vec4, dst_org, 0, 0); 2810 __lsx_vstelm_h(vec2, dst_org, 4, 4); 2811 dst_org += stride; 2812 __lsx_vstelm_w(vec4, dst_org, 0, 1); 2813 __lsx_vstelm_h(vec2, dst_org, 4, 5); 2814 dst_org += stride; 2815 __lsx_vstelm_w(vec4, dst_org, 0, 2); 2816 __lsx_vstelm_h(vec2, dst_org, 4, 6); 2817 dst_org += stride; 2818 __lsx_vstelm_w(vec4, dst_org, 0, 3); 2819 __lsx_vstelm_h(vec2, dst_org, 4, 7); 2820 dst_org += stride; 2821 __lsx_vstelm_w(vec6, dst_org, 0, 0); 2822 __lsx_vstelm_h(vec5, dst_org, 4, 0); 2823 dst_org += stride; 2824 __lsx_vstelm_w(vec6, dst_org, 0, 1); 2825 __lsx_vstelm_h(vec5, dst_org, 4, 1); 2826 dst_org += stride; 2827 __lsx_vstelm_w(vec6, dst_org, 0, 2); 2828 __lsx_vstelm_h(vec5, dst_org, 4, 2); 2829 dst_org += stride; 2830 __lsx_vstelm_w(vec6, dst_org, 0, 3); 2831 __lsx_vstelm_h(vec5, dst_org, 4, 3); 2832 dst_org += stride; 2833 __lsx_vstelm_w(vec7, dst_org, 0, 0); 2834 __lsx_vstelm_h(vec5, dst_org, 4, 4); 2835 dst_org += stride; 2836 __lsx_vstelm_w(vec7, dst_org, 0, 1); 2837 __lsx_vstelm_h(vec5, dst_org, 4, 5); 2838 dst_org += stride; 2839 __lsx_vstelm_w(vec7, dst_org, 0, 2); 2840 __lsx_vstelm_h(vec5, dst_org, 4, 6); 2841 dst_org += stride; 2842 __lsx_vstelm_w(vec7, dst_org, 0, 3); 2843 __lsx_vstelm_h(vec5, dst_org, 4, 7); 2844 2845 return 1; 2846 } else { 2847 dst -= 7 * 16; 2848 2849 p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7); 2850 p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6); 2851 p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5); 2852 p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4); 2853 p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3); 2854 p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2); 2855 p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1); 2856 p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0); 2857 q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0); 2858 2859 tmp0_l = p7_l_in << 3; 2860 tmp0_l -= p7_l_in; 2861 tmp0_l += p6_l_in; 2862 tmp0_l += q0_l_in; 2863 tmp1_l = p6_l_in + p5_l_in; 2864 tmp1_l += p4_l_in; 2865 tmp1_l += p3_l_in; 2866 tmp1_l += p2_l_in; 2867 tmp1_l += p1_l_in; 2868 tmp1_l += p0_l_in; 2869 tmp1_l += tmp0_l; 2870 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2871 2872 p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7); 2873 p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6); 2874 p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5); 2875 p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4); 2876 p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3); 2877 p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2); 2878 p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1); 2879 p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0); 2880 q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0); 2881 2882 tmp0_h = p7_h_in << 3; 2883 tmp0_h -= p7_h_in; 2884 tmp0_h += p6_h_in; 2885 tmp0_h += q0_h_in; 2886 tmp1_h = p6_h_in + p5_h_in; 2887 tmp1_h += p4_h_in; 2888 tmp1_h += p3_h_in; 2889 tmp1_h += p2_h_in; 2890 tmp1_h += p1_h_in; 2891 tmp1_h += p0_h_in; 2892 tmp1_h += tmp0_h; 2893 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 2894 2895 out_l = __lsx_vpickev_b(out_h, out_l); 2896 p6 = __lsx_vbitsel_v(p6, out_l, flat2); 2897 __lsx_vst(p6, dst, 0); 2898 2899 /* p5 */ 2900 q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1); 2901 tmp0_l = p5_l_in - p6_l_in; 2902 tmp0_l += q1_l_in; 2903 tmp0_l -= p7_l_in; 2904 tmp1_l += tmp0_l; 2905 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2906 q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1); 2907 tmp0_h = p5_h_in - p6_h_in; 2908 tmp0_h += q1_h_in; 2909 tmp0_h -= p7_h_in; 2910 tmp1_h += tmp0_h; 2911 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 2912 out_l = __lsx_vpickev_b(out_h, out_l); 2913 p5 = __lsx_vbitsel_v(p5, out_l, flat2); 2914 __lsx_vst(p5, dst, 16); 2915 2916 /* p4 */ 2917 q2_l_in = 
(v8u16)__lsx_vilvl_b(zero, q2); 2918 tmp0_l = p4_l_in - p5_l_in; 2919 tmp0_l += q2_l_in; 2920 tmp0_l -= p7_l_in; 2921 tmp1_l += tmp0_l; 2922 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2923 q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2); 2924 tmp0_h = p4_h_in - p5_h_in; 2925 tmp0_h += q2_h_in; 2926 tmp0_h -= p7_h_in; 2927 tmp1_h += tmp0_h; 2928 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 2929 out_l = __lsx_vpickev_b(out_h, out_l); 2930 p4 = __lsx_vbitsel_v(p4, out_l, flat2); 2931 __lsx_vst(p4, dst, 16*2); 2932 2933 /* p3 */ 2934 q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3); 2935 tmp0_l = p3_l_in - p4_l_in; 2936 tmp0_l += q3_l_in; 2937 tmp0_l -= p7_l_in; 2938 tmp1_l += tmp0_l; 2939 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2940 q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3); 2941 tmp0_h = p3_h_in - p4_h_in; 2942 tmp0_h += q3_h_in; 2943 tmp0_h -= p7_h_in; 2944 tmp1_h += tmp0_h; 2945 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 2946 out_l = __lsx_vpickev_b(out_h, out_l); 2947 p3 = __lsx_vbitsel_v(p3, out_l, flat2); 2948 __lsx_vst(p3, dst, 16*3); 2949 2950 /* p2 */ 2951 q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4); 2952 filter8 = __lsx_vld(filter48, 0); 2953 tmp0_l = p2_l_in - p3_l_in; 2954 tmp0_l += q4_l_in; 2955 tmp0_l -= p7_l_in; 2956 tmp1_l += tmp0_l; 2957 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2958 q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4); 2959 tmp0_h = p2_h_in - p3_h_in; 2960 tmp0_h += q4_h_in; 2961 tmp0_h -= p7_h_in; 2962 tmp1_h += tmp0_h; 2963 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 2964 out_l = __lsx_vpickev_b(out_h, out_l); 2965 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2966 __lsx_vst(filter8, dst, 16*4); 2967 2968 /* p1 */ 2969 q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5); 2970 filter8 = __lsx_vld(filter48, 16); 2971 tmp0_l = p1_l_in - p2_l_in; 2972 tmp0_l += q5_l_in; 2973 tmp0_l -= p7_l_in; 2974 tmp1_l += tmp0_l; 2975 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2976 q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5); 2977 tmp0_h = p1_h_in - p2_h_in; 2978 tmp0_h += q5_h_in; 2979 tmp0_h -= p7_h_in; 2980 tmp1_h += tmp0_h; 2981 out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); 2982 out_l = __lsx_vpickev_b(out_h, out_l); 2983 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 2984 __lsx_vst(filter8, dst, 16*5); 2985 2986 /* p0 */ 2987 q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6); 2988 filter8 = __lsx_vld(filter48, 32); 2989 tmp0_l = p0_l_in - p1_l_in; 2990 tmp0_l += q6_l_in; 2991 tmp0_l -= p7_l_in; 2992 tmp1_l += tmp0_l; 2993 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 2994 q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6); 2995 tmp0_h = p0_h_in - p1_h_in; 2996 tmp0_h += q6_h_in; 2997 tmp0_h -= p7_h_in; 2998 tmp1_h += tmp0_h; 2999 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3000 out_l = __lsx_vpickev_b(out_h, out_l); 3001 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 3002 __lsx_vst(filter8, dst, 16*6); 3003 3004 /* q0 */ 3005 q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7); 3006 filter8 = __lsx_vld(filter48, 48); 3007 tmp0_l = q7_l_in - p0_l_in; 3008 tmp0_l += q0_l_in; 3009 tmp0_l -= p7_l_in; 3010 tmp1_l += tmp0_l; 3011 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3012 q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7); 3013 tmp0_h = q7_h_in - p0_h_in; 3014 tmp0_h += q0_h_in; 3015 tmp0_h -= p7_h_in; 3016 tmp1_h += tmp0_h; 3017 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3018 out_l = __lsx_vpickev_b(out_h, out_l); 3019 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 3020 __lsx_vst(filter8, dst, 16*7); 3021 3022 /* q1 */ 3023 filter8 = __lsx_vld(filter48, 64); 3024 tmp0_l = q7_l_in - q0_l_in; 3025 tmp0_l += q1_l_in; 3026 tmp0_l -= 
p6_l_in; 3027 tmp1_l += tmp0_l; 3028 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3029 tmp0_h = q7_h_in - q0_h_in; 3030 tmp0_h += q1_h_in; 3031 tmp0_h -= p6_h_in; 3032 tmp1_h += tmp0_h; 3033 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3034 out_l = __lsx_vpickev_b(out_h, out_l); 3035 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 3036 __lsx_vst(filter8, dst, 16*8); 3037 3038 /* q2 */ 3039 filter8 = __lsx_vld(filter48, 80); 3040 tmp0_l = q7_l_in - q1_l_in; 3041 tmp0_l += q2_l_in; 3042 tmp0_l -= p5_l_in; 3043 tmp1_l += tmp0_l; 3044 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3045 tmp0_h = q7_h_in - q1_h_in; 3046 tmp0_h += q2_h_in; 3047 tmp0_h -= p5_h_in; 3048 tmp1_h += tmp0_h; 3049 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3050 out_l = __lsx_vpickev_b(out_h, out_l); 3051 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); 3052 __lsx_vst(filter8, dst, 16*9); 3053 3054 /* q3 */ 3055 tmp0_l = q7_l_in - q2_l_in; 3056 tmp0_l += q3_l_in; 3057 tmp0_l -= p4_l_in; 3058 tmp1_l += tmp0_l; 3059 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3060 tmp0_h = q7_h_in - q2_h_in; 3061 tmp0_h += q3_h_in; 3062 tmp0_h -= p4_h_in; 3063 tmp1_h += tmp0_h; 3064 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3065 out_l = __lsx_vpickev_b(out_h, out_l); 3066 q3 = __lsx_vbitsel_v(q3, out_l, flat2); 3067 __lsx_vst(q3, dst, 16*10); 3068 3069 /* q4 */ 3070 tmp0_l = q7_l_in - q3_l_in; 3071 tmp0_l += q4_l_in; 3072 tmp0_l -= p3_l_in; 3073 tmp1_l += tmp0_l; 3074 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3075 tmp0_h = q7_h_in - q3_h_in; 3076 tmp0_h += q4_h_in; 3077 tmp0_h -= p3_h_in; 3078 tmp1_h += tmp0_h; 3079 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3080 out_l = __lsx_vpickev_b(out_h, out_l); 3081 q4 = __lsx_vbitsel_v(q4, out_l, flat2); 3082 __lsx_vst(q4, dst, 16*11); 3083 3084 /* q5 */ 3085 tmp0_l = q7_l_in - q4_l_in; 3086 tmp0_l += q5_l_in; 3087 tmp0_l -= p2_l_in; 3088 tmp1_l += tmp0_l; 3089 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3090 tmp0_h = q7_h_in - q4_h_in; 3091 tmp0_h += q5_h_in; 3092 tmp0_h -= p2_h_in; 3093 tmp1_h += tmp0_h; 3094 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3095 out_l = __lsx_vpickev_b(out_h, out_l); 3096 q5 = __lsx_vbitsel_v(q5, out_l, flat2); 3097 __lsx_vst(q5, dst, 16*12); 3098 3099 /* q6 */ 3100 tmp0_l = q7_l_in - q5_l_in; 3101 tmp0_l += q6_l_in; 3102 tmp0_l -= p1_l_in; 3103 tmp1_l += tmp0_l; 3104 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); 3105 tmp0_h = q7_h_in - q5_h_in; 3106 tmp0_h += q6_h_in; 3107 tmp0_h -= p1_h_in; 3108 tmp1_h += tmp0_h; 3109 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); 3110 out_l = __lsx_vpickev_b(out_h, out_l); 3111 q6 = __lsx_vbitsel_v(q6, out_l, flat2); 3112 __lsx_vst(q6, dst, 16*13); 3113 3114 return 0; 3115 } 3116} 3117 3118void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride, 3119 int32_t b_limit_ptr, 3120 int32_t limit_ptr, 3121 int32_t thresh_ptr) 3122{ 3123 uint8_t early_exit = 0; 3124 uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16))); 3125 uint8_t *filter48 = &transposed_input[16 * 16]; 3126 3127 vp9_transpose_16x16((dst - 8), stride, &transposed_input[0], 16); 3128 3129 early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), 3130 &filter48[0], dst, stride, 3131 b_limit_ptr, limit_ptr, thresh_ptr); 3132 3133 if (0 == early_exit) { 3134 early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), dst, 3135 stride, &filter48[0]); 3136 3137 if (0 == early_exit) { 3138 vp9_transpose_16x16(transposed_input, 16, (dst - 8), stride); 3139 } 3140 } 3141} 3142
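/* For reference: a minimal scalar sketch (illustrative only, not part of the
 * build) of the running-sum scheme used for the wide 16-tap filter in
 * vp9_vt_lpf_t16_8w()/vp9_vt_lpf_t16_16w() above.  It assumes arrays p[8]
 * and q[8] indexed away from the edge (p[0] = p0, q[0] = q0) and shows only
 * the p side; the q side mirrors it with q[7] replicated past the right
 * edge, and each result is applied only where mask, flat and flat2 are all
 * set (the __lsx_vbitsel_v() selects above).
 *
 *     int sum = 7 * p[7] + 2 * p[6] + p[5] + p[4] + p[3] + p[2] + p[1]
 *               + p[0] + q[0];
 *     p_out[6] = (sum + 8) >> 4;               // p6'
 *     for (int i = 5; i >= 0; i--) {           // p5' .. p0'
 *         sum += p[i] + q[6 - i] - p[i + 1] - p[7];
 *         p_out[i] = (sum + 8) >> 4;
 *     }
 */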