/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "vp9dsp_loongarch.h"

/*
 * Byte-shuffle index tables consumed by __lsx_vshuf_b to gather the sliding
 * 8-tap windows for the horizontal filters.  The first 16 entries serve the
 * 8-pixel-wide paths (window slides inside one register); the remaining two
 * rows serve the 4-pixel-wide paths, where two rows are packed into one
 * register pair (indices >= 16 select bytes from the second source register).
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

/*
 * Horizontal 8-tap filter for four 4-wide rows packed into _src0.._src3.
 * The masks gather overlapping pixel pairs, __lsx_vdp2_h_b /
 * __lsx_vdp2add_h_b accumulate signed byte dot-products per tap pair, and
 * the two partial sums are combined with a saturating add into _out0/_out1
 * (16-bit intermediate, not yet rounded or packed).
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3,                \
                                   _mask0, _mask1, _mask2, _mask3,            \
                                   _filter0, _filter1, _filter2, _filter3,    \
                                   _out0, _out1)                              \
{                                                                             \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;           \
    __m128i _reg0, _reg1, _reg2, _reg3;                                       \
                                                                              \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,      \
              _tmp0, _tmp1);                                                  \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1);\
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,      \
              _tmp2, _tmp3);                                                  \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,        \
              _filter1, _reg0, _reg1);                                        \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,      \
              _tmp4, _tmp5);                                                  \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3);\
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,      \
              _tmp6, _tmp7);                                                  \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,        \
              _filter3, _reg2, _reg3);                                        \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);       \
}

/*
 * Same as HORIZ_8TAP_4WID_4VECS_FILT but for four 8-wide rows, each held in
 * its own register (self-shuffle _srcN,_srcN).  Produces four 16-bit result
 * vectors _out0.._out3.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3,                \
                                   _mask0, _mask1, _mask2, _mask3,            \
                                   _filter0, _filter1, _filter2, _filter3,    \
                                   _out0, _out1, _out2, _out3)                \
{                                                                             \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;           \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;           \
                                                                              \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, _src2,\
              _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,        \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);         \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, _src2,\
              _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,        \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);         \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, _src2,\
              _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,        \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0,\
              _reg1, _reg2, _reg3);                                           \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, _src2,\
              _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,        \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4,\
              _reg5, _reg6, _reg7);                                           \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
              _reg7, _out0, _out1, _out2, _out3);                             \
}

/*
 * Statement-expression: 8-tap dot-product of four pre-interleaved byte
 * registers against four 2-tap filter pairs; the two halves of the sum are
 * merged with a saturating 16-bit add and the result vector is yielded.
 */
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3,                       \
                            _filter0, _filter1, _filter2, _filter3)           \
( {                                                                           \
    __m128i _vec0, _vec1;                                                     \
                                                                              \
    _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);                                  \
    _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);                        \
    _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);                                  \
    _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);                        \
    _vec0 = __lsx_vsadd_h(_vec0, _vec1);                                      \
                                                                              \
    _vec0;                                                                    \
} )

/*
 * Statement-expression: horizontal 8-tap filter of one row pair, then
 * round-shift by 7 (the filter scale) and saturate to 8-bit range,
 * yielding 16-bit lanes ready for the vertical pass of the HV filters.
 */
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3,         \
                        _filt_h0, _filt_h1, _filt_h2, _filt_h3)               \
( {                                                                           \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3;                                       \
    __m128i _out;                                                             \
                                                                              \
    DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,\
              _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);\
    _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,\
                               _filt_h2, _filt_h3);                           \
    _out = __lsx_vsrari_h(_out, 7);                                           \
    _out = __lsx_vsat_h(_out, 7);                                             \
                                                                              \
    _out;                                                                     \
} )

/*
 * Load four vectors at successive strides.  NOTE: advances _src by
 * 3 * _stride as a side effect.
 */
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3)                   \
{                                                                             \
    _src0 = __lsx_vld(_src, 0);                                               \
    _src += _stride;                                                          \
    _src1 = __lsx_vld(_src, 0);                                               \
    _src += _stride;                                                          \
    _src2 = __lsx_vld(_src, 0);                                               \
    _src += _stride;                                                          \
    _src3 = __lsx_vld(_src, 0);                                               \
}

/*
 * 4x4 horizontal 8-tap filter.  Source pixels are XORed with 128 to move
 * them into signed range for the signed dot-product intrinsics; the bias is
 * removed after the final saturating pack.
 */
static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out, out0, out1;

    /* offset 16: the 4-wide shuffle table */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;   /* back up to the first tap of the 8-tap window */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    LSX_LD_4(src,
             src_stride, src0, src1, src2, src3);
    /* bias pixels to signed range for the signed dot-product intrinsics */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* round-shift by 7, saturate-pack to bytes, then undo the +128 bias */
    out = __lsx_vssrarni_b_h(out1, out0, 7);
    out = __lsx_vxori_b(out, 128);
    /* one 32-bit element per output row */
    __lsx_vstelm_w(out, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 3);
}

/*
 * 4x8 horizontal 8-tap filter: two 4x4 passes over eight rows, stores one
 * 32-bit element per row.
 */
static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t *_src = (uint8_t*)src - 3;  /* first tap of the window */

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* rows 0..3 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* rows 4..7 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 3);
}

/*
 * 4-wide horizontal dispatcher.  NOTE(review): only heights 4 and 8 are
 * handled; other heights fall through without writing — presumably the VP9
 * callers only request these sizes for 4-wide blocks.
 */
static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height == 4) {
        common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
    } else if (height == 8) {
        common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
    }
}

/*
 * 8x4 horizontal 8-tap filter; stores one 64-bit element per row.
 */
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    /* offset 0: the 8-wide shuffle table */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
              mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
}

/*
 * 8-wide horizontal 8-tap filter for heights that are a multiple of 4;
 * processes four rows per iteration.
 */
static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t* _src = (uint8_t*)src - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst +=
               dst_stride;
    }
}

/* 8-wide horizontal dispatcher: special-case height 4, loop otherwise. */
static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height == 4) {
        common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
    } else {
        common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride,
                                 filter, height);
    }
}

/*
 * 16-wide horizontal 8-tap filter: two rows per iteration, each row split
 * into two 8-wide halves loaded at byte offsets 0 and 8.
 */
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;
    int32_t stride = src_stride << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        const uint8_t* _src = src + src_stride;  /* second row of the pair */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
        DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;
        src += stride;
    }
}

/*
 * 32-wide horizontal 8-tap filter: two rows per iteration.  The middle
 * 16-byte window (bytes 8..23) is synthesized from the 0 and 16 loads with
 * a cross-register shuffle instead of a third unaligned load.
 */
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* selects bytes 8..15 of the low register then 0..7 of the high one */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* second row of the pair */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;

        dst += dst_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        dst += dst_stride;
    }
}

/*
 * 64-wide horizontal 8-tap filter: one row per iteration, processed as two
 * 32-byte halves (offsets 0..31 and 32..63) with the same shuffle trick as
 * the 32-wide path.
 */
static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt = height;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* left 32 bytes */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* right 32 bytes */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                  mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}

/*
 * 4-wide vertical 8-tap filter.  Rows are byte-interleaved in pairs so each
 * dot-product covers two taps; a register window of 7 rows is kept and
 * rotated by 4 rows per iteration.
 */
static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    /* start 3 rows above the block: taps reach 3 rows back */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* prologue: rows -3..3 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* interleave adjacent rows; vilvl_d packs two 4-wide pairs per register */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
              tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* bias to signed range for the signed dot-product intrinsics */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
                                   filter2, filter3);
        /* round-shift by 7, saturate-pack, undo the +128 bias */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* slide the 8-row window down by four rows */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}

/*
 * 8-wide vertical 8-tap filter: four rows per iteration, low-half
 * interleaves only (8 pixels fit in the low 8 bytes of each register).
 */
static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* prologue: load and interleave the first 7 rows */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* even rows use reg0/1/2, odd rows reg3/4/5 (offset-by-one pairs) */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
                                   filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
                                   filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* rotate the interleaved row window by four rows */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}

/*
 * 16-wide vertical 8-tap filter: like the 8-wide path but both the low
 * (vilvl) and high (vilvh) byte interleaves are filtered, covering all
 * 16 pixels of each row.
 */
static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* bias the 7 prologue rows to signed range */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    /* low-half interleaves (pixels 0..7) -> reg0..reg5 */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    /* high-half interleaves (pixels 8..15) -> reg6..reg11 */
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src4, src5, src7, src8);
        /* rows 0 and 1 of this group of four */
        tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
        /* rows 2 and 3 */
        tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* rotate both interleaved row windows by four rows */
        reg0 = reg2;
        reg1 = src0;
        reg2 = src2;
        reg3 = reg5;
        reg4 = src1;
        reg5 = src3;
        reg6 = reg8;
        reg7 = src4;
        reg8 = src7;
        reg9 = reg11;
        reg10 = src5;
        reg11 = src8;
        src6 = src10;
    }
}

/*
 * Vertical 8-tap filter for widths that are multiples of 16: iterates the
 * 16-wide kernel across `width` in 16-pixel columns.
 */
static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t cnt = width >> 4;  /* number of 16-pixel columns */
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {
        uint32_t loop_cnt = height >> 2;

        src_tmp = _src;
        dst_tmp = dst;

        /* prologue for this column: rows -3..3 */
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;

        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 =
                   FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_tmp, dst_stride2);
            __lsx_vstx(tmp1, dst_tmp, dst_stride3);
            dst_tmp += dst_stride4;

            /* rotate both interleaved row windows by four rows */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* next 16-pixel column */
        _src += 16;
        dst += 16;
    }
}

/* 32-wide vertical 8-tap filter: two 16-pixel columns. */
static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
}

/* 64-wide vertical 8-tap filter: four 16-pixel columns. */
static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride,
                              filter, height, 64);
}

/*
 * 4-wide 2-D (horizontal then vertical) 8-tap filter.  The horizontal pass
 * (HORIZ_8TAP_FILT: already rounded and saturated to 8-bit range) is applied
 * to 7 prologue rows; each loop iteration horizontally filters 4 new rows
 * and runs the vertical 8-tap over the packed intermediate registers.
 */
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* selects bytes 8..15 of the low register then 0..7 of the high one */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* back up 3 rows (vertical taps) and 3 columns (horizontal taps) */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
              filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* prologue: horizontally filter rows -3..3 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    /* build odd-row vectors, then pack even/odd rows for the vertical pass */
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* carry the last horizontal result and packed pairs forward */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}

/*
 * 8-wide 2-D (horizontal then vertical) 8-tap filter.
 * (Definition continues beyond this chunk.)
 */
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src,
0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Bias unsigned pixels to signed range for the signed dot-products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass over the 7 context rows. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pair consecutive filtered rows byte-wise for the vertical taps. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* For each of the four new rows: horizontal filter, pack with the
         * previous row, then 8-tap vertical filter over the history. */
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round, unbias, and store four 8-byte rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the packed row history down by four rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}

/* 2-D 8-tap filter, 16 columns: two 8-wide passes. */
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);
        src += 8;
        dst += 8;
    }
}

/* 2-D 8-tap filter, 32 columns: four 8-wide passes. */
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t
dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);
        src += 8;
        dst += 8;
    }
}

/* 2-D 8-tap filter, 64 columns: eight 8-wide passes. */
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 8; multiple8_cnt--;) {
        common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                                 filter_vert, height);
        src += 8;
        dst += 8;
    }
}

/* Plain copy of an 8-wide block, four rows per iteration. */
static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    __m128i src0, src1, src2, src3;

    for (;cnt--;) {
        src0 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        src1 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        src2 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        src3 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        __lsx_vstelm_d(src0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(src1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(src2, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(src3, dst, 0, 0);
        dst += dst_stride;
    }
}

/* Plain copy of a 16-wide block, four rows per iteration. */
static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt = height >> 2;
    __m128i src0, src1, src2, src3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src;

    for (;cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        __lsx_vst(src0, dst, 0);
        __lsx_vstx(src1, dst, dst_stride);
        __lsx_vstx(src2, dst, dst_stride2);
        __lsx_vstx(src3, dst, dst_stride3);
        dst += dst_stride4;
    }
}

/* Plain copy of a 32-wide block: two 16-byte columns, four rows per pass. */
static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *dst_tmp1 = dst;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp2 = dst_tmp1 + 16;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;

    for (;cnt--;) {
        /* Left 16-byte column. */
        src0 = __lsx_vld(src_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp1, src_stride3);
        src_tmp1 += src_stride4;

        /* Right 16-byte column. */
        src4 = __lsx_vld(src_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
                  src5, src6);
        src7 = __lsx_vldx(src_tmp2, src_stride3);
        src_tmp2 += src_stride4;

        __lsx_vst(src0, dst_tmp1, 0);
        __lsx_vstx(src1, dst_tmp1, dst_stride);
        __lsx_vstx(src2, dst_tmp1, dst_stride2);
        __lsx_vstx(src3, dst_tmp1, dst_stride3);
        dst_tmp1 += dst_stride4;
        __lsx_vst(src4, dst_tmp2, 0);
        __lsx_vstx(src5, dst_tmp2, dst_stride);
        __lsx_vstx(src6, dst_tmp2, dst_stride2);
        __lsx_vstx(src7, dst_tmp2, dst_stride3);
        dst_tmp2 += dst_stride4;
    }
}

/* Plain copy of a 64-wide block, four rows per iteration. */
static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;

    for (;cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src0, src1, src2, src3);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src4, src5, src6, src7);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src8, src9, src10, src11);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src12, src13, src14, src15);
        src += src_stride;
        __lsx_vst(src0, dst, 0);
        __lsx_vst(src1, dst, 16);
        __lsx_vst(src2, dst, 32);
        __lsx_vst(src3, dst, 48);
        dst += dst_stride;
        __lsx_vst(src4, dst, 0);
        __lsx_vst(src5, dst, 16);
        __lsx_vst(src6, dst, 32);
        __lsx_vst(src7, dst, 48);
        dst += dst_stride;
        __lsx_vst(src8, dst, 0);
        __lsx_vst(src9, dst, 16);
        __lsx_vst(src10, dst, 32);
        __lsx_vst(src11, dst, 48);
        dst += dst_stride;
        __lsx_vst(src12, dst, 0);
        __lsx_vst(src13, dst, 16);
        __lsx_vst(src14, dst, 32);
        __lsx_vst(src15, dst, 48);
        dst += dst_stride;
    }
}

/* 8-tap horizontal filter + rounding average with existing dst, 4x4 block. */
static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1;
    __m128i dst0, dst1, dst2, dst3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    /* 8-tap support: start 3 columns left of the block. */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter,
0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Bias pixels to signed range for the signed dot-products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    /* Gather the four 4-byte destination rows into one vector. */
    dst0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst0 = __lsx_vilvl_w(dst1, dst0);
    dst1 = __lsx_vilvl_w(dst3, dst2);
    dst0 = __lsx_vilvl_d(dst1, dst0);
    /* Round, unbias, average with dst, and store four 4-byte rows. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    dst0 = __lsx_vavgr_bu(tmp0, dst0);
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
}

/* 8-tap horizontal filter + rounding average with existing dst, 4x8 block. */
static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    /* 8-tap support: start 3 columns left of the block. */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    /* Pack destination rows 0-3 into dst0... */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst0 = __lsx_vilvl_d(tmp1, tmp0);

    /* ...and rows 4-7 into dst1. */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst1 = __lsx_vilvl_d(tmp1, tmp0);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp2, tmp3);
    /* Round, repack, unbias, average with dst, and store eight 4-byte rows. */
    DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
              tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 3);
}

/* Dispatch the 4-wide filter+average by height (4 or 8 rows). */
static void
common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    if (height == 4) {
        common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
    } else if (height == 8) {
        common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
    }
}

/* 8-tap horizontal filter + rounding average with dst, 8 columns,
 * four rows per iteration. */
static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    int32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride2 + src_stride;
    int32_t src_stride4 = src_stride2 << 1;
    /* 8-tap support: start 3 columns left of the block. */
    uint8_t *_src = (uint8_t*)src - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Bias pixels to signed range for the signed dot-products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3,filter0, filter1, filter2, filter3, tmp0, tmp1, tmp2, tmp3);
        /* Gather the four 8-byte destination rows. */
        dst0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
        /* Round, unbias, average with dst, and store four 8-byte rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}

/* 8-tap horizontal filter + rounding average with dst, 16 columns,
 * two rows per iteration. */
static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height >> 1;
    int32_t dst_stride2 = dst_stride << 1;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* 8-tap support: start 3 columns left of the block. */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
        src += src_stride;
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
        src += src_stride;
        dst0 = __lsx_vld(dst_tmp, 0);
        dst1 = __lsx_vldx(dst_tmp, dst_stride);
        dst_tmp += dst_stride2;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        /* Build the four shuffled tap windows per row, then accumulate the
         * 8-tap dot products in 16-bit lanes. */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
                  mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
                  mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
                  mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
                  mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
                  filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, tmp11,
                  filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
                  tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, tmp10,
                  tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        /* Round, unbias, average with dst, and store two 16-byte rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
        DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride2;
    }
}

/* 8-tap horizontal filter + rounding average with dst, 32 columns,
 * one row per iteration. */
static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    uint32_t loop_cnt = height;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 =
__lsx_vld(mc_filt_mask_arr, 0); 1438 src -= 3; 1439 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 1440 mask3 = __lsx_vaddi_bu(mask0, 6); 1441 DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1442 filter0, filter1, filter2, filter3); 1443 1444 for (;loop_cnt--;) { 1445 DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); 1446 src3 = __lsx_vld(src, 24); 1447 src1 = __lsx_vshuf_b(src2, src0, shuff); 1448 src += src_stride; 1449 DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); 1450 dst_tmp += dst_stride; 1451 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1452 src0, src1, src2, src3); 1453 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, 1454 src2, mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); 1455 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, 1456 src2, mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); 1457 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, 1458 src2, mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); 1459 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, 1460 src2, mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); 1461 DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, 1462 tmp3, filter0, tmp0, tmp1, tmp2, tmp3); 1463 DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, 1464 tmp11, filter2, tmp8, tmp9, tmp10, tmp11); 1465 DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, 1466 tmp2, tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); 1467 DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, 1468 tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7); 1469 DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, 1470 tmp0, tmp1, tmp2, tmp3); 1471 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); 1472 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 
1473 DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); 1474 __lsx_vst(dst0, dst, 0); 1475 __lsx_vst(dst1, dst, 16); 1476 dst += dst_stride; 1477 } 1478} 1479 1480static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, 1481 int32_t src_stride, 1482 uint8_t *dst, int32_t dst_stride, 1483 const int8_t *filter, 1484 int32_t height) 1485{ 1486 int32_t loop_cnt = height; 1487 __m128i src0, src1, src2, src3; 1488 __m128i filter0, filter1, filter2, filter3; 1489 __m128i mask0, mask1, mask2, mask3; 1490 __m128i out0, out1, out2, out3, dst0, dst1; 1491 __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; 1492 1493 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 1494 src -= 3; 1495 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 1496 mask3 = __lsx_vaddi_bu(mask0, 6); 1497 DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1498 filter0, filter1, filter2, filter3); 1499 1500 for (;loop_cnt--;) { 1501 DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); 1502 src3 = __lsx_vld(src, 24); 1503 src1 = __lsx_vshuf_b(src2, src0, shuff); 1504 DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); 1505 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1506 src0, src1, src2, src3); 1507 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 1508 mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3); 1509 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 1510 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 1511 DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); 1512 __lsx_vst(out0, dst, 0); 1513 __lsx_vst(out1, dst, 16); 1514 1515 DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); 1516 src3 = __lsx_vld(src, 56); 1517 src1 = __lsx_vshuf_b(src2, src0, shuff); 1518 DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); 1519 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1520 src0, src1, src2, src3); 1521 
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        /* Round, unbias, average with dst, and store the second half. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}

/* 8-tap vertical filter + rounding average with dst, 4 columns,
 * four rows per iteration. */
static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* 8-tap support: start 3 rows above the block. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Load the 7 context rows and interleave them pair-wise. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Bias pixels to signed range for the signed dot-products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Gather the four 4-byte destination rows into one vector. */
        src0 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
        src0 = __lsx_vilvl_d(src1, src0);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0,
                                   filter1, filter2, filter3);
        /* Round, unbias, average with dst, and store four 4-byte rows. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src0);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
        /* Slide the 8-tap row window down by four rows. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}

/* 8-tap vertical filter + rounding average with dst, 8 columns,
 * four rows per iteration. */
static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* 8-tap support: start 3 rows above the block. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Load the 7 context rows, bias to signed range, and interleave. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2,
              src1, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Gather the four 8-byte destination rows. */
        src0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0,
                                   filter1, filter2, filter3);
        out2 =
FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, 1668 filter1, filter2, filter3); 1669 out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, 1670 filter1, filter2, filter3); 1671 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 1672 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 1673 DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); 1674 __lsx_vstelm_d(out0, dst, 0, 0); 1675 dst += dst_stride; 1676 __lsx_vstelm_d(out0, dst, 0, 1); 1677 dst += dst_stride; 1678 __lsx_vstelm_d(out1, dst, 0, 0); 1679 dst += dst_stride; 1680 __lsx_vstelm_d(out1, dst, 0, 1); 1681 dst += dst_stride; 1682 1683 reg0 = reg2; 1684 reg1 = tmp0; 1685 reg2 = tmp2; 1686 reg3 = reg5; 1687 reg4 = tmp1; 1688 reg5 = tmp3; 1689 src6 = src10; 1690 } 1691} 1692 1693static void common_vt_8t_and_aver_dst_16w_mult_lsx(const uint8_t *src, 1694 int32_t src_stride, 1695 uint8_t *dst, 1696 int32_t dst_stride, 1697 const int8_t *filter, 1698 int32_t height, 1699 int32_t width) 1700{ 1701 uint8_t *src_tmp; 1702 uint32_t cnt = width >> 4; 1703 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 1704 __m128i filter0, filter1, filter2, filter3; 1705 __m128i reg0, reg1, reg2, reg3, reg4, reg5; 1706 __m128i reg6, reg7, reg8, reg9, reg10, reg11; 1707 __m128i tmp0, tmp1, tmp2, tmp3; 1708 int32_t src_stride2 = src_stride << 1; 1709 int32_t src_stride3 = src_stride + src_stride2; 1710 int32_t src_stride4 = src_stride2 << 1; 1711 int32_t dst_stride2 = dst_stride << 1; 1712 int32_t dst_stride3 = dst_stride2 + dst_stride; 1713 int32_t dst_stride4 = dst_stride2 << 1; 1714 uint8_t *_src = (uint8_t*)src - src_stride3; 1715 1716 DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, 1717 filter0, filter1, filter2, filter3); 1718 for (;cnt--;) { 1719 uint32_t loop_cnt = height >> 2; 1720 uint8_t *dst_reg = dst; 1721 1722 src_tmp = _src; 1723 src0 = __lsx_vld(src_tmp, 0); 1724 DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, 
src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;
        /* xor with 128 flips the sign bit: unsigned pixels become signed
         * bytes so the signed dot-product intrinsics can be used below. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        /* Interleave consecutive rows (low halves -> reg0..reg5, high
         * halves -> reg6..reg11) to form the 7-row vertical context. */
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        /* Produce 4 output rows per iteration. */
        for (;loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Rows 0 and 1: 8-tap vertical filter on low/high byte lanes. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            /* Round-shift by 7 (filter precision), saturate to bytes,
             * undo the 128 bias, then average with the existing dst. */
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vld(dst_reg, 0);
            tmp3 = __lsx_vldx(dst_reg, dst_stride);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vst(tmp0, dst_reg, 0);
            __lsx_vstx(tmp1, dst_reg, dst_stride);
            /* Rows 2 and 3. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vldx(dst_reg, dst_stride2);
            tmp3 = __lsx_vldx(dst_reg, dst_stride3);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_reg, dst_stride2);
            __lsx_vstx(tmp1, dst_reg, dst_stride3);
            dst_reg += dst_stride4;

            /* Slide the 7-row window down by 4 rows for the next pass. */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-pixel-wide column. */
        _src += 16;
        dst += 16;
    }
}

/* Vertical 8-tap filter + averaging with dst, width 16: one column pass. */
static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 16);
}

/* Vertical 8-tap filter + averaging with dst, width 32: two column passes. */
static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 32);
}

/* Vertical 8-tap filter + averaging with dst, width 64: four column passes. */
static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 64);
}

/*
 * 2-D 8-tap filter (horizontal then vertical) + averaging with dst, width 4.
 * Source starts 3 columns left and 3 rows above the output position so the
 * 8-tap window is centered; height is processed 4 rows per loop iteration.
 */
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* Shuffle pattern moving the high half of one vector next to the low
     * half of the next (bytes 8..15 then 16..23 of the concatenated pair). */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 4-width variant of the shuffle masks (second row of the table). */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prime with the first 7 rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    /* Bias to signed range for the signed dot-product intrinsics. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass on the priming rows (two rows per call at width 4). */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Gather 4 destination rows (4 bytes each) for the final average. */
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src4 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src5 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
        src2 = __lsx_vilvl_d(src3, src2);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on the new rows, then vertical 8-tap pass. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/narrow, remove bias, average with dst, store 4x4 bytes. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src2);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Rotate the vertical-context registers for the next 4 rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}

/*
 * 2-D 8-tap filter (horizontal then vertical) + averaging with dst, width 8.
 * Same structure as the 4-wide variant, but one horizontal-filter call per
 * row and 8-byte stores.
 */
static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 8-width shuffle masks (first row of the table). */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prime with the first 7 rows, biased to signed range. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal 8-tap pass on each priming row. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    /* Vertical filter taps and the interleaved row pairs they apply to. */
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    /* 4 output rows per iteration. */
    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Alternate horizontal filtering of a new row with the vertical
         * 8-tap accumulation that consumes it. */
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                                filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/narrow by 7, undo the 128 bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        /* Average against 4 existing dst rows (8 bytes each). */
        src5 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src7 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src8 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src9 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the vertical context down by 4 rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}

/* 2-D 8-tap + average, width 16: two side-by-side 8-wide passes. */
static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;

dst += 8;
    }
}

/* 2-D 8-tap + average, width 32: four side-by-side 8-wide passes. */
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;
        dst += 8;
    }
}

/* 2-D 8-tap + average, width 64: eight side-by-side 8-wide passes. */
static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 8; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

        src += 8;
        dst += 8;
    }
}

/* Rounded average of src into dst, 8 pixels wide, 4 rows per iteration. */
static void avg_width8_lsx(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, dst0, dst1;
    __m128i tmp0, tmp1, tmp2, tmp3;

    for (;cnt--;) {
        /* Pack two 8-byte source rows per vector. */
        tmp0 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp1 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp2 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp3 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, src0, src1);
        tmp0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}

/* Rounded average of src into dst, 16 pixels wide, 4 rows per iteration. */
static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    __m128i src0, src1, src2, src3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t* _src = (uint8_t*)src;

    for (;cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        dst0 = __lsx_vld(dst, 0);
        DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2,
                  dst1, dst2);
        dst3 = __lsx_vldx(dst, dst_stride3);
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        __lsx_vstx(dst2, dst, dst_stride2);
        __lsx_vstx(dst3, dst, dst_stride3);
        dst += dst_stride4;
    }
}

/* Rounded average of src into dst, 32 pixels wide (two 16-byte columns),
 * 4 rows per iteration. */
static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp1, *dst_tmp2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;

    dst_tmp1 = dst;
    dst_tmp2 = dst + 16;
    for (;cnt--;) {
        /* Left 16-byte column, rows 0..3. */
        src0 = __lsx_vld(src_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
                  src2, src4);
        src6 = __lsx_vldx(src_tmp1, src_stride3);
        src_tmp1 += src_stride4;

        /* Right 16-byte column, rows 0..3. */
        src1 = __lsx_vld(src_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
                  src3, src5);
        src7 = __lsx_vldx(src_tmp2, src_stride3);
        src_tmp2 += src_stride4;

        dst0 = __lsx_vld(dst_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2,
                  dst2, dst4);
        dst6 = __lsx_vldx(dst_tmp1, dst_stride3);
        dst1 = __lsx_vld(dst_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp2, dst_stride, dst_tmp2, dst_stride2,
                  dst3, dst5);
        dst7 = __lsx_vldx(dst_tmp2, dst_stride3);

        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        __lsx_vst(dst0, dst_tmp1, 0);
        __lsx_vstx(dst2, dst_tmp1, dst_stride);
        __lsx_vstx(dst4, dst_tmp1, dst_stride2);
        __lsx_vstx(dst6, dst_tmp1, dst_stride3);
        dst_tmp1 += dst_stride4;
        __lsx_vst(dst1, dst_tmp2, 0);
        __lsx_vstx(dst3, dst_tmp2, dst_stride);
        __lsx_vstx(dst5, dst_tmp2, dst_stride2);
        __lsx_vstx(dst7, dst_tmp2, dst_stride3);
        dst_tmp2 += dst_stride4;
    }
}

/* Rounded average of src into dst, 64 pixels wide (four 16-byte vectors
 * per row), 4 rows per iteration. */
static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    for (;cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src0, src1, src2, src3);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src4, src5, src6, src7);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src8, src9, src10, src11);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src12, src13, src14, src15);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst0, dst1, dst2, dst3);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst4, dst5, dst6, dst7);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst8, dst9, dst10, dst11);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst12, dst13, dst14, dst15);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10,
                  dst10, src11, dst11, dst8, dst9, dst10, dst11);
        DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14,
                  dst14, src15, dst15, dst12, dst13, dst14, dst15);
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        __lsx_vst(dst2, dst, 32);
        __lsx_vst(dst3, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst4, dst, 0);
        __lsx_vst(dst5, dst, 16);
        __lsx_vst(dst6, dst, 32);
        __lsx_vst(dst7, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst8, dst, 0);
        __lsx_vst(dst9, dst, 16);
        __lsx_vst(dst10, dst, 32);
        __lsx_vst(dst11, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst12, dst, 0);
        __lsx_vst(dst13, dst, 16);
        __lsx_vst(dst14, dst, 32);
        __lsx_vst(dst15, dst, 48);
        dst += dst_stride;
    }
}

/* VP9 8-tap sub-pixel filter coefficients, indexed by filter type and
 * sub-pixel position (mx/my - 1); 8 signed taps per entry. */
static const int8_t vp9_subpel_filters_lsx[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
         {0, 1, -5, 126, 8, -3, 1, 0},
         {-1, 3, -10, 122, 18, -6, 2, 0},
         {-1, 4, -13, 118, 27, -9, 3, -1},
         {-1, 4, -16, 112, 37, -11, 4, -1},
         {-1, 5, -18, 105, 48, -14, 4, -1},
         {-1, 5, -19, 97, 58, -16, 5, -1},
         {-1, 6, -19, 88, 68, -18, 5, -1},
         {-1, 6, -19, 78, 78, -19, 6, -1},
         {-1, 5, -18, 68, 88, -19, 6, -1},
         {-1, 5, -16, 58, 97, -19, 5, -1},
         {-1, 4, -14, 48, 105, -18, 5, -1},
         {-1, 4, -11, 37, 112, -16, 4, -1},
         {-1, 3, -9, 27, 118, -13, 4, -1},
         {0, 2, -6, 18, 122, -10, 3, -1},
         {0, 1, -3, 8, 126, -5, 1, 0},
    }, [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    }, [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    }
};

/*
 * Generates the public put/avg entry points for one block size and filter
 * type: h (horizontal only), v (vertical only) and hv (2-D) variants.
 * mx/my select the sub-pixel filter phase (1..15 -> table index mx-1/my-1).
 */
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)                     \
void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];            \
                                                                              \
    common_hz_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);    \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];            \
                                                                              \
    common_vt_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);    \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                              \
    common_hv_8ht_8vt_##SIZE##w_lsx(src, srcstride, dst, dststride, hfilter,  \
                                    vfilter, h);                              \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];            \
                                                                              \
    common_hz_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,              \
                                            dststride, filter, h);            \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];            \
                                                                              \
    common_vt_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, dststride,   \
                                            filter, h);                       \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                              \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,         \
                                                 dststride, hfilter,          \
                                                 vfilter, h);                 \
}

/* Generates the full-pel copy and average entry points for one size;
 * mx/my are unused for full-pel positions. */
#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)                            \
void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,          \
                         const uint8_t *src, ptrdiff_t srcstride,    \
                         int h, int mx, int my)                      \
{                                                                    \
                                                                     \
    copy_width##SIZE##_lsx(src, srcstride, dst, dststride, h);       \
}                                                                    \
void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,           \
                        const uint8_t *src, ptrdiff_t srcstride,     \
                        int h, int mx, int my)                       \
{                                                                    \
                                                                     \
    avg_width##SIZE##_lsx(src, srcstride, dst, dststride, h);        \
}

VP9_8TAP_LOONGARCH_LSX_FUNC(64, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, regular, FILTER_8TAP_REGULAR);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, regular, FILTER_8TAP_REGULAR);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, sharp, FILTER_8TAP_SHARP);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, sharp, FILTER_8TAP_SHARP);

VP9_8TAP_LOONGARCH_LSX_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
VP9_8TAP_LOONGARCH_LSX_FUNC(4, smooth, FILTER_8TAP_SMOOTH);

VP9_COPY_LOONGARCH_LSX_FUNC(64);
VP9_COPY_LOONGARCH_LSX_FUNC(32);
VP9_COPY_LOONGARCH_LSX_FUNC(16);
VP9_COPY_LOONGARCH_LSX_FUNC(8);

/* The generator macros are purely local; drop them after instantiation. */
#undef VP9_8TAP_LOONGARCH_LSX_FUNC
#undef VP9_COPY_LOONGARCH_LSX_FUNC