/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp8dsp.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "vp8dsp_loongarch.h"

/* Byte-shuffle control vectors for __lsx_vshuf_b: each index pair (i, i+1)
 * gathers the two adjacent source bytes that feed one 16-bit dot-product
 * lane of the horizontal filter. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

/* VP8 subpel filter taps, indexed by (mx - 1) or (my - 1).  Even rows are
 * the 4-tap bilinear-style filters (taps in slots 0-3); odd rows are the
 * 6-tap filters (taps in slots 0-5).  Trailing zeros pad each row to 8 so
 * pairs of taps can be loaded with a single __lsx_vldrepl_h. */
static const int8_t subpel_filters_lsx[7][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
};

/* Three-term multiply-accumulate: signed-byte dot products of in0..in2 with
 * tap pairs coeff0..coeff2, summed into 16-bit lanes.  Yields one 6-tap
 * filter result vector (statement-expression, GCC/Clang extension). */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)   \
( {                                                           \
    __m128i out0_m;                                           \
                                                              \
    out0_m = __lsx_vdp2_h_b(in0, coeff0);                     \
    out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);          \
    out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);          \
                                                              \
    out0_m;                                                   \
} )
54#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \ 55 out0, out1, out2) \ 56{ \ 57 DUP2_ARG3(__lsx_vshuf_b, in1, in0, mask0, in3, in2, mask1, \ 58 out0, out1); \ 59 out2 = __lsx_vshuf_b(in5, in4, mask2); \ 60} 61 62#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ 63 filt_h0, filt_h1, filt_h2) \ 64( { \ 65 __m128i vec0_m, vec1_m, vec2_m; \ 66 __m128i hz_out_m; \ 67 \ 68 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 69 vec0_m, vec1_m, vec2_m); \ 70 hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ 71 filt_h0, filt_h1, filt_h2); \ 72 \ 73 hz_out_m = __lsx_vsrari_h(hz_out_m, 7); \ 74 hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ 75 \ 76 hz_out_m; \ 77} ) 78 79#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 80 mask0, mask1, mask2, \ 81 filt0, filt1, filt2, \ 82 out0, out1, out2, out3) \ 83{ \ 84 __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 85 \ 86 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, \ 87 mask0, src3, src3, mask0, vec0_m, vec1_m, vec2_m, vec3_m); \ 88 DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ 89 vec3_m, filt0, out0, out1, out2, out3); \ 90 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, \ 91 mask1, src3, src3, mask1, vec0_m, vec1_m, vec2_m, vec3_m); \ 92 DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, \ 93 mask2, src3, src3, mask2, vec4_m, vec5_m, vec6_m, vec7_m); \ 94 DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ 95 out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, out3); \ 96 DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ 97 out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, out3); \ 98} 99 100#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ 101( { \ 102 __m128i tmp0; \ 103 \ 104 tmp0 = __lsx_vdp2_h_b(vec0, filt0); \ 105 tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \ 106 \ 107 
tmp0; \ 108} ) 109 110#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 111( { \ 112 __m128i vec0_m, vec1_m; \ 113 __m128i hz_out_m; \ 114 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, \ 115 vec0_m, vec1_m); \ 116 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 117 \ 118 hz_out_m = __lsx_vsrari_h(hz_out_m, 7); \ 119 hz_out_m = __lsx_vsat_h(hz_out_m, 7); \ 120 \ 121 hz_out_m; \ 122} ) 123 124void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 125 uint8_t *src, ptrdiff_t src_stride, 126 int height, int mx, int my) 127{ 128 uint32_t loop_cnt; 129 const int8_t *filter = subpel_filters_lsx[mx - 1]; 130 __m128i src0, src1, src2, src3, filt0, filt1, filt2; 131 __m128i mask0, mask1, mask2; 132 __m128i out0, out1, out2, out3; 133 134 ptrdiff_t src_stride2 = src_stride << 1; 135 ptrdiff_t src_stride3 = src_stride2 + src_stride; 136 ptrdiff_t src_stride4 = src_stride2 << 1; 137 138 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 139 src -= 2; 140 141 /* rearranging filter */ 142 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 143 filt2 = __lsx_vldrepl_h(filter, 4); 144 145 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 146 147 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 148 src + src_stride3, 0, src0, src1, src2, src3); 149 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 150 src0, src1, src2, src3); 151 src += src_stride4; 152 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 153 filt0, filt1, filt2, out0, out1, out2, out3); 154 155 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 156 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 157 __lsx_vstelm_d(out0, dst, 0, 0); 158 dst += dst_stride; 159 __lsx_vstelm_d(out0, dst, 0, 1); 160 dst += dst_stride; 161 __lsx_vstelm_d(out1, dst, 0, 0); 162 dst += dst_stride; 163 __lsx_vstelm_d(out1, dst, 0, 1); 164 dst += dst_stride; 165 166 for 
(loop_cnt = (height >> 2) - 1; loop_cnt--;) { 167 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 168 src + src_stride3, 0, src0, src1, src2, src3); 169 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 170 src0, src1, src2, src3); 171 src += src_stride4; 172 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 173 filt0, filt1, filt2, out0, out1, out2, out3); 174 175 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 176 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 177 178 __lsx_vstelm_d(out0, dst, 0, 0); 179 dst += dst_stride; 180 __lsx_vstelm_d(out0, dst, 0, 1); 181 dst += dst_stride; 182 __lsx_vstelm_d(out1, dst, 0, 0); 183 dst += dst_stride; 184 __lsx_vstelm_d(out1, dst, 0, 1); 185 dst += dst_stride; 186 } 187} 188 189void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 190 uint8_t *src, ptrdiff_t src_stride, 191 int height, int mx, int my) 192{ 193 uint32_t loop_cnt; 194 const int8_t *filter = subpel_filters_lsx[mx - 1]; 195 __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1; 196 __m128i filt2, mask0, mask1, mask2; 197 __m128i out0, out1, out2, out3, out4, out5, out6, out7; 198 199 ptrdiff_t src_stride2 = src_stride << 1; 200 ptrdiff_t src_stride3 = src_stride2 + src_stride; 201 ptrdiff_t src_stride4 = src_stride2 << 1; 202 203 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 204 src -= 2; 205 /* rearranging filter */ 206 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 207 filt2 = __lsx_vldrepl_h(filter, 4); 208 209 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 210 211 for (loop_cnt = (height >> 2); loop_cnt--;) { 212 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 213 0, src + src_stride3, 0, src0 ,src2, src4, src6); 214 DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2, 215 8, src + src_stride3, 8, src1, src3, src5, src7); 216 217 DUP4_ARG2(__lsx_vxori_b, src0, 128, 
src1, 128, src2, 128, src3, 128, 218 src0, src1, src2, src3); 219 DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, 220 src4, src5, src6, src7); 221 src += src_stride4; 222 223 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 224 filt0, filt1, filt2, out0, out1, out2, out3); 225 HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, 226 filt0, filt1, filt2, out4, out5, out6, out7); 227 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 228 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 229 __lsx_vst(out0, dst, 0); 230 dst += dst_stride; 231 __lsx_vst(out1, dst, 0); 232 dst += dst_stride; 233 234 DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5); 235 DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5); 236 __lsx_vst(out4, dst, 0); 237 dst += dst_stride; 238 __lsx_vst(out5, dst, 0); 239 dst += dst_stride; 240 } 241} 242 243void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 244 uint8_t *src, ptrdiff_t src_stride, 245 int height, int mx, int my) 246{ 247 uint32_t loop_cnt; 248 const int8_t *filter = subpel_filters_lsx[my - 1]; 249 __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10; 250 __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l; 251 __m128i src109_l, filt0, filt1, filt2; 252 __m128i out0_l, out1_l, out2_l, out3_l; 253 254 ptrdiff_t src_stride2 = src_stride << 1; 255 ptrdiff_t src_stride3 = src_stride2 + src_stride; 256 ptrdiff_t src_stride4 = src_stride2 << 1; 257 258 src -= src_stride2; 259 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 260 filt2 = __lsx_vldrepl_h(filter, 4); 261 262 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 263 src + src_stride3, 0, src0, src1, src2, src3); 264 src += src_stride4; 265 src4 = __lsx_vld(src, 0); 266 src += src_stride; 267 268 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 269 src0, src1, src2, 
src3); 270 src4 = __lsx_vxori_b(src4, 128); 271 272 DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, 273 src3, src10_l, src32_l, src21_l, src43_l); 274 for (loop_cnt = (height >> 2); loop_cnt--;) { 275 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 276 0, src + src_stride3, 0, src7, src8, src9, src10); 277 DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 278 128, src7, src8, src9, src10); 279 src += src_stride4; 280 281 DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, 282 src9, src76_l, src87_l, src98_l, src109_l); 283 284 out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2); 285 out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2); 286 out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2); 287 out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2); 288 289 DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7, 290 out0_l, out1_l); 291 DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l); 292 293 __lsx_vstelm_d(out0_l, dst, 0, 0); 294 dst += dst_stride; 295 __lsx_vstelm_d(out0_l, dst, 0, 1); 296 dst += dst_stride; 297 __lsx_vstelm_d(out1_l, dst, 0, 0); 298 dst += dst_stride; 299 __lsx_vstelm_d(out1_l, dst, 0, 1); 300 dst += dst_stride; 301 302 src10_l = src76_l; 303 src32_l = src98_l; 304 src21_l = src87_l; 305 src43_l = src109_l; 306 src4 = src10; 307 } 308} 309 310void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 311 uint8_t *src, ptrdiff_t src_stride, 312 int height, int mx, int my) 313{ 314 uint32_t loop_cnt; 315 const int8_t *filter = subpel_filters_lsx[my - 1]; 316 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 317 __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l, src87_l; 318 __m128i src10_h, src32_h, src54_h, src76_h, src21_h, src43_h, src65_h, src87_h; 319 __m128i filt0, filt1, filt2; 320 __m128i tmp0, tmp1, tmp2, tmp3; 321 322 ptrdiff_t 
src_stride2 = src_stride << 1; 323 ptrdiff_t src_stride3 = src_stride2 + src_stride; 324 ptrdiff_t src_stride4 = src_stride2 << 1; 325 326 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 327 filt2 = __lsx_vldrepl_h(filter, 4); 328 329 DUP4_ARG2(__lsx_vld, src - src_stride2, 0, src - src_stride, 0, src, 0, 330 src + src_stride, 0, src0, src1, src2, src3); 331 src4 = __lsx_vld(src + src_stride2, 0); 332 src += src_stride3; 333 334 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, 335 src1, src2, src3); 336 src4 = __lsx_vxori_b(src4, 128); 337 338 DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1, 339 src10_l, src32_l, src43_l, src21_l); 340 DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1, 341 src10_h, src32_h, src43_h, src21_h); 342 343 for (loop_cnt = (height >> 2); loop_cnt--;) { 344 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 345 src + src_stride3, 0, src5, src6, src7, src8); 346 src += src_stride4; 347 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, 348 src5, src6, src7, src8); 349 350 DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, 351 src54_l, src65_l, src76_l, src87_l); 352 DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, 353 src54_h, src65_h, src76_h, src87_h); 354 355 tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); 356 tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2); 357 tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2); 358 tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2); 359 360 DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); 361 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 362 __lsx_vst(tmp0, dst, 0); 363 dst += dst_stride; 364 __lsx_vst(tmp1, dst, 0); 365 dst += dst_stride; 366 367 tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2); 368 
tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2); 369 tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2); 370 tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2); 371 372 DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); 373 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 374 __lsx_vst(tmp0, dst, 0); 375 dst += dst_stride; 376 __lsx_vst(tmp1, dst, 0); 377 dst += dst_stride; 378 379 src10_l = src54_l; 380 src32_l = src76_l; 381 src21_l = src65_l; 382 src43_l = src87_l; 383 src10_h = src54_h; 384 src32_h = src76_h; 385 src21_h = src65_h; 386 src43_h = src87_h; 387 src4 = src8; 388 } 389} 390 391void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 392 uint8_t *src, ptrdiff_t src_stride, 393 int height, int mx, int my) 394{ 395 uint32_t loop_cnt; 396 const int8_t *filter_horiz = subpel_filters_lsx[mx - 1]; 397 const int8_t *filter_vert = subpel_filters_lsx[my - 1]; 398 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 399 __m128i filt_hz0, filt_hz1, filt_hz2; 400 __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2; 401 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 402 __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; 403 __m128i tmp0, tmp1, tmp2, tmp3; 404 405 ptrdiff_t src_stride2 = src_stride << 1; 406 ptrdiff_t src_stride3 = src_stride2 + src_stride; 407 ptrdiff_t src_stride4 = src_stride2 << 1; 408 409 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 410 src -= (2 + src_stride2); 411 412 /* rearranging filter */ 413 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1); 414 filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); 415 416 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 417 418 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 419 src + src_stride3, 0, src0, src1, src2, src3); 420 src += src_stride4; 421 src4 = __lsx_vld(src, 0); 422 src += 
src_stride; 423 424 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 425 src0 ,src1, src2, src3); 426 src4 = __lsx_vxori_b(src4, 128); 427 428 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 429 filt_hz1, filt_hz2); 430 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 431 filt_hz1, filt_hz2); 432 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 433 filt_hz1, filt_hz2); 434 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, 435 filt_hz1, filt_hz2); 436 hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 437 filt_hz1, filt_hz2); 438 439 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1); 440 filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); 441 442 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); 443 DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); 444 for (loop_cnt = (height >> 2); loop_cnt--;) { 445 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 446 src + src_stride3, 0, src5, src6, src7, src8); 447 src += src_stride4; 448 449 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, 450 src5, src6, src7, src8); 451 452 hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 453 filt_hz1, filt_hz2); 454 out2 = __lsx_vpackev_b(hz_out5, hz_out4); 455 tmp0 = DPADD_SH3_SH(out0, out1, out2,filt_vt0, filt_vt1, filt_vt2); 456 457 hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 458 filt_hz1, filt_hz2); 459 out5 = __lsx_vpackev_b(hz_out6, hz_out5); 460 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 461 462 hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0, 463 filt_hz1, filt_hz2); 464 465 out7 = __lsx_vpackev_b(hz_out7, hz_out6); 466 tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); 467 468 hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0, 
469 filt_hz1, filt_hz2); 470 out6 = __lsx_vpackev_b(hz_out8, hz_out7); 471 tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); 472 473 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); 474 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 475 __lsx_vstelm_d(tmp0, dst, 0, 0); 476 477 dst += dst_stride; 478 __lsx_vstelm_d(tmp0, dst, 0, 1); 479 dst += dst_stride; 480 __lsx_vstelm_d(tmp1, dst, 0, 0); 481 dst += dst_stride; 482 __lsx_vstelm_d(tmp1, dst, 0, 1); 483 dst += dst_stride; 484 485 hz_out4 = hz_out8; 486 out0 = out2; 487 out1 = out7; 488 out3 = out5; 489 out4 = out6; 490 } 491} 492 493void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 494 uint8_t *src, ptrdiff_t src_stride, 495 int height, int mx, int my) 496{ 497 int32_t multiple8_cnt; 498 499 for (multiple8_cnt = 2; multiple8_cnt--;) { 500 ff_put_vp8_epel8_h6v6_lsx(dst, dst_stride, src, src_stride, height, mx, my); 501 src += 8; 502 dst += 8; 503 } 504} 505 506void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, 507 uint8_t *src, ptrdiff_t src_stride, 508 int height, int mx, int my) 509{ 510 uint32_t loop_cnt; 511 const int8_t *filter = subpel_filters_lsx[my - 1]; 512 __m128i src0, src1, src2, src7, src8, src9, src10; 513 __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1; 514 __m128i out0, out1, out2, out3; 515 516 ptrdiff_t src_stride2 = src_stride << 1; 517 ptrdiff_t src_stride3 = src_stride2 + src_stride; 518 ptrdiff_t src_stride4 = src_stride2 << 1; 519 520 src -= src_stride; 521 522 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 523 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1); 524 src2 = __lsx_vld(src + src_stride2, 0); 525 src += src_stride3; 526 527 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); 528 src2 = __lsx_vxori_b(src2, 128); 529 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l); 530 531 for (loop_cnt = (height >> 2); 
loop_cnt--;) { 532 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 533 src + src_stride3, 0, src7, src8, src9, src10); 534 src += src_stride4; 535 536 DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, 537 src7, src8, src9, src10); 538 DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, 539 src72_l, src87_l, src98_l, src109_l); 540 541 out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1); 542 out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1); 543 out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1); 544 out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1); 545 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); 546 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); 547 548 __lsx_vstelm_d(out0, dst, 0, 0); 549 dst += dst_stride; 550 __lsx_vstelm_d(out0, dst, 0, 1); 551 dst += dst_stride; 552 __lsx_vstelm_d(out1, dst, 0, 0); 553 dst += dst_stride; 554 __lsx_vstelm_d(out1, dst, 0, 1); 555 dst += dst_stride; 556 557 src10_l = src98_l; 558 src21_l = src109_l; 559 src2 = src10; 560 } 561} 562 563void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, 564 uint8_t *src, ptrdiff_t src_stride, 565 int height, int mx, int my) 566{ 567 uint32_t loop_cnt; 568 const int8_t *filter = subpel_filters_lsx[my - 1]; 569 __m128i src0, src1, src2, src3, src4, src5, src6; 570 __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h; 571 __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1; 572 __m128i tmp0, tmp1, tmp2, tmp3; 573 574 ptrdiff_t src_stride2 = src_stride << 1; 575 ptrdiff_t src_stride3 = src_stride2 + src_stride; 576 ptrdiff_t src_stride4 = src_stride2 << 1; 577 578 src -= src_stride; 579 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); 580 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1); 581 src2 = __lsx_vld(src + src_stride2, 0); 582 src += src_stride3; 583 584 DUP2_ARG2(__lsx_vxori_b, 
src0, 128, src1, 128, src0, src1); 585 src2 = __lsx_vxori_b(src2, 128); 586 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l); 587 DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h); 588 589 for (loop_cnt = (height >> 2); loop_cnt--;) { 590 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 591 0, src + src_stride3, 0, src3, src4, src5, src6); 592 src += src_stride4; 593 594 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, 595 src3, src4, src5, src6); 596 DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, 597 src5, src32_l, src43_l, src54_l, src65_l); 598 DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, 599 src5, src32_h, src43_h, src54_h, src65_h); 600 601 tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1); 602 tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); 603 tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1); 604 tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1); 605 DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); 606 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 607 608 __lsx_vst(tmp0, dst, 0); 609 dst += dst_stride; 610 __lsx_vst(tmp1, dst, 0); 611 dst += dst_stride; 612 613 tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); 614 tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); 615 tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1); 616 tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1); 617 DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); 618 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 619 620 __lsx_vst(tmp0, dst, 0); 621 dst += dst_stride; 622 __lsx_vst(tmp1, dst, 0); 623 dst += dst_stride; 624 625 src10_l = src54_l; 626 src21_l = src65_l; 627 src10_h = src54_h; 628 src21_h = src65_h; 629 src2 = src6; 630 } 631} 632 633void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, 634 uint8_t *src, 
ptrdiff_t src_stride, 635 int height, int mx, int my) 636{ 637 uint32_t loop_cnt; 638 const int8_t *filter_horiz = subpel_filters_lsx[mx - 1]; 639 const int8_t *filter_vert = subpel_filters_lsx[my - 1]; 640 __m128i src0, src1, src2, src3, src4, src5, src6; 641 __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; 642 __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; 643 __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; 644 645 ptrdiff_t src_stride2 = src_stride << 1; 646 ptrdiff_t src_stride3 = src_stride2 + src_stride; 647 ptrdiff_t src_stride4 = src_stride2 << 1; 648 649 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 650 src -= (2 + src_stride); 651 652 /* rearranging filter */ 653 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1); 654 filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); 655 656 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); 657 658 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1); 659 src2 = __lsx_vld(src + src_stride2, 0); 660 src += src_stride3; 661 662 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); 663 src2 = __lsx_vxori_b(src2, 128); 664 hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, 665 filt_hz1, filt_hz2); 666 hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, 667 filt_hz1, filt_hz2); 668 hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, 669 filt_hz1, filt_hz2); 670 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); 671 672 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1); 673 674 for (loop_cnt = (height >> 2); loop_cnt--;) { 675 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 676 src + src_stride3, 0, src3, src4, src5, src6); 677 src += src_stride4; 678 679 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, 680 src3, src4, src5, src6); 681 682 hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, 
filt_hz0, 683 filt_hz1, filt_hz2); 684 vec1 = __lsx_vpackev_b(hz_out3, hz_out2); 685 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); 686 687 hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, 688 filt_hz1, filt_hz2); 689 vec3 = __lsx_vpackev_b(hz_out0, hz_out3); 690 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); 691 692 hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0, 693 filt_hz1, filt_hz2); 694 vec0 = __lsx_vpackev_b(hz_out1, hz_out0); 695 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); 696 697 hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0, 698 filt_hz1, filt_hz2); 699 DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); 700 tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); 701 702 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); 703 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 704 705 __lsx_vstelm_d(tmp0, dst, 0, 0); 706 dst += dst_stride; 707 __lsx_vstelm_d(tmp0, dst, 0, 1); 708 dst += dst_stride; 709 __lsx_vstelm_d(tmp1, dst, 0, 0); 710 dst += dst_stride; 711 __lsx_vstelm_d(tmp1, dst, 0, 1); 712 dst += dst_stride; 713 } 714} 715 716void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, 717 uint8_t *src, ptrdiff_t src_stride, 718 int height, int mx, int my) 719{ 720 int32_t multiple8_cnt; 721 722 for (multiple8_cnt = 2; multiple8_cnt--;) { 723 ff_put_vp8_epel8_h6v4_lsx(dst, dst_stride, src, src_stride, height, 724 mx, my); 725 src += 8; 726 dst += 8; 727 } 728} 729 730void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 731 uint8_t *src, ptrdiff_t src_stride, 732 int height, int mx, int my) 733{ 734 uint32_t loop_cnt; 735 const int8_t *filter_horiz = subpel_filters_lsx[mx - 1]; 736 const int8_t *filter_vert = subpel_filters_lsx[my - 1]; 737 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; 738 __m128i filt_hz0, filt_hz1, mask0, mask1; 739 __m128i filt_vt0, 
filt_vt1, filt_vt2; 740 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 741 __m128i out0, out1, out2, out3, out4, out5, out6, out7; 742 743 ptrdiff_t src_stride2 = src_stride << 1; 744 ptrdiff_t src_stride3 = src_stride2 + src_stride; 745 ptrdiff_t src_stride4 = src_stride2 << 1; 746 747 mask0 = __lsx_vld(mc_filt_mask_arr, 0); 748 src -= (1 + src_stride2); 749 750 /* rearranging filter */ 751 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1); 752 mask1 = __lsx_vaddi_bu(mask0, 2); 753 754 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 755 src + src_stride3, 0, src0, src1, src2, src3); 756 src += src_stride4; 757 src4 = __lsx_vld(src, 0); 758 src += src_stride; 759 760 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 761 src0, src1, src2, src3); 762 src4 = __lsx_vxori_b(src4, 128); 763 764 tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); 765 tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1); 766 tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1); 767 tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1); 768 tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); 769 770 DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1, 771 tmp4, tmp3, out0, out1, out3, out4); 772 773 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1); 774 filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); 775 776 for (loop_cnt = (height >> 2); loop_cnt--;) { 777 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 778 src + src_stride3, 0, src5, src6, src7, src8); 779 src += src_stride4; 780 781 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, 782 src5, src6, src7, src8); 783 784 tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1); 785 out2 = __lsx_vpackev_b(tmp5, tmp4); 786 tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, 
filt_vt2); 787 788 tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1); 789 out5 = __lsx_vpackev_b(tmp6, tmp5); 790 tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); 791 792 tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1); 793 out6 = __lsx_vpackev_b(tmp7, tmp6); 794 tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); 795 796 tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1); 797 out7 = __lsx_vpackev_b(tmp8, tmp7); 798 tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); 799 800 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); 801 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); 802 803 __lsx_vstelm_d(tmp0, dst, 0, 0); 804 dst += dst_stride; 805 __lsx_vstelm_d(tmp0, dst, 0, 1); 806 dst += dst_stride; 807 __lsx_vstelm_d(tmp1, dst, 0, 0); 808 dst += dst_stride; 809 __lsx_vstelm_d(tmp1, dst, 0, 1); 810 dst += dst_stride; 811 812 tmp4 = tmp8; 813 out0 = out2; 814 out1 = out6; 815 out3 = out5; 816 out4 = out7; 817 } 818} 819 820void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, 821 uint8_t *src, ptrdiff_t src_stride, 822 int height, int mx, int my) 823{ 824 int32_t multiple8_cnt; 825 826 for (multiple8_cnt = 2; multiple8_cnt--;) { 827 ff_put_vp8_epel8_h4v6_lsx(dst, dst_stride, src, src_stride, height, 828 mx, my); 829 src += 8; 830 dst += 8; 831 } 832} 833 834void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride, 835 uint8_t *src, ptrdiff_t src_stride, 836 int height, int mx, int my) 837{ 838 int32_t cnt; 839 __m128i src0, src1, src2, src3; 840 841 ptrdiff_t src_stride2 = src_stride << 1; 842 ptrdiff_t src_stride3 = src_stride2 + src_stride; 843 ptrdiff_t src_stride4 = src_stride2 << 1; 844 845 if (0 == height % 8) { 846 for (cnt = height >> 3; cnt--;) { 847 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 848 src + src_stride3, 0, src0, src1, src2, src3); 849 src += src_stride4; 
850 851 __lsx_vstelm_d(src0, dst, 0, 0); 852 dst += dst_stride; 853 __lsx_vstelm_d(src1, dst, 0, 0); 854 dst += dst_stride; 855 __lsx_vstelm_d(src2, dst, 0, 0); 856 dst += dst_stride; 857 __lsx_vstelm_d(src3, dst, 0, 0); 858 dst += dst_stride; 859 860 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 861 src + src_stride3, 0, src0, src1, src2, src3); 862 src += src_stride4; 863 864 __lsx_vstelm_d(src0, dst, 0, 0); 865 dst += dst_stride; 866 __lsx_vstelm_d(src1, dst, 0, 0); 867 dst += dst_stride; 868 __lsx_vstelm_d(src2, dst, 0, 0); 869 dst += dst_stride; 870 __lsx_vstelm_d(src3, dst, 0, 0); 871 dst += dst_stride; 872 } 873 } else if( 0 == height % 4) { 874 for (cnt = (height >> 2); cnt--;) { 875 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 876 src + src_stride3, 0, src0, src1, src2, src3); 877 src += src_stride4; 878 879 __lsx_vstelm_d(src0, dst, 0, 0); 880 dst += dst_stride; 881 __lsx_vstelm_d(src1, dst, 0, 0); 882 dst += dst_stride; 883 __lsx_vstelm_d(src2, dst, 0, 0); 884 dst += dst_stride; 885 __lsx_vstelm_d(src3, dst, 0, 0); 886 dst += dst_stride; 887 } 888 } 889} 890 891void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride, 892 uint8_t *src, ptrdiff_t src_stride, 893 int height, int mx, int my) 894{ 895 int32_t width = 16; 896 int32_t cnt, loop_cnt; 897 uint8_t *src_tmp, *dst_tmp; 898 __m128i src0, src1, src2, src3, src4, src5, src6, src7; 899 900 ptrdiff_t src_stride2 = src_stride << 1; 901 ptrdiff_t src_stride3 = src_stride2 + src_stride; 902 ptrdiff_t src_stride4 = src_stride2 << 1; 903 904 ptrdiff_t dst_stride2 = dst_stride << 1; 905 ptrdiff_t dst_stride3 = dst_stride2 + dst_stride; 906 ptrdiff_t dst_stride4 = dst_stride2 << 1; 907 908 if (0 == height % 8) { 909 for (cnt = (width >> 4); cnt--;) { 910 src_tmp = src; 911 dst_tmp = dst; 912 for (loop_cnt = (height >> 3); loop_cnt--;) { 913 DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0, 914 src_tmp + src_stride2, 0, src_tmp + src_stride3, 
0, 915 src4, src5, src6, src7); 916 src_tmp += src_stride4; 917 918 __lsx_vst(src4, dst_tmp, 0); 919 __lsx_vst(src5, dst_tmp + dst_stride, 0); 920 __lsx_vst(src6, dst_tmp + dst_stride2, 0); 921 __lsx_vst(src7, dst_tmp + dst_stride3, 0); 922 dst_tmp += dst_stride4; 923 924 DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0, 925 src_tmp + src_stride2, 0, src_tmp + src_stride3, 0, 926 src4, src5, src6, src7); 927 src_tmp += src_stride4; 928 929 __lsx_vst(src4, dst_tmp, 0); 930 __lsx_vst(src5, dst_tmp + dst_stride, 0); 931 __lsx_vst(src6, dst_tmp + dst_stride2, 0); 932 __lsx_vst(src7, dst_tmp + dst_stride3, 0); 933 dst_tmp += dst_stride4; 934 } 935 src += 16; 936 dst += 16; 937 } 938 } else if (0 == height % 4) { 939 for (cnt = (height >> 2); cnt--;) { 940 DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0, 941 src + src_stride3, 0, src0, src1, src2, src3); 942 src += 4 * src_stride4; 943 944 __lsx_vst(src0, dst, 0); 945 __lsx_vst(src1, dst + dst_stride, 0); 946 __lsx_vst(src2, dst + dst_stride2, 0); 947 __lsx_vst(src3, dst + dst_stride3, 0); 948 dst += dst_stride4; 949 } 950 } 951} 952