1/*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20#include "libavcodec/vp8dsp.h"
21#include "libavutil/loongarch/loongson_intrinsics.h"
22#include "vp8dsp_loongarch.h"
23
/* Shuffle-control bytes for __lsx_vshuf_b: consecutive index pairs
 * (n, n+1) gather adjacent source bytes into the pairwise layout the
 * __lsx_vdp2*_h_b dot-product instructions consume.  Indices >= 16
 * select bytes from the second source operand of the shuffle. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
32
/* VP8 subpel filter coefficients, indexed by (mx - 1) or (my - 1).
 * Rows 1, 3, 5 are 6-tap filters; rows 0, 2, 4, 6 use only the first
 * four taps (trailing entries are zero).  Each row is padded to eight
 * bytes so pairs of taps can be loaded with __lsx_vldrepl_h. */
static const int8_t subpel_filters_lsx[7][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
};
42
/* Accumulate three pairwise signed-byte dot products into 16-bit lanes:
 * returns in0·coeff0 + in1·coeff1 + in2·coeff2.  Used as the 6-tap
 * filter core (three coefficient pairs). */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    __m128i out0_m;                                                 \
                                                                    \
    out0_m = __lsx_vdp2_h_b(in0, coeff0);                           \
    out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);                \
    out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);                \
                                                                    \
    out0_m;                                                         \
} )
53
/* Three byte shuffles: outN = shuffle of the (in pair) selected by maskN.
 * Produces the three tap-pair vectors fed to DPADD_SH3_SH. */
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                      \
{                                                                      \
    DUP2_ARG3(__lsx_vshuf_b, in1, in0, mask0, in3, in2, mask1,         \
              out0, out1);                                             \
    out2 = __lsx_vshuf_b(in5, in4, mask2);                             \
}
61
/* Apply the 6-tap horizontal filter to one row: shuffle the source into
 * three tap-pair vectors, dot-product with the three coefficient pairs,
 * then round-shift by 7 and saturate to the signed 8-bit range (result
 * stays in 16-bit lanes for the later vertical pass / pack). */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
                        filt_h0, filt_h1, filt_h2)                       \
( {                                                                      \
    __m128i vec0_m, vec1_m, vec2_m;                                      \
    __m128i hz_out_m;                                                    \
                                                                         \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
               vec0_m, vec1_m, vec2_m);                                  \
    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
                            filt_h0, filt_h1, filt_h2);                  \
                                                                         \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                              \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                                \
                                                                         \
    hz_out_m;                                                            \
} )
78
/* 6-tap horizontal filtering of four 8-wide rows at once.  Results are
 * left in full 16-bit precision; rounding/packing is done by the caller
 * (with __lsx_vssrarni_b_h). */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                            \
                                   mask0, mask1, mask2,                               \
                                   filt0, filt1, filt2,                               \
                                   out0, out1, out2, out3)                            \
{                                                                                     \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;           \
                                                                                      \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,        \
              mask0, src3, src3, mask0, vec0_m, vec1_m, vec2_m, vec3_m);              \
    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,            \
              vec3_m, filt0, out0, out1, out2, out3);                                 \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,        \
              mask1, src3, src3, mask1, vec0_m, vec1_m, vec2_m, vec3_m);              \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,        \
              mask2, src3, src3, mask2, vec4_m, vec5_m, vec6_m, vec7_m);              \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,            \
              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, out3);      \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,            \
              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, out3);      \
}
99
/* 4-tap filter core: two pairwise signed-byte dot products accumulated
 * into 16-bit lanes (vec0·filt0 + vec1·filt1). */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
( {                                                             \
    __m128i tmp0;                                               \
                                                                \
    tmp0 = __lsx_vdp2_h_b(vec0, filt0);                         \
    tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1);                \
                                                                \
    tmp0;                                                       \
} )
109
/* Apply the 4-tap horizontal filter to one row: shuffle into two
 * tap-pair vectors, dot-product with the two coefficient pairs, then
 * round-shift by 7 and saturate to the signed 8-bit range. */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
( {                                                                    \
    __m128i vec0_m, vec1_m;                                            \
    __m128i hz_out_m;                                                  \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1,     \
              vec0_m, vec1_m);                                         \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
                                                                       \
    hz_out_m = __lsx_vsrari_h(hz_out_m, 7);                            \
    hz_out_m = __lsx_vsat_h(hz_out_m, 7);                              \
                                                                       \
    hz_out_m;                                                          \
} )
123
/* 8-pixel-wide horizontal 6-tap subpel interpolation.
 * mx (1..7) selects the filter row; rows are processed four at a time,
 * so height is expected to be a multiple of 4. */
void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[mx - 1];
    __m128i src0, src1, src2, src3, filt0, filt1, filt2;
    __m128i mask0, mask1, mask2;
    __m128i out0, out1, out2, out3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 2;   /* step back two columns so the 6-tap window is centered */

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    /* mask1/mask2 pick the same byte pairs offset by 2 and 4 columns */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    /* first group of four rows, peeled out of the loop below */
    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    /* bias unsigned pixels into signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src += src_stride4;
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);

    /* round-shift by 7, saturate-pack to bytes, then undo the sign bias */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
    dst += dst_stride;

    /* remaining (height/4 - 1) groups of four rows */
    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        src += src_stride4;
        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out0, out1, out2, out3);

        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
}
188
/* 16-pixel-wide horizontal 6-tap subpel interpolation.  Each row is
 * loaded as two overlapping 16-byte vectors (offsets 0 and 8) and
 * filtered as two 8-wide halves; height is processed four rows at a
 * time, so it is expected to be a multiple of 4. */
void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[mx - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1;
    __m128i filt2, mask0, mask1, mask2;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 2;   /* step back two columns so the 6-tap window is centered */
    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even-numbered registers: left halves; odd: right halves (offset 8) */
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src0 ,src2, src4, src6);
        DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2,
                  8, src + src_stride3, 8, src1, src3, src5, src7);

        /* bias unsigned pixels into signed range for the signed dot products */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128,
                  src4, src5, src6, src7);
        src += src_stride4;

        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out0, out1, out2, out3);
        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   filt0, filt1, filt2, out4, out5, out6, out7);
        /* round-shift by 7, saturate-pack, undo sign bias, store two rows */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;

        DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5);
        DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5);
        __lsx_vst(out4, dst, 0);
        dst += dst_stride;
        __lsx_vst(out5, dst, 0);
        dst += dst_stride;
    }
}
242
/* 8-pixel-wide vertical 6-tap subpel interpolation.
 * my (1..7) selects the filter row; four output rows per iteration, so
 * height is expected to be a multiple of 4.  srcNM_l registers hold
 * rows N and M interleaved bytewise (low halves) so the vertical taps
 * become pairwise dot products. */
void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    /* src7..src10 hold the next four input rows of each iteration */
    __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
    __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l;
    __m128i src109_l, filt0, filt1, filt2;
    __m128i out0_l, out1_l, out2_l, out3_l;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride2;  /* back up two rows to center the 6-tap window */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    /* prime the window with the first five rows */
    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4,
              src3, src10_l, src32_l, src21_l, src43_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                  128, src7, src8, src9, src10);
        src += src_stride4;

        DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10,
                  src9, src76_l, src87_l, src98_l, src109_l);

        /* each output row combines three interleaved row pairs (6 taps) */
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2);

        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7,
                  out0_l, out1_l);
        DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l);

        __lsx_vstelm_d(out0_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0_l, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1_l, dst, 0, 1);
        dst += dst_stride;

        /* rotate the window: keep the last four row pairs and last row */
        src10_l = src76_l;
        src32_l = src98_l;
        src21_l = src87_l;
        src43_l = src109_l;
        src4 = src10;
    }
}
309
/* 16-pixel-wide vertical 6-tap subpel interpolation.  Like the 8-wide
 * version but keeps both low (_l) and high (_h) interleaved halves of
 * each row pair; four output rows per iteration, so height is expected
 * to be a multiple of 4. */
void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l, src87_l;
    __m128i src10_h, src32_h, src54_h, src76_h, src21_h, src43_h, src65_h, src87_h;
    __m128i filt0, filt1, filt2;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    filt2 = __lsx_vldrepl_h(filter, 4);

    /* prime the window with five rows starting two rows above src */
    DUP4_ARG2(__lsx_vld, src - src_stride2, 0, src - src_stride, 0, src, 0,
              src + src_stride, 0, src0, src1, src2, src3);
    src4 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_l, src32_l, src43_l, src21_l);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
              src10_h, src32_h, src43_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_l, src65_l, src76_l, src87_l);
        DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
                  src54_h, src65_h, src76_h, src87_h);

        /* output rows 0 and 1 (low and high halves) */
        tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2);

        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* output rows 2 and 3 */
        tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2);
        tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2);

        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* rotate the window for the next four rows */
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src10_h = src54_h;
        src32_h = src76_h;
        src21_h = src65_h;
        src43_h = src87_h;
        src4 = src8;
    }
}
390
/* 8-pixel-wide 6-tap horizontal + 6-tap vertical subpel interpolation.
 * Stage 1 filters each row horizontally into 16-bit hz_out registers;
 * stage 2 packs consecutive hz_out rows bytewise and runs the vertical
 * 6-tap filter on the packed pairs.  Four output rows per iteration, so
 * height is expected to be a multiple of 4. */
void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, filt_hz2;
    __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2;
    __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= (2 + src_stride2);  /* center both 6-tap windows: 2 cols, 2 rows */

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    /* prime the vertical window: horizontally filter the first five rows */
    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src +=  src_stride;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0 ,src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    /* outN carry bytewise-packed pairs of adjacent hz_out rows */
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
    DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        /* for each new row: horizontal filter, pack with the previous
         * hz_out row, then vertical 6-tap over three packed pairs */
        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out2 = __lsx_vpackev_b(hz_out5, hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2,filt_vt0, filt_vt1, filt_vt2);

        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out5 = __lsx_vpackev_b(hz_out6, hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);

        out7 = __lsx_vpackev_b(hz_out7, hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out6 = __lsx_vpackev_b(hz_out8, hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vstelm_d(tmp0, dst, 0, 0);

        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* rotate the vertical window: last row and last four packed pairs */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out7;
        out3 = out5;
        out4 = out6;
    }
}
492
/* 16-pixel-wide 6-tap horizontal + 6-tap vertical subpel interpolation:
 * a 16-wide block is handled as two independent 8-wide halves, each
 * delegated to ff_put_vp8_epel8_h6v6_lsx at column offsets 0 and 8. */
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t half;

    for (half = 0; half < 2; half++) {
        ff_put_vp8_epel8_h6v6_lsx(dst + 8 * half, dst_stride,
                                  src + 8 * half, src_stride,
                                  height, mx, my);
    }
}
505
/* 8-pixel-wide vertical 4-tap subpel interpolation.
 * my selects the filter row; only the first two coefficient pairs are
 * loaded (4 taps).  Four output rows per iteration, so height is
 * expected to be a multiple of 4. */
void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    /* src7..src10 hold the next four input rows of each iteration;
     * srcNM_l holds rows N and M interleaved bytewise (low halves) */
    __m128i src0, src1, src2, src7, src8, src9, src10;
    __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1;
    __m128i out0, out1, out2, out3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride;   /* back up one row to center the 4-tap window */

    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    /* prime the window with the first three rows */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src7, src8, src9, src10);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
                  src72_l, src87_l, src98_l, src109_l);

        /* each output row combines two interleaved row pairs (4 taps) */
        out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1);
        out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1);
        out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1);
        out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);

        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* rotate the window for the next four rows */
        src10_l = src98_l;
        src21_l = src109_l;
        src2 = src10;
    }
}
562
/* 16-pixel-wide vertical 4-tap subpel interpolation.  Keeps both low
 * (_l) and high (_h) interleaved halves of each row pair; four output
 * rows per iteration, so height is expected to be a multiple of 4. */
void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                              uint8_t *src, ptrdiff_t src_stride,
                              int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h;
    __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1;
    __m128i tmp0, tmp1, tmp2, tmp3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    src -= src_stride;   /* back up one row to center the 4-tap window */
    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
    /* prime the window with the first three rows */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
                  0, src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);
        DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_l, src43_l, src54_l, src65_l);
        DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6,
                  src5, src32_h, src43_h, src54_h, src65_h);

        /* output rows 0 and 1 (low and high halves) */
        tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1);
        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* output rows 2 and 3 */
        tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* rotate the window for the next four rows */
        src10_l = src54_l;
        src21_l = src65_l;
        src10_h = src54_h;
        src21_h = src65_h;
        src2 = src6;
    }
}
632
/* 8-pixel-wide 6-tap horizontal + 4-tap vertical subpel interpolation.
 * Stage 1 filters each row horizontally (6 taps) into 16-bit hz_out
 * registers; stage 2 packs consecutive hz_out rows bytewise and runs a
 * 4-tap vertical filter on the packed pairs.  Four output rows per
 * iteration, so height is expected to be a multiple of 4. */
void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6;
    __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= (2 + src_stride);   /* center windows: 2 cols left, 1 row up */

    /* rearranging filter */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

    /* prime the vertical window: horizontally filter the first three rows */
    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
    src2 = __lsx_vld(src + src_stride2, 0);
    src += src_stride3;

    /* bias unsigned pixels into signed range for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
    src2 = __lsx_vxori_b(src2, 128);
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    /* vec0/vec2 carry bytewise-packed pairs of adjacent hz_out rows */
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);

    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src3, src4, src5, src6);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
                  src3, src4, src5, src6);

        /* for each new row: horizontal filter, pack with the previous
         * hz_out row, then vertical 4-tap over two packed pairs */
        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* round-shift by 7, saturate-pack to bytes, undo the sign bias */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;
    }
}
715
716void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
717                                uint8_t *src, ptrdiff_t src_stride,
718                                int height, int mx, int my)
719{
720    int32_t multiple8_cnt;
721
722    for (multiple8_cnt = 2; multiple8_cnt--;) {
723        ff_put_vp8_epel8_h6v4_lsx(dst, dst_stride, src, src_stride, height,
724                                  mx, my);
725        src += 8;
726        dst += 8;
727    }
728}
729
/* 8-pixel-wide VP8 sub-pel motion compensation: 4-tap horizontal filter
 * (selected by mx) followed by a 6-tap vertical filter (selected by my).
 * Produces `height` rows of 8 output pixels; height is assumed to be a
 * multiple of 4 (the loop emits 4 rows per iteration). */
void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
    const int8_t *filter_vert = subpel_filters_lsx[my - 1];
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt_hz0, filt_hz1, mask0, mask1;
    __m128i filt_vt0, filt_vt1, filt_vt2;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    __m128i out0, out1, out2, out3, out4, out5, out6, out7;

    ptrdiff_t src_stride2 = src_stride << 1;
    ptrdiff_t src_stride3 = src_stride2 + src_stride;
    ptrdiff_t src_stride4 = src_stride2 << 1;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* Back up 1 column for the 4-tap horizontal filter and 2 rows for the
     * 6-tap vertical filter. */
    src -= (1 + src_stride2);

    /* rearranging filter: replicate each 16-bit pair of horizontal taps
     * across the whole vector */
    DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
    mask1 = __lsx_vaddi_bu(mask0, 2);

    /* Prime the vertical filter: load the first 5 rows needed before any
     * output row can be produced. */
    DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
              src + src_stride3, 0, src0, src1, src2, src3);
    src += src_stride4;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    /* Toggle the sign bit so unsigned pixels can go through signed
     * multiply-adds; the bias is undone on the output below. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    src4 = __lsx_vxori_b(src4, 128);

    /* Horizontally filter the 5 primed rows. */
    tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);

    /* Interleave consecutive rows pairwise for the vertical dot products. */
    DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1,
              tmp4, tmp3, out0, out1, out3, out4);

    /* Vertical taps come in three 16-bit pairs (6-tap filter). */
    DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
    filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* Fetch 4 new source rows per iteration. */
        DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
                  src + src_stride3, 0, src5, src6, src7, src8);
        src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
                  src5, src6, src7, src8);

        /* For each new row: horizontal filter, pair it with the previous
         * row, then run the 3-pair vertical multiply-accumulate. */
        tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        out2 = __lsx_vpackev_b(tmp5, tmp4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        out5 = __lsx_vpackev_b(tmp6, tmp5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
        out6 = __lsx_vpackev_b(tmp7, tmp6);
        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

        tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
        out7 = __lsx_vpackev_b(tmp8, tmp7);
        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

        /* Round (shift by 7), saturate-narrow to bytes, then undo the
         * sign bias applied on input. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);

        /* Store 4 output rows, 8 bytes (one 64-bit element) each. */
        __lsx_vstelm_d(tmp0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(tmp1, dst, 0, 1);
        dst += dst_stride;

        /* Rotate the vertical-filter history window so the next iteration
         * reuses the last 5 horizontally-filtered rows. */
        tmp4 = tmp8;
        out0 = out2;
        out1 = out6;
        out3 = out5;
        out4 = out7;
    }
}
819
820void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
821                                uint8_t *src, ptrdiff_t src_stride,
822                                int height, int mx, int my)
823{
824    int32_t multiple8_cnt;
825
826    for (multiple8_cnt = 2; multiple8_cnt--;) {
827        ff_put_vp8_epel8_h4v6_lsx(dst, dst_stride, src, src_stride, height,
828                                  mx, my);
829        src += 8;
830        dst += 8;
831    }
832}
833
834void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
835                            uint8_t *src, ptrdiff_t src_stride,
836                            int height, int mx, int my)
837{
838    int32_t cnt;
839    __m128i src0, src1, src2, src3;
840
841    ptrdiff_t src_stride2 = src_stride << 1;
842    ptrdiff_t src_stride3 = src_stride2 + src_stride;
843    ptrdiff_t src_stride4 = src_stride2 << 1;
844
845    if (0 == height % 8) {
846        for (cnt = height >> 3; cnt--;) {
847            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
848                      src + src_stride3, 0, src0, src1, src2, src3);
849            src += src_stride4;
850
851            __lsx_vstelm_d(src0, dst, 0, 0);
852            dst += dst_stride;
853            __lsx_vstelm_d(src1, dst, 0, 0);
854            dst += dst_stride;
855            __lsx_vstelm_d(src2, dst, 0, 0);
856            dst += dst_stride;
857            __lsx_vstelm_d(src3, dst, 0, 0);
858            dst += dst_stride;
859
860            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
861                      src + src_stride3, 0, src0, src1, src2, src3);
862            src += src_stride4;
863
864            __lsx_vstelm_d(src0, dst, 0, 0);
865            dst += dst_stride;
866            __lsx_vstelm_d(src1, dst, 0, 0);
867            dst += dst_stride;
868            __lsx_vstelm_d(src2, dst, 0, 0);
869            dst += dst_stride;
870            __lsx_vstelm_d(src3, dst, 0, 0);
871            dst += dst_stride;
872        }
873    } else if( 0 == height % 4) {
874        for (cnt = (height >> 2); cnt--;) {
875            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
876                      src + src_stride3, 0, src0, src1, src2, src3);
877            src += src_stride4;
878
879            __lsx_vstelm_d(src0, dst, 0, 0);
880            dst += dst_stride;
881            __lsx_vstelm_d(src1, dst, 0, 0);
882            dst += dst_stride;
883            __lsx_vstelm_d(src2, dst, 0, 0);
884            dst += dst_stride;
885            __lsx_vstelm_d(src3, dst, 0, 0);
886            dst += dst_stride;
887        }
888    }
889}
890
891void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
892                             uint8_t *src, ptrdiff_t src_stride,
893                             int height, int mx, int my)
894{
895    int32_t width = 16;
896    int32_t cnt, loop_cnt;
897    uint8_t *src_tmp, *dst_tmp;
898    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
899
900    ptrdiff_t src_stride2 = src_stride << 1;
901    ptrdiff_t src_stride3 = src_stride2 + src_stride;
902    ptrdiff_t src_stride4 = src_stride2 << 1;
903
904    ptrdiff_t dst_stride2 = dst_stride << 1;
905    ptrdiff_t dst_stride3 = dst_stride2 + dst_stride;
906    ptrdiff_t dst_stride4 = dst_stride2 << 1;
907
908    if (0 == height % 8) {
909        for (cnt = (width >> 4); cnt--;) {
910            src_tmp = src;
911            dst_tmp = dst;
912            for (loop_cnt = (height >> 3); loop_cnt--;) {
913                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
914                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
915                          src4, src5, src6, src7);
916                src_tmp += src_stride4;
917
918                __lsx_vst(src4, dst_tmp,               0);
919                __lsx_vst(src5, dst_tmp + dst_stride,  0);
920                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
921                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
922                dst_tmp += dst_stride4;
923
924                DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
925                          src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
926                          src4, src5, src6, src7);
927                src_tmp += src_stride4;
928
929                __lsx_vst(src4, dst_tmp,               0);
930                __lsx_vst(src5, dst_tmp + dst_stride,  0);
931                __lsx_vst(src6, dst_tmp + dst_stride2, 0);
932                __lsx_vst(src7, dst_tmp + dst_stride3, 0);
933                dst_tmp += dst_stride4;
934            }
935            src += 16;
936            dst += 16;
937        }
938    } else if (0 == height % 4) {
939        for (cnt = (height >> 2); cnt--;) {
940            DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
941                      src + src_stride3, 0, src0, src1, src2, src3);
942            src += 4 * src_stride4;
943
944            __lsx_vst(src0, dst,               0);
945            __lsx_vst(src1, dst + dst_stride,  0);
946            __lsx_vst(src2, dst + dst_stride2, 0);
947            __lsx_vst(src3, dst + dst_stride3, 0);
948            dst += dst_stride4;
949       }
950    }
951}
952